diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index b11eac058a1814fa9d1dcfb3358cba2879230b2c..6b76e3cc1d1a5905a027b08f7d08df4a47cc33b3 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -181,6 +181,13 @@ IF(WITH_XPU) DSTS ${dst_dir} ${dst_dir}) ENDIF() +IF(WITH_IPU) + set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/install/ipu") + copy(inference_lib_dist + SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/platform/device/ipu/libpaddle_ipu.so + DSTS ${dst_dir}) +ENDIF() + # CMakeCache Info copy(inference_lib_dist SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt @@ -189,6 +196,7 @@ copy(inference_lib_dist copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR}) set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") + if(WIN32) if(WITH_STATIC_LIB) set(paddle_inference_lib $/libpaddle_inference.lib @@ -304,7 +312,7 @@ copy(fluid_lib_dist ) set(module "platform") -set(platform_lib_deps profiler_proto error_codes_proto) +set(platform_lib_deps profiler_proto errors) if(WITH_GPU) set(platform_lib_deps ${platform_lib_deps} external_error_proto) endif(WITH_GPU) @@ -317,7 +325,7 @@ copy(fluid_lib_dist set(module "string") copy(fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h + SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/*.h ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/tinyformat/*.h DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat ) diff --git a/cmake/pten.cmake b/cmake/pten.cmake index 70d61027da872aa19d91c8fbc13d6acee007d048..8e1d233986209b8e4f51065db998ebd46e1290cd 100644 --- a/cmake/pten.cmake +++ b/cmake/pten.cmake @@ -243,3 +243,29 @@ function(register_kernels) endif() endforeach() endfunction() + +function(append_op_util_declare TARGET) + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET} target_content) + string(REGEX MATCH "(PT_REGISTER_API_NAME|PT_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}") + string(REPLACE "PT_REGISTER_ARG_MAPPING_FN" "PT_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}") + string(REPLACE "PT_REGISTER_API_NAME" "PT_REGISTER_API_NAME" util_declare "${util_declare}") + string(APPEND util_declare ");") + file(APPEND ${op_utils_header} "${util_declare}") +endfunction() + +function(register_op_utils TARGET_NAME) + set(utils_srcs) + set(options "") + set(oneValueArgs "") + set(multiValueArgs EXCLUDES DEPS) + cmake_parse_arguments(register_op_utils "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + file(GLOB SIGNATURES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_sig.cc") + foreach(target ${SIGNATURES}) + append_op_util_declare(${target}) + list(APPEND utils_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${target}) + endforeach() + + cc_library(${TARGET_NAME} SRCS ${utils_srcs} DEPS ${register_op_utils_DEPS}) +endfunction() diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 4b88689b9b6dfa7383d79a834afa3f23debb0890..3e7669849882687510dd8193e7c2762d08f332a0 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(utils) add_subdirectory(scripts) add_subdirectory(testing) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory") diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 08c2d2e05558bd616674522a1c8d1c4c2698d196..75966399148d455debac27e2fe890ae7fa0ffb0a 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -4,7 +4,6 @@ add_subdirectory(distributed) add_subdirectory(framework) 
add_subdirectory(imperative) add_subdirectory(operators) -add_subdirectory(string) add_subdirectory(pybind) add_subdirectory(eager) # NOTE: please add subdirectory inference at last. diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 24923d72681869348ec7db816349bdef010c973d..5ae2e26e87c7b33a75325f5b585ca115bd3b6308 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -13,17 +13,7 @@ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) endif() add_subdirectory(common) -add_subdirectory(service) -add_subdirectory(table) +add_subdirectory(ps) add_subdirectory(test) add_subdirectory(index_dataset) add_subdirectory(fleet_executor) - -get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) - -set_source_files_properties(fleet.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(fleet - SRCS fleet.cc - DEPS framework_proto ps_framework_proto ps_service variable_helper scope op_registry fs shell ${RPC_DEPS}) - -target_link_libraries(fleet z) diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index 6454a349505131a461d99fe90db9dd69cb916507..452c666a1523cb81f7857684896997f1ad20d20d 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -13,11 +13,13 @@ // limitations under the License. #include +#include // NOLINT #include "paddle/fluid/distributed/fleet_executor/dist_model.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/program_desc.h" @@ -37,10 +39,110 @@ bool IsPersistable(const framework::VarDesc *var) { } return false; } + +bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, + framework::LoDTensor *input_tensor, + const platform::Place &place) { + VLOG(3) << "Loading data from DistModelTensor for " << input_data.name; + framework::DDim dims = framework::make_ddim(input_data.shape); + void *input_tensor_ptr; + if (input_data.dtype == DistModelDataType::INT64) { + input_tensor_ptr = input_tensor->mutable_data(dims, place); + } else if (input_data.dtype == DistModelDataType::FLOAT32) { + input_tensor_ptr = input_tensor->mutable_data(dims, place); + } else if (input_data.dtype == DistModelDataType::INT32) { + input_tensor_ptr = input_tensor->mutable_data(dims, place); + } else { + // Q(fleet exe dev): for input/output, should we support fp16 + LOG(ERROR) << "unsupported feed type " << input_data.dtype; + return false; + } + + PADDLE_ENFORCE_NOT_NULL( + input_tensor_ptr, + paddle::platform::errors::Fatal( + "LoDTensor creation failed. 
DistModel loaded data failed.")); + PADDLE_ENFORCE_NOT_NULL(input_data.data.data(), + paddle::platform::errors::InvalidArgument( + "DistModelTensor contains no data.")); + + if (platform::is_cpu_place(place)) { + VLOG(3) << "Loading data for CPU."; + std::memcpy(static_cast(input_tensor_ptr), input_data.data.data(), + input_data.data.length()); + } else if (platform::is_gpu_place(place)) { + VLOG(3) << "Loading data for GPU."; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = + dynamic_cast(pool.Get(place)); + auto gpu_place = place; + memory::Copy(gpu_place, static_cast(input_tensor_ptr), + platform::CPUPlace(), input_data.data.data(), + input_data.data.length(), dev_ctx->stream()); +#else + PADDLE_THROW(paddle::platform::errors::Fatal( + "Paddle wasn't compiled with CUDA, but place is GPU.")); +#endif + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "DistModel only supports CPU and GPU.")); + } + + framework::LoD dst_lod; + for (auto &src_lod : input_data.lod) { + dst_lod.emplace_back(src_lod); + } + input_tensor->set_lod(dst_lod); + return true; +} + +std::string DistModelDTypeToString(DistModelDataType dtype) { + switch (dtype) { + case DistModelDataType::FLOAT32: + return "float32"; + case DistModelDataType::FLOAT16: + return "float16"; + case DistModelDataType::INT64: + return "int64"; + case DistModelDataType::INT32: + return "int32"; + case DistModelDataType::INT8: + return "int8"; + } + return "NOT SUPPORT DTYPE"; +} + +bool IsPPFirstStage(const DistModelConfig &config) { + return config.local_rank - config.mp_degree < 0; +} + +bool IsPPLastStage(const DistModelConfig &config) { + return config.local_rank + config.mp_degree >= config.nranks; +} + +class DistModelTimer { + public: + void tic() { tic_time = std::chrono::high_resolution_clock::now(); } + double toc() { + std::chrono::high_resolution_clock::time_point toc_time = + std::chrono::high_resolution_clock::now(); + std::chrono::duration time_elapse = + std::chrono::duration_cast>(toc_time - + tic_time); + double time_elapse_in_ms = + static_cast(time_elapse.count()) * 1000.0; + return time_elapse_in_ms; + } + + private: + std::chrono::high_resolution_clock::time_point tic_time; +}; + } // namespace bool DistModel::Init() { - /* TODO(fleet exe dev): implement this funct */ + carrier_id_ = "inference"; bool init_method = (!config_.model_dir.empty() || config_.program_desc); PADDLE_ENFORCE_EQ(init_method, true, platform::errors::InvalidArgument( @@ -127,10 +229,9 @@ bool DistModel::CommInit() { InsertCommOp("mp_comm_id", mp_group_nranks, mp_group_rank, peer_endpoints, comm_init_block, config_.mp_ring_id); } - if (config_.pp_degree) { - // NOTE: the last pp stage doesn't need init pp comm + if (config_.pp_degree > 1) { VLOG(3) << "Init comm group for pp."; - if (config_.local_rank - config_.mp_degree >= 0) { + if (!IsPPFirstStage(config_)) { PADDLE_ENFORCE_EQ(config_.pp_upstream_ring_id >= 0, true, platform::errors::InvalidArgument( "pp upstream ring id must be provided for " @@ -143,7 +244,7 @@ bool DistModel::CommInit() { comm_init_block, config_.pp_upstream_ring_id); } - if (config_.local_rank + config_.mp_degree < config_.nranks) { + if (!IsPPLastStage(config_)) { PADDLE_ENFORCE_EQ(config_.pp_downstream_ring_id >= 0, true, platform::errors::InvalidArgument( "pp downstream ring id must be provided for " @@ -326,7 +427,7 @@ bool DistModel::PrepareFleetExe() { id_to_rank.insert({i, i}); } 
fleet_exe.reset(new FleetExecutor(executor_desc_)); - fleet_exe->Init("inference", *(program_.get()), scope_.get(), place_, 1, + fleet_exe->Init(carrier_id_, *(program_.get()), scope_.get(), place_, 1, {task_node_.get()}, id_to_rank); return true; } @@ -340,8 +441,27 @@ bool DistModel::PrepareFeedAndFetch() { feeds_.resize(idx + 1); } feeds_[idx] = op; - feed_names_[op->Output("Out")[0]] = idx; - idx_to_feeds_[idx] = op->Output("Out")[0]; + std::string var_name = op->Output("Out")[0]; + feed_names_[var_name] = idx; + idx_to_feeds_[idx] = var_name; + framework::VarDesc *real_var = program_->Block(0).FindVar(var_name); + if (!real_var) { + LOG(ERROR) + << "The output of feed ops [" << var_name + << "] cannot be found in the program. Check the inference program."; + return false; + } + if (real_var->GetDataType() == framework::proto::VarType::FP32) { + feeds_to_dtype_.insert({var_name, DistModelDataType::FLOAT32}); + } else if (real_var->GetDataType() == framework::proto::VarType::INT32) { + feeds_to_dtype_.insert({var_name, DistModelDataType::INT32}); + } else if (real_var->GetDataType() == framework::proto::VarType::INT64) { + feeds_to_dtype_.insert({var_name, DistModelDataType::INT64}); + } else { + LOG(ERROR) << "Don't support feed var dtype for: " + << real_var->GetDataType(); + return false; + } } else if (op->Type() == "fetch") { VLOG(3) << "fetch op with fetch var: " << op->Input("X")[0]; int idx = BOOST_GET_CONST(int, op->GetAttr("col")); @@ -349,15 +469,170 @@ bool DistModel::PrepareFeedAndFetch() { fetches_.resize(idx + 1); } fetches_[idx] = op; - id_to_fetches_[idx] = op->Input("X")[0]; + idx_to_fetches_[idx] = op->Input("X")[0]; } } + + if (config_.pp_degree == 1) { + if (feeds_.size() == 0) { + LOG(ERROR) << "No feed ops in the inf program, please check the program."; + return false; + } + if (fetches_.size() == 0) { + LOG(ERROR) << "No fetch op in the inf program, please check the program."; + return false; + } + } else { + if (IsPPFirstStage(config_)) { + if (feeds_.size() == 0) { + LOG(ERROR) << "Feed ops are needed for the first pp stage."; + return false; + } + } else { + if (feeds_.size() > 0) { + LOG(WARNING) << "Feed op is found in the non-first stage of pp."; + } else { + LOG(INFO) << "No feed ops in non-first pp stage."; + } + } + if (IsPPLastStage(config_)) { + if (fetches_.size() == 0) { + LOG(WARNING) << "No fetch op was found in the last pp stage. Make sure " + "the result has been sent to frist pp stage."; + } + } else { + if (fetches_.size() > 0) { + LOG(WARNING) << "Fetch op is found in the non-last stage of pp."; + } else { + LOG(INFO) << "No fetch op in non-last pp stage."; + } + } + } + return true; +} + +bool DistModel::FeedData(const std::vector &input_data, + framework::Scope *scope) { + VLOG(3) << "DistModel is feeding data."; + if (input_data.size() != feeds_.size()) { + LOG(ERROR) << "Should provide " << feeds_.size() << " feeds, but got " + << input_data.size() << " data."; + return false; + } + feed_tensors_.resize(feeds_.size()); + for (size_t i = 0; i < input_data.size(); ++i) { + // feed each data separately + framework::LoDTensor *input_tensor = &(feed_tensors_[i]); + if (!LoadDataFromDistModelTensor(input_data[i], input_tensor, place_)) { + LOG(ERROR) << "Fail to load data from tensor " << input_data[i].name; + return false; + } + std::string target_name = input_data[i].name; + if (feed_names_.find(target_name) == feed_names_.end()) { + LOG(ERROR) << "The input name [" << target_name + << "] cannot be found in the program." 
+ << " DistModel loads data failed."; + return false; + } + if (input_data[i].dtype != feeds_to_dtype_[target_name]) { + LOG(ERROR) << "Feed var [" << target_name << "] expected dtype is: " + << DistModelDTypeToString(feeds_to_dtype_[target_name]) + << ". But received dtype is: " + << DistModelDTypeToString(input_data[i].dtype) << "."; + return false; + } + int feed_idx = feed_names_[target_name]; + framework::SetFeedVariable(scope, *input_tensor, "feed", feed_idx); + } return true; } -void DistModel::Run(const std::vector &input_data, +bool DistModel::FetchResults(std::vector *output_data, + framework::Scope *scope) { + VLOG(3) << "DistModel is fetch results."; + output_data->resize(fetches_.size()); + for (size_t i = 0; i < fetches_.size(); ++i) { + int idx = BOOST_GET_CONST(int, fetches_[i]->GetAttr("col")); + VLOG(3) << "Fetching data for [" << idx_to_fetches_[idx] << "]"; + PADDLE_ENFORCE_EQ( + static_cast(idx), i, + platform::errors::InvalidArgument( + "Fetch op's col attr(%d) should be equal to the index(%d)", idx, + i)); + framework::FetchType &fetch_var = + framework::GetFetchVariable(*scope, "fetch", idx); + auto &fetch = BOOST_GET(framework::LoDTensor, fetch_var); + auto type = fetch.type(); + auto output = &(output_data->at(i)); + output->name = idx_to_fetches_[idx]; + bool rst = false; + if (type == framework::proto::VarType::FP32) { + rst = FetchResult(fetch, output); + output->dtype = DistModelDataType::FLOAT32; + } else if (type == framework::proto::VarType::INT64) { + rst = FetchResult(fetch, output); + output->dtype = DistModelDataType::INT64; + } else if (type == framework::proto::VarType::INT32) { + rst = FetchResult(fetch, output); + output->dtype = DistModelDataType::INT32; + } else { + LOG(ERROR) << "DistModel meets unknown fetch data type. 
DistModel only " + "supports float32, int64 and int32 fetch type for now."; + } + if (!rst) { + LOG(ERROR) << "DistModel fails to fetch result " << idx_to_fetches_[idx]; + return false; + } + } + return true; +} + +template +bool DistModel::FetchResult(const framework::LoDTensor &fetch, + DistModelTensor *output_data) { + auto shape = framework::vectorize(fetch.dims()); + output_data->shape.assign(shape.begin(), shape.end()); + const T *data = fetch.data(); + int64_t num_elems = fetch.numel(); + output_data->data.Resize(num_elems * sizeof(T)); + // The output of fetch op is always on the cpu, no need switch on place + memcpy(output_data->data.data(), data, num_elems * sizeof(T)); + output_data->lod.clear(); + for (auto &level : fetch.lod()) { + output_data->lod.emplace_back(level.begin(), level.end()); + } + return true; +} + +bool DistModel::Run(const std::vector &input_data, std::vector *output_data) { - /* TODO(fleet exe dev): implement this funct */ + // TODO(fleet exe dev): support pipeline inf mode + VLOG(3) << "DistModel run for once."; + + DistModelTimer timer; + timer.tic(); + + if (!FeedData(input_data, scope_.get())) { + LOG(ERROR) << "DistModel failed at feeding data."; + return false; + } + double feed_elapse = timer.toc(); + VLOG(3) << "Finish loading data, cost " << feed_elapse << "ms."; + + fleet_exe->Run(carrier_id_); + double fleet_exe_elapse = timer.toc(); + VLOG(3) << "Finish FleetExe running, cost " << fleet_exe_elapse - feed_elapse + << "ms."; + + if (!FetchResults(output_data, scope_.get())) { + LOG(ERROR) << "DistModel failed at fetching result."; + return false; + } + double fetch_elapse = timer.toc(); + VLOG(3) << "Finish fetching data, cost " << fetch_elapse - fleet_exe_elapse + << "ms."; + VLOG(3) << "DistModel finish inf, cost " << fetch_elapse << "ms"; + return true; } } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.h b/paddle/fluid/distributed/fleet_executor/dist_model.h index 96e9c018074b5f0079c62d0c89c45be8ec0e172b..e6ad94e266a964bdc3c6cfba39cbf86786a4acea 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model.h @@ -19,6 +19,7 @@ #include "paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" @@ -57,7 +58,7 @@ class DistModel { public: explicit DistModel(const DistModelConfig& config) : config_(config) {} bool Init(); - void Run(const std::vector& input_data, + bool Run(const std::vector& input_data, std::vector* output_data); ~DistModel() = default; @@ -75,12 +76,22 @@ class DistModel { void InsertCommOp(std::string tmp_var_name, int nranks, int rank, const std::vector& peer_endpoints, framework::BlockDesc* block, int ring_id); + bool FeedData(const std::vector& input_data, + framework::Scope* scope); + bool FetchResults(std::vector* output_data, + framework::Scope* scope); + template + bool FetchResult(const framework::LoDTensor& fetch, + DistModelTensor* output_data); + std::string carrier_id_; + std::vector feed_tensors_; std::vector feeds_; std::map feed_names_; std::map idx_to_feeds_; + std::map feeds_to_dtype_; std::vector fetches_; - std::map id_to_fetches_; + std::map idx_to_fetches_; DistModelConfig config_; FleetExecutorDesc executor_desc_; std::shared_ptr fleet_exe; diff --git 
a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h index 4a04633388af21277806115b77d69ce05867519a..6bdd858d6cf9ed78c1a655c28ed58574374ce3fb 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h @@ -62,7 +62,7 @@ class DistModelDataBuf { void Free(); void* data_{nullptr}; size_t length_{0}; - bool memory_owned_{false}; + bool memory_owned_{true}; }; struct DistModelTensor { diff --git a/paddle/fluid/distributed/ps/CMakeLists.txt b/paddle/fluid/distributed/ps/CMakeLists.txt index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e97c9db1a5199175507884e29c2f53e8a5bae07a 100644 --- a/paddle/fluid/distributed/ps/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/CMakeLists.txt @@ -0,0 +1,4 @@ +set_property(GLOBAL PROPERTY RPC_DEPS sendrecv_rpc ${BRPC_DEPS} string_helper) +add_subdirectory(table) +add_subdirectory(service) +add_subdirectory(wrapper) diff --git a/paddle/fluid/distributed/service/CMakeLists.txt b/paddle/fluid/distributed/ps/service/CMakeLists.txt similarity index 73% rename from paddle/fluid/distributed/service/CMakeLists.txt rename to paddle/fluid/distributed/ps/service/CMakeLists.txt index d1f04e26ade7289bcb10988d02de01962a1889ab..ab6c2e26002743fc129c4a7d0e532a63aa1d610b 100644 --- a/paddle/fluid/distributed/service/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/service/CMakeLists.txt @@ -8,12 +8,12 @@ brpc_library(sendrecv_rpc SRCS PROTO sendrecv.proto DEPS ${BRPC_DEPS} ) -set_property(GLOBAL PROPERTY RPC_DEPS sendrecv_rpc ${BRPC_DEPS} string_helper) +#set_property(GLOBAL PROPERTY RPC_DEPS sendrecv_rpc ${BRPC_DEPS} string_helper) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) -set_source_files_properties(communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(communicator/communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(ps_service/service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_ps_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(ps_local_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -36,11 +36,13 @@ ps_local_client.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DE cc_library(client SRCS ps_client.cc DEPS downpour_client boost ${RPC_DEPS}) cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS}) -cc_library(communicator SRCS communicator.cc DEPS scope client boost table math_function selected_rows_functor ${RPC_DEPS}) -cc_library(ps_service SRCS service.cc DEPS communicator client server boost ${RPC_DEPS}) +cc_library(communicator SRCS communicator/communicator.cc DEPS scope client boost table math_function selected_rows_functor ${RPC_DEPS}) +cc_library(ps_service SRCS ps_service/service.cc DEPS communicator client server boost ${RPC_DEPS}) cc_library(heter_server SRCS heter_server.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) cc_library(heter_client SRCS heter_client.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) -set_source_files_properties(graph_py_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(graph_py_service SRCS graph_py_service.cc DEPS 
ps_service) +set_source_files_properties(ps_service/graph_py_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_py_service SRCS ps_service/graph_py_service.cc DEPS ps_service) + +#add_subdirectory(communicator) diff --git a/paddle/fluid/distributed/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc similarity index 99% rename from paddle/fluid/distributed/service/brpc_ps_client.cc rename to paddle/fluid/distributed/ps/service/brpc_ps_client.cc index db1dd2ced84e53aee8a57f70a3d11301fc00b4eb..e855fcbd02553ac1ea2e753239deaa8371661b32 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc @@ -16,7 +16,7 @@ #include #include -#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/framework/archive.h" static const int max_port = 65535; diff --git a/paddle/fluid/distributed/service/brpc_ps_client.h b/paddle/fluid/distributed/ps/service/brpc_ps_client.h similarity index 98% rename from paddle/fluid/distributed/service/brpc_ps_client.h rename to paddle/fluid/distributed/ps/service/brpc_ps_client.h index d5388a5cd07c9e1d982f7e08d7a0c1c361af1d0d..70f406ee248dc3d39777297d8387b4749439cf82 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.h @@ -22,8 +22,8 @@ #include "brpc/channel.h" #include "brpc/controller.h" #include "brpc/server.h" -#include "paddle/fluid/distributed/service/brpc_utils.h" -#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/ps/service/brpc_utils.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" #include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc similarity index 99% rename from paddle/fluid/distributed/service/brpc_ps_server.cc rename to paddle/fluid/distributed/ps/service/brpc_ps_server.cc index dd7072be7de63ba90c55e176671c63ba1d444e09..58ce52552c9d22c56b314dfe0bccbb8a564edb5d 100644 --- a/paddle/fluid/distributed/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" #include // NOLINT #include "butil/object_pool.h" #include "paddle/fluid/distributed/common/cost_timer.h" -#include "paddle/fluid/distributed/table/depends/sparse_utils.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h" +#include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/distributed/service/brpc_ps_server.h b/paddle/fluid/distributed/ps/service/brpc_ps_server.h similarity index 98% rename from paddle/fluid/distributed/service/brpc_ps_server.h rename to paddle/fluid/distributed/ps/service/brpc_ps_server.h index bf228a5d1b0ae58669e5f555d2f99200d6099661..4310c247438ceb9bff541fdd21e00ff70ff7b4fd 100644 --- a/paddle/fluid/distributed/service/brpc_ps_server.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.h @@ -17,8 +17,8 @@ #include "brpc/channel.h" #include "brpc/controller.h" #include "brpc/server.h" -#include "paddle/fluid/distributed/service/brpc_utils.h" -#include "paddle/fluid/distributed/service/server.h" +#include "paddle/fluid/distributed/ps/service/brpc_utils.h" +#include "paddle/fluid/distributed/ps/service/server.h" namespace brpc { class Controller; diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/ps/service/brpc_utils.cc similarity index 98% rename from paddle/fluid/distributed/service/brpc_utils.cc rename to paddle/fluid/distributed/ps/service/brpc_utils.cc index 147758abfd55530d66b66bd8cad110e5202f7dc2..23b2f5545ffc2ae8939dba26e602505aa8197139 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/ps/service/brpc_utils.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/distributed/service/brpc_utils.h" +#include "paddle/fluid/distributed/ps/service/brpc_utils.h" #include #include #include "paddle/fluid/platform/enforce.h" @@ -76,7 +76,7 @@ void SerializeToMultiVarMsgAndIOBuf( if (var->IsType()) { SerializeLodTensor(var, ctx, send_var_msg, &temp_iobuf); - } else if (var->IsType()) { + } else if (var->IsType()) { SerializeSelectedRows(var, ctx, send_var_msg, &temp_iobuf); } iobuf->append(temp_iobuf); @@ -127,7 +127,7 @@ void SerializeLodTensor(framework::Variable* var, void SerializeSelectedRows(framework::Variable* var, const platform::DeviceContext& ctx, VarMsg* var_msg, butil::IOBuf* iobuf) { - framework::SelectedRows* slr = var->GetMutable(); + pten::SelectedRows* slr = var->GetMutable(); auto* tensor = slr->mutable_value(); auto* rows = slr->mutable_rows(); @@ -255,7 +255,7 @@ void DeserializeSelectedRows( butil::IOBufBytesIterator& io_buffer_itr, // NOLINT const platform::DeviceContext& ctx) { const auto place = ctx.GetPlace(); - auto* slr = var->GetMutable(); + auto* slr = var->GetMutable(); framework::Tensor* tensor = slr->mutable_value(); slr->set_height(msg.slr_height()); std::vector tmp_rows(msg.dims()[0]); diff --git a/paddle/fluid/distributed/service/brpc_utils.h b/paddle/fluid/distributed/ps/service/brpc_utils.h similarity index 98% rename from paddle/fluid/distributed/service/brpc_utils.h rename to paddle/fluid/distributed/ps/service/brpc_utils.h index ebae710acc28b58a503bc9c0b455ef7c5ca10cff..b241f7f80121cc6920720e3d24332d3be129bd77 100644 --- a/paddle/fluid/distributed/service/brpc_utils.h +++ b/paddle/fluid/distributed/ps/service/brpc_utils.h @@ -20,7 +20,7 @@ limitations under the License. */ #include #include "brpc/channel.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt b/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3610729d74d939b47fbd6f8e7b58219934021bca --- /dev/null +++ b/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt @@ -0,0 +1,8 @@ + + +get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + +set_source_files_properties(communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + + +cc_library(communicator SRCS communicator.cc DEPS scope client boost table math_function selected_rows_functor ${RPC_DEPS}) diff --git a/paddle/fluid/distributed/service/communicator.cc b/paddle/fluid/distributed/ps/service/communicator/communicator.cc similarity index 99% rename from paddle/fluid/distributed/service/communicator.cc rename to paddle/fluid/distributed/ps/service/communicator/communicator.cc index e2b81ace2914789110e9e9410e314f6db1dccf50..a73f87c1d88965ce2e1b522ac79422f94ef0ea98 100644 --- a/paddle/fluid/distributed/service/communicator.cc +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #include #include "gflags/gflags.h" -#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" @@ -28,7 +28,7 @@ namespace paddle { namespace distributed { using framework::LoDTensor; -using framework::SelectedRows; +using pten::SelectedRows; const uint32_t MAX_FEASIGN_NUM = 1024 * 100 * 100; @@ -293,7 +293,7 @@ void Communicator::RpcSendSparse(const std::string &var_name, int table_id, std::vector push_g_vec; auto *send_var = scope.FindVar(var_name); - auto *tensor = send_var->GetMutable(); + auto *tensor = send_var->GetMutable(); auto dim = tensor->value().dims()[1]; std::transform(tensor->rows().begin(), tensor->rows().end(), std::back_inserter(sparse_push_keys), @@ -1012,10 +1012,10 @@ void GeoCommunicator::Send(const std::vector &var_names, auto *var = scope.FindVar(table_name); - PADDLE_ENFORCE_EQ(var->IsType(), true, + PADDLE_ENFORCE_EQ(var->IsType(), true, platform::errors::InvalidArgument( "Only need to send Sparse Grad in Geo mode.")); - auto &rows = var->Get().rows(); + auto &rows = var->Get().rows(); // insert ids which has not been record for (size_t j = 0; j < rows.size(); j++) { @@ -1290,7 +1290,7 @@ void GeoCommunicator::SendSparse(const std::string &varname, auto cpu_ctx = paddle::platform::CPUDeviceContext(); auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); + auto *t_delta = var_delta->GetMutable(); auto *var_t_value = t_delta->mutable_value(); var_t_value->Resize({static_cast(sparse_ids.size()), dims1}); auto *t_value = var_t_value->mutable_data(cpu_ctx.GetPlace()); diff --git a/paddle/fluid/distributed/service/communicator.h b/paddle/fluid/distributed/ps/service/communicator/communicator.h similarity index 97% rename from paddle/fluid/distributed/service/communicator.h rename to paddle/fluid/distributed/ps/service/communicator/communicator.h index 7056c9aba62dd5618d185d16e7eb8bd168dc5a73..570e668d9d5d2b40d280ef905b12fcb0e4ada09b 100644 --- a/paddle/fluid/distributed/service/communicator.h +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.h @@ -29,7 +29,7 @@ limitations under the License. */ #include #include "gflags/gflags.h" -#include "paddle/fluid/distributed/communicator_common.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable_helper.h" @@ -41,7 +41,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" -#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" namespace paddle { namespace distributed { @@ -193,15 +193,15 @@ inline void MergeVars(const std::string &var_name, result.device(*cpu_ctx.eigen_device()) = result / static_cast(vars.size()); } - } else if (var0->IsType()) { - auto &slr0 = var0->Get(); - auto *out_slr = out_var->GetMutable(); + } else if (var0->IsType()) { + auto &slr0 = var0->Get(); + auto *out_slr = out_var->GetMutable(); out_slr->mutable_rows()->clear(); out_slr->mutable_value()->mutable_data({{}}, cpu_place); - std::vector inputs; + std::vector inputs; inputs.reserve(vars.size()); for (auto &var : vars) { - inputs.push_back(&var->Get()); + inputs.push_back(&var->Get()); } auto dev_ctx = paddle::platform::CPUDeviceContext(); if (merge_add) { diff --git a/paddle/fluid/distributed/communicator_common.h b/paddle/fluid/distributed/ps/service/communicator/communicator_common.h similarity index 100% rename from paddle/fluid/distributed/communicator_common.h rename to paddle/fluid/distributed/ps/service/communicator/communicator_common.h diff --git a/paddle/fluid/distributed/service/env.cc b/paddle/fluid/distributed/ps/service/env.cc similarity index 93% rename from paddle/fluid/distributed/service/env.cc rename to paddle/fluid/distributed/ps/service/env.cc index 25bc2cc366aaacba32c22a5225d344f8618767d9..15bd31ce958685643c26af044cdc948725589105 100644 --- a/paddle/fluid/distributed/service/env.cc +++ b/paddle/fluid/distributed/ps/service/env.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/ps/service/env.h" namespace paddle { namespace distributed {} // namespace distributed diff --git a/paddle/fluid/distributed/service/env.h b/paddle/fluid/distributed/ps/service/env.h similarity index 100% rename from paddle/fluid/distributed/service/env.h rename to paddle/fluid/distributed/ps/service/env.h diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc similarity index 99% rename from paddle/fluid/distributed/service/graph_brpc_client.cc rename to paddle/fluid/distributed/ps/service/graph_brpc_client.cc index a9682d6a6efcc9db33e33c3e4fef1ec60f5bedf3..301708f6b7bb3d465d8dcbd2b94bbc4c217fcc77 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" #include #include #include @@ -20,8 +20,8 @@ #include #include #include "Eigen/Dense" -#include "paddle/fluid/distributed/service/brpc_ps_client.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/string/string_helper.h" namespace paddle { diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/ps/service/graph_brpc_client.h similarity index 95% rename from paddle/fluid/distributed/service/graph_brpc_client.h rename to paddle/fluid/distributed/ps/service/graph_brpc_client.h index 2e5d5b6ee93cbe606ed87a4c947d993ecccfc59a..06e753d028baa2d9c0002620dc445d4204046180 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.h @@ -24,10 +24,10 @@ #include "brpc/channel.h" #include "brpc/controller.h" #include "brpc/server.h" -#include "paddle/fluid/distributed/service/brpc_ps_client.h" -#include "paddle/fluid/distributed/service/graph_brpc_server.h" -#include "paddle/fluid/distributed/service/ps_client.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" +#include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor_util.h" diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc similarity index 99% rename from paddle/fluid/distributed/service/graph_brpc_server.cc rename to paddle/fluid/distributed/ps/service/graph_brpc_server.cc index c1348e4804e2badcfc02c61dbbb0f83892cedefb..441f489fb3097cda51fc62dc35e93264a1f7caef 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/service/graph_brpc_server.h" -#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" #include // NOLINT #include #include "butil/endpoint.h" #include "iomanip" -#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/ps/service/graph_brpc_server.h similarity index 96% rename from paddle/fluid/distributed/service/graph_brpc_server.h rename to paddle/fluid/distributed/ps/service/graph_brpc_server.h index ecd78d28ca812a1e4c3b1429e891b3d0b7d5dd95..aee0190850753786ce0f083257458caf50a63d26 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.h @@ -20,10 +20,10 @@ #include #include -#include "paddle/fluid/distributed/service/brpc_ps_server.h" -#include "paddle/fluid/distributed/service/server.h" -#include "paddle/fluid/distributed/table/common_graph_table.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/ps/service/server.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { namespace distributed { class GraphBrpcServer : public PSServer { diff --git a/paddle/fluid/distributed/service/heter_client.cc b/paddle/fluid/distributed/ps/service/heter_client.cc similarity index 99% rename from paddle/fluid/distributed/service/heter_client.cc rename to paddle/fluid/distributed/ps/service/heter_client.cc index 95023704f9d51522386eaadee0f5c6fc01d1764d..e9e3ec1d9df471db2c8e54e5c0eaf71f9b0e9bd3 100644 --- a/paddle/fluid/distributed/service/heter_client.cc +++ b/paddle/fluid/distributed/ps/service/heter_client.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/split.h" diff --git a/paddle/fluid/distributed/service/heter_client.h b/paddle/fluid/distributed/ps/service/heter_client.h similarity index 95% rename from paddle/fluid/distributed/service/heter_client.h rename to paddle/fluid/distributed/ps/service/heter_client.h index 7ba47ad9a5df58a75cfe736a0c16a82f43ec9576..4f27ef75ea954dece5cd734108c64813b681c6f6 100644 --- a/paddle/fluid/distributed/service/heter_client.h +++ b/paddle/fluid/distributed/ps/service/heter_client.h @@ -25,9 +25,9 @@ limitations under the License. 
*/ #include "brpc/channel.h" #include "brpc/controller.h" #include "brpc/server.h" -#include "paddle/fluid/distributed/service/brpc_ps_client.h" -#include "paddle/fluid/distributed/service/brpc_utils.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/brpc_utils.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" diff --git a/paddle/fluid/distributed/service/heter_server.cc b/paddle/fluid/distributed/ps/service/heter_server.cc similarity index 98% rename from paddle/fluid/distributed/service/heter_server.cc rename to paddle/fluid/distributed/ps/service/heter_server.cc index fee3081f0329a92bb4903d8540dcadb73d663154..01afed3f1237515cf5c5e4ad01d329b424b5079e 100644 --- a/paddle/fluid/distributed/service/heter_server.cc +++ b/paddle/fluid/distributed/ps/service/heter_server.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/service/heter_server.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/string/split.h" namespace paddle { diff --git a/paddle/fluid/distributed/service/heter_server.h b/paddle/fluid/distributed/ps/service/heter_server.h similarity index 99% rename from paddle/fluid/distributed/service/heter_server.h rename to paddle/fluid/distributed/ps/service/heter_server.h index 094ee6036413d5f5469e5ab4bee14913d39aad97..86f83cb1fc4fe5ef881dbb2e8f88bd6d1bc67bc5 100644 --- a/paddle/fluid/distributed/service/heter_server.h +++ b/paddle/fluid/distributed/ps/service/heter_server.h @@ -25,8 +25,8 @@ limitations under the License. */ #include "brpc/channel.h" #include "brpc/controller.h" #include "brpc/server.h" -#include "paddle/fluid/distributed/service/brpc_utils.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/service/brpc_utils.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/distributed/service/ps_client.cc b/paddle/fluid/distributed/ps/service/ps_client.cc similarity index 90% rename from paddle/fluid/distributed/service/ps_client.cc rename to paddle/fluid/distributed/ps/service/ps_client.cc index d45f41a0f58de36bb1575c1b51663f8899fb215d..fd956b758de1ae00155b37bb4d2c9e8134da09e4 100644 --- a/paddle/fluid/distributed/service/ps_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_client.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" #include "glog/logging.h" -#include "paddle/fluid/distributed/service/brpc_ps_client.h" -#include "paddle/fluid/distributed/service/graph_brpc_client.h" -#include "paddle/fluid/distributed/service/ps_local_client.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/ps/service/ps_local_client.h" +#include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h similarity index 97% rename from paddle/fluid/distributed/service/ps_client.h rename to paddle/fluid/distributed/ps/service/ps_client.h index a408a0cc24fb51de041ecd4098b5434e9c5d91ca..7db8b0c1244594ba4483101536995f9e414382ab 100644 --- a/paddle/fluid/distributed/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -22,10 +22,10 @@ #include #include "paddle/fluid/distributed/common/cost_timer.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/service/env.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" -#include "paddle/fluid/distributed/table/accessor.h" -#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/platform/timer.h" namespace paddle { diff --git a/paddle/fluid/distributed/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc similarity index 98% rename from paddle/fluid/distributed/service/ps_local_client.cc rename to paddle/fluid/distributed/ps/service/ps_local_client.cc index e949b21b02e6d9842ffae377a17610757a65ae75..972cce135f189bee6dbba9e0b89baa288816827b 100644 --- a/paddle/fluid/distributed/service/ps_local_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/service/ps_local_client.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/service/ps_local_client.h" +#include "paddle/fluid/distributed/ps/table/table.h" //#define pslib_debug_dense_compress diff --git a/paddle/fluid/distributed/service/ps_local_client.h b/paddle/fluid/distributed/ps/service/ps_local_client.h similarity index 99% rename from paddle/fluid/distributed/service/ps_local_client.h rename to paddle/fluid/distributed/ps/service/ps_local_client.h index 9d2b01a45fe929097c06fb264f470974410e7f4e..e73974ac562861d86e679ddbc213335d10731281 100644 --- a/paddle/fluid/distributed/service/ps_local_client.h +++ b/paddle/fluid/distributed/ps/service/ps_local_client.h @@ -15,7 +15,7 @@ #include "brpc/channel.h" #include "brpc/controller.h" #include "brpc/server.h" -#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/service/ps_local_server.h b/paddle/fluid/distributed/ps/service/ps_local_server.h similarity index 95% rename from paddle/fluid/distributed/service/ps_local_server.h rename to paddle/fluid/distributed/ps/service/ps_local_server.h index 33b0b5fa796d7571e16a0f79fc6ce4de21b1e7a8..91f8bc4c9127115c9b5595270973d011778c6262 100644 --- a/paddle/fluid/distributed/service/ps_local_server.h +++ b/paddle/fluid/distributed/ps/service/ps_local_server.h @@ -16,7 +16,7 @@ #include #include -#include "paddle/fluid/distributed/service/server.h" +#include "paddle/fluid/distributed/ps/service/server.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/service/graph_py_service.cc b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc similarity index 99% rename from paddle/fluid/distributed/service/graph_py_service.cc rename to paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc index 8d7a822321a2b34aae12035c549ca23f21ad16a0..b2aece98071c146b23e897900b9c7f9736c2f2de 100644 --- a/paddle/fluid/distributed/service/graph_py_service.cc +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/service/graph_py_service.h" +#include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h" #include // NOLINT #include "butil/endpoint.h" #include "iomanip" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h similarity index 95% rename from paddle/fluid/distributed/service/graph_py_service.h rename to paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h index a860d1f58d3a23e79ca3d3a380b6067c13e76371..71b44f36d0107fa57b3beb51f29e7509d967f995 100644 --- a/paddle/fluid/distributed/service/graph_py_service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h @@ -32,11 +32,11 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/service/env.h" -#include "paddle/fluid/distributed/service/graph_brpc_client.h" -#include "paddle/fluid/distributed/service/graph_brpc_server.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" -#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/ps/service/ps_service/service.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/distributed/service/service.cc b/paddle/fluid/distributed/ps/service/ps_service/service.cc similarity index 96% rename from paddle/fluid/distributed/service/service.cc rename to paddle/fluid/distributed/ps/service/ps_service/service.cc index 698ceb1578f47eec83d0ae1efb3bbac6149de210..73793d2f9bd0ec8c5b485830059a730bb8d8559a 100644 --- a/paddle/fluid/distributed/service/service.cc +++ b/paddle/fluid/distributed/ps/service/ps_service/service.cc @@ -12,13 +12,13 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/distributed/ps/service/ps_service/service.h" #include #include #include #include -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #include "paddle/fluid/string/string_helper.h" using namespace std; // NOLINT diff --git a/paddle/fluid/distributed/service/service.h b/paddle/fluid/distributed/ps/service/ps_service/service.h similarity index 93% rename from paddle/fluid/distributed/service/service.h rename to paddle/fluid/distributed/ps/service/ps_service/service.h index 5c987267f9d2e581f0340afca1ec803a14ab6962..202c2407f15ae9fbf5087b55a65f6acd2957ddc5 100644 --- a/paddle/fluid/distributed/service/service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/service.h @@ -20,9 +20,9 @@ limitations under the License. 
*/ #include #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/service/ps_client.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" -#include "paddle/fluid/distributed/service/server.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/service/server.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/ps/service/sendrecv.proto similarity index 100% rename from paddle/fluid/distributed/service/sendrecv.proto rename to paddle/fluid/distributed/ps/service/sendrecv.proto diff --git a/paddle/fluid/distributed/service/server.cc b/paddle/fluid/distributed/ps/service/server.cc similarity index 92% rename from paddle/fluid/distributed/service/server.cc rename to paddle/fluid/distributed/ps/service/server.cc index e44876e3d2b789580152626ea8c290db0d369509..5f1974e3e610c6772457514759bff83db944bf52 100644 --- a/paddle/fluid/distributed/service/server.cc +++ b/paddle/fluid/distributed/ps/service/server.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/service/server.h" +#include "paddle/fluid/distributed/ps/service/server.h" #include "glog/logging.h" -#include "paddle/fluid/distributed/service/brpc_ps_server.h" -#include "paddle/fluid/distributed/service/graph_brpc_server.h" -#include "paddle/fluid/distributed/service/ps_local_server.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/ps/service/ps_local_server.h" +#include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/service/server.h b/paddle/fluid/distributed/ps/service/server.h similarity index 97% rename from paddle/fluid/distributed/service/server.h rename to paddle/fluid/distributed/ps/service/server.h index ebebedc80efb83f88a7e366b39a20e93961d0087..160d4a612829531d619c69a0cd5e9cd091f94868 100644 --- a/paddle/fluid/distributed/service/server.h +++ b/paddle/fluid/distributed/ps/service/server.h @@ -24,8 +24,8 @@ #include "google/protobuf/service.h" #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/service/env.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt similarity index 100% rename from paddle/fluid/distributed/table/CMakeLists.txt rename to paddle/fluid/distributed/ps/table/CMakeLists.txt diff --git a/paddle/fluid/distributed/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h similarity index 100% rename from paddle/fluid/distributed/table/accessor.h rename to paddle/fluid/distributed/ps/table/accessor.h diff --git a/paddle/fluid/distributed/table/barrier_table.cc b/paddle/fluid/distributed/ps/table/barrier_table.cc similarity index 97% rename from 
paddle/fluid/distributed/table/barrier_table.cc rename to paddle/fluid/distributed/ps/table/barrier_table.cc index 72394d15c54af5b346c70359b4dcde0ad2cd063c..25838e7ac2f047d9ff7bf20705459c6b1d60d26f 100644 --- a/paddle/fluid/distributed/table/barrier_table.cc +++ b/paddle/fluid/distributed/ps/table/barrier_table.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/table/common_table.h" +#include "paddle/fluid/distributed/ps/table/common_table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/common_dense_table.cc b/paddle/fluid/distributed/ps/table/common_dense_table.cc similarity index 99% rename from paddle/fluid/distributed/table/common_dense_table.cc rename to paddle/fluid/distributed/ps/table/common_dense_table.cc index b34b143a3ce37ef9a61c41143a2dfcb1fc614eaa..607469e2f7b0d5df79d4cb7477e0eaa3f4a8323a 100644 --- a/paddle/fluid/distributed/table/common_dense_table.cc +++ b/paddle/fluid/distributed/ps/table/common_dense_table.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/table/common_dense_table.h" +#include "paddle/fluid/distributed/ps/table/common_dense_table.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/distributed/table/common_dense_table.h b/paddle/fluid/distributed/ps/table/common_dense_table.h similarity index 91% rename from paddle/fluid/distributed/table/common_dense_table.h rename to paddle/fluid/distributed/ps/table/common_dense_table.h index c8813dc33053f0c8a42a1090b262c7fde79f5ed5..a4c0f29ddb8770c8adc0d6885929aaac8a028e90 100644 --- a/paddle/fluid/distributed/table/common_dense_table.h +++ b/paddle/fluid/distributed/ps/table/common_dense_table.h @@ -19,10 +19,10 @@ #include #include #include "Eigen/Dense" -#include "paddle/fluid/distributed/table/accessor.h" -#include "paddle/fluid/distributed/table/common_table.h" -#include "paddle/fluid/distributed/table/depends/dense.h" -#include "paddle/fluid/distributed/table/depends/initializers.h" +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/common_table.h" +#include "paddle/fluid/distributed/ps/table/depends/dense.h" +#include "paddle/fluid/distributed/ps/table/depends/initializers.h" #include "paddle/fluid/string/string_helper.h" namespace paddle { diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc similarity index 99% rename from paddle/fluid/distributed/table/common_graph_table.cc rename to paddle/fluid/distributed/ps/table/common_graph_table.cc index 042a4dee62bda6f80ba94d16eba8abab150aa0bc..54b98cb96ce5196bb5133f777b2571f4d3d43c6e 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/table/common_graph_table.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" #include #include #include #include #include #include "paddle/fluid/distributed/common/utils.h" -#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h similarity index 98% rename from paddle/fluid/distributed/table/common_graph_table.h rename to paddle/fluid/distributed/ps/table/common_graph_table.h index b76ab0ae9506027091ee3f0ab356f884b83346a3..4fc5b5ab633f9e0815461413829eeef7071b5718 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -36,11 +36,12 @@ #include #include #include -#include "paddle/fluid/distributed/table/accessor.h" -#include "paddle/fluid/distributed/table/common_table.h" -#include "paddle/fluid/distributed/table/graph/graph_node.h" -#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/common_table.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/pten/core/utils/rw_lock.h" + namespace paddle { namespace distributed { class GraphShard { diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/ps/table/common_sparse_table.cc similarity index 99% rename from paddle/fluid/distributed/table/common_sparse_table.cc rename to paddle/fluid/distributed/ps/table/common_sparse_table.cc index 143b24cf3264774c8852307f4071cd03a41010d1..b44d08b937a96c806142f5d7f1ba2ae0bcdb0f5e 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/common_sparse_table.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/table/common_sparse_table.h" +#include "paddle/fluid/distributed/ps/table/common_sparse_table.h" #include #include "glog/logging.h" diff --git a/paddle/fluid/distributed/table/common_sparse_table.h b/paddle/fluid/distributed/ps/table/common_sparse_table.h similarity index 92% rename from paddle/fluid/distributed/table/common_sparse_table.h rename to paddle/fluid/distributed/ps/table/common_sparse_table.h index a443710bf0fd82bc157db26184d5c2d87f191004..2e02d13e7e5aec928468dcfbde1cff5e0b9c514a 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/common_sparse_table.h @@ -24,13 +24,13 @@ #include #include #include "Eigen/Dense" -#include "paddle/fluid/distributed/table/accessor.h" -#include "paddle/fluid/distributed/table/common_table.h" -#include "paddle/fluid/distributed/table/depends/initializers.h" -#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" -#include "paddle/fluid/distributed/table/depends/sparse.h" -#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/common_table.h" +#include "paddle/fluid/distributed/ps/table/depends/initializers.h" +#include "paddle/fluid/distributed/ps/table/depends/large_scale_kv.h" +#include "paddle/fluid/distributed/ps/table/depends/sparse.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/pten/core/utils/rw_lock.h" #define PSERVER_SAVE_SUFFIX ".shard" @@ -110,7 +110,7 @@ struct Meta { class CommonSparseTable : public SparseTable { public: - CommonSparseTable() { rwlock_.reset(new framework::RWLock); } + CommonSparseTable() { rwlock_.reset(new pten::RWLock); } virtual ~CommonSparseTable() {} // unused method begin @@ -193,7 +193,7 @@ class CommonSparseTable : public SparseTable { std::shared_ptr optimizer_; std::vector> shard_values_; std::unordered_map> pull_reservoir_; - std::unique_ptr rwlock_{nullptr}; + std::unique_ptr rwlock_{nullptr}; }; } // namespace distributed diff --git a/paddle/fluid/distributed/table/common_table.h b/paddle/fluid/distributed/ps/table/common_table.h similarity index 98% rename from paddle/fluid/distributed/table/common_table.h rename to paddle/fluid/distributed/ps/table/common_table.h index bc7f17f5f245794cebf96a8a4bc69e0dce8ac997..bac826dfe0e20b42d5cc47467356bc5614383a44 100644 --- a/paddle/fluid/distributed/table/common_table.h +++ b/paddle/fluid/distributed/ps/table/common_table.h @@ -19,7 +19,7 @@ #include // NOLINT #include -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/distributed/common/utils.h" diff --git a/paddle/fluid/distributed/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc similarity index 99% rename from paddle/fluid/distributed/table/ctr_accessor.cc rename to paddle/fluid/distributed/ps/table/ctr_accessor.cc index 23144f39ade396613ff91b033dca364dd05a1a77..866bd8114ccea329123e16585c33366e759d5df8 100644 --- a/paddle/fluid/distributed/table/ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/table/ctr_accessor.h" +#include "paddle/fluid/distributed/ps/table/ctr_accessor.h" #include #include "glog/logging.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/distributed/table/ctr_accessor.h b/paddle/fluid/distributed/ps/table/ctr_accessor.h similarity index 98% rename from paddle/fluid/distributed/table/ctr_accessor.h rename to paddle/fluid/distributed/ps/table/ctr_accessor.h index 8be672e8e0d15e124d8babfb7dbc30b3d38f491f..1e31fec04649b19882269fa9cce5f5d7fb4978c1 100644 --- a/paddle/fluid/distributed/table/ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.h @@ -18,8 +18,8 @@ #include #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/table/accessor.h" -#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/depends/dense.h b/paddle/fluid/distributed/ps/table/depends/dense.h similarity index 100% rename from paddle/fluid/distributed/table/depends/dense.h rename to paddle/fluid/distributed/ps/table/depends/dense.h diff --git a/paddle/fluid/distributed/table/depends/feature_value.h b/paddle/fluid/distributed/ps/table/depends/feature_value.h similarity index 100% rename from paddle/fluid/distributed/table/depends/feature_value.h rename to paddle/fluid/distributed/ps/table/depends/feature_value.h diff --git a/paddle/fluid/distributed/table/depends/geo_recorder.h b/paddle/fluid/distributed/ps/table/depends/geo_recorder.h similarity index 100% rename from paddle/fluid/distributed/table/depends/geo_recorder.h rename to paddle/fluid/distributed/ps/table/depends/geo_recorder.h diff --git a/paddle/fluid/distributed/table/depends/initializers.h b/paddle/fluid/distributed/ps/table/depends/initializers.h similarity index 100% rename from paddle/fluid/distributed/table/depends/initializers.h rename to paddle/fluid/distributed/ps/table/depends/initializers.h diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/ps/table/depends/large_scale_kv.h similarity index 98% rename from paddle/fluid/distributed/table/depends/large_scale_kv.h rename to paddle/fluid/distributed/ps/table/depends/large_scale_kv.h index 3408ef5f91ad009a33c28fb4093a79075112c0bd..dc7766c7ceb06eb0f57094af1f4e11df72da18aa 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/ps/table/depends/large_scale_kv.h @@ -28,11 +28,10 @@ #include "butil/object_pool.h" #include "paddle/fluid/distributed/common/utils.h" -#include "paddle/fluid/distributed/table/depends/initializers.h" -#include "paddle/fluid/distributed/thirdparty/round_robin.h" +#include "paddle/fluid/distributed/ps/table/depends/initializers.h" +#include "paddle/fluid/distributed/ps/thirdparty/round_robin.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/threadpool.h" @@ -43,6 +42,7 @@ #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/pten/backends/dynload/port.h" +#include "paddle/pten/core/utils/rw_lock.h" namespace paddle { namespace distributed 
{ diff --git a/paddle/fluid/distributed/table/depends/rocksdb_warpper.h b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h similarity index 100% rename from paddle/fluid/distributed/table/depends/rocksdb_warpper.h rename to paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h diff --git a/paddle/fluid/distributed/table/depends/sparse.h b/paddle/fluid/distributed/ps/table/depends/sparse.h similarity index 99% rename from paddle/fluid/distributed/table/depends/sparse.h rename to paddle/fluid/distributed/ps/table/depends/sparse.h index 0e1d7ef03c129c2dc6f72d6e56fafb143d879bd4..d4ea7829e45f8326fdbe33ebb1c7c9cfa3d35f6f 100644 --- a/paddle/fluid/distributed/table/depends/sparse.h +++ b/paddle/fluid/distributed/ps/table/depends/sparse.h @@ -24,7 +24,7 @@ #include "gflags/gflags.h" #include "paddle/fluid/distributed/common/utils.h" -#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" +#include "paddle/fluid/distributed/ps/table/depends/large_scale_kv.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/depends/sparse_utils.h b/paddle/fluid/distributed/ps/table/depends/sparse_utils.h similarity index 100% rename from paddle/fluid/distributed/table/depends/sparse_utils.h rename to paddle/fluid/distributed/ps/table/depends/sparse_utils.h diff --git a/paddle/fluid/distributed/table/graph/graph_edge.cc b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc similarity index 93% rename from paddle/fluid/distributed/table/graph/graph_edge.cc rename to paddle/fluid/distributed/ps/table/graph/graph_edge.cc index 0ab0d5a76d6715401dd55ce7487634b72d452ddf..d1961b655d8829716b392c24ad6f1139089eb80d 100644 --- a/paddle/fluid/distributed/table/graph/graph_edge.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/table/graph/graph_edge.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_edge.h" #include namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/graph/graph_edge.h b/paddle/fluid/distributed/ps/table/graph/graph_edge.h similarity index 100% rename from paddle/fluid/distributed/table/graph/graph_edge.h rename to paddle/fluid/distributed/ps/table/graph/graph_edge.h diff --git a/paddle/fluid/distributed/table/graph/graph_node.cc b/paddle/fluid/distributed/ps/table/graph/graph_node.cc similarity index 98% rename from paddle/fluid/distributed/table/graph/graph_node.cc rename to paddle/fluid/distributed/ps/table/graph/graph_node.cc index 52c708be88488465b9f7c7abac27b6ddc3b991c1..366e607261f0c350c5097fc76e7bcc87b04ee878 100644 --- a/paddle/fluid/distributed/table/graph/graph_node.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h similarity index 98% rename from paddle/fluid/distributed/table/graph/graph_node.h rename to paddle/fluid/distributed/ps/table/graph/graph_node.h index b7a564ef7b0bb6a9f8b307edbb674ab6a32c7404..b838c2c1258d84fec8c4a25f5855209d5b428d4c 100644 --- a/paddle/fluid/distributed/table/graph/graph_node.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h @@ -18,7 +18,7 @@ #include #include #include -#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc similarity index 98% rename from paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc rename to paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc index 7a46433e3defbd51b68ed9f25e9e92f64b6d1afa..8186acec1be3da2abc18775e519ab38dac9f6dfd 100644 --- a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h" #include #include #include diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h similarity index 96% rename from paddle/fluid/distributed/table/graph/graph_weighted_sampler.h rename to paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h index 4a75a112697d322a2eb49a57d379889d34b6009f..c10617022decb2eaf3c8a9684fd3265e88722e76 100644 --- a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h @@ -18,7 +18,7 @@ #include #include #include -#include "paddle/fluid/distributed/table/graph/graph_edge.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_edge.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc similarity index 99% rename from paddle/fluid/distributed/table/memory_sparse_table.cc rename to paddle/fluid/distributed/ps/table/memory_sparse_table.cc index 086ddcafeb48d82b576cf525df4451fce8e77c10..7ce6e9005cf56ca295a6620a209551e303c112f3 100644 --- a/paddle/fluid/distributed/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -16,7 +16,7 @@ #include #include "paddle/fluid/distributed/common/cost_timer.h" -#include "paddle/fluid/distributed/table/memory_sparse_table.h" +#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" #include "paddle/fluid/framework/io/fs.h" #include "boost/lexical_cast.hpp" diff --git a/paddle/fluid/distributed/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h similarity index 94% rename from paddle/fluid/distributed/table/memory_sparse_table.h rename to 
diff --git a/paddle/fluid/distributed/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h
similarity index 94%
rename from paddle/fluid/distributed/table/memory_sparse_table.h
rename to paddle/fluid/distributed/ps/table/memory_sparse_table.h
index cb552beab13717c270c4a8495a6794c9dc912b08..5770f25f8f41dec286993d6b586959c8c0d3a0c0 100644
--- a/paddle/fluid/distributed/table/memory_sparse_table.h
+++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.h
@@ -24,9 +24,9 @@
 #include
 #include
 #include "Eigen/Dense"
-#include "paddle/fluid/distributed/table/accessor.h"
-#include "paddle/fluid/distributed/table/common_table.h"
-#include "paddle/fluid/distributed/table/depends/feature_value.h"
+#include "paddle/fluid/distributed/ps/table/accessor.h"
+#include "paddle/fluid/distributed/ps/table/common_table.h"
+#include "paddle/fluid/distributed/ps/table/depends/feature_value.h"
 #include "paddle/fluid/string/string_helper.h"
 
 #define PSERVER_SAVE_SUFFIX ".shard"
diff --git a/paddle/fluid/distributed/table/sparse_geo_table.cc b/paddle/fluid/distributed/ps/table/sparse_geo_table.cc
similarity index 97%
rename from paddle/fluid/distributed/table/sparse_geo_table.cc
rename to paddle/fluid/distributed/ps/table/sparse_geo_table.cc
index 655c4784156e84e7071b738adac8c24ade6bd08e..6ef4330113e8fee3d2cb0d3e541194ca7b600a82 100644
--- a/paddle/fluid/distributed/table/sparse_geo_table.cc
+++ b/paddle/fluid/distributed/ps/table/sparse_geo_table.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/distributed/table/sparse_geo_table.h"
+#include "paddle/fluid/distributed/ps/table/sparse_geo_table.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/table/sparse_geo_table.h b/paddle/fluid/distributed/ps/table/sparse_geo_table.h
similarity index 77%
rename from paddle/fluid/distributed/table/sparse_geo_table.h
rename to paddle/fluid/distributed/ps/table/sparse_geo_table.h
index 4ddb1fd706069f742debe23f6b7ec1b93692dec3..6eb913a02bc475a148ccb24797618339867f1121 100644
--- a/paddle/fluid/distributed/table/sparse_geo_table.h
+++ b/paddle/fluid/distributed/ps/table/sparse_geo_table.h
@@ -24,15 +24,15 @@
 #include
 #include "Eigen/Dense"
-#include "paddle/fluid/distributed/table/accessor.h"
-#include "paddle/fluid/distributed/table/common_sparse_table.h"
-#include "paddle/fluid/distributed/table/common_table.h"
-#include "paddle/fluid/distributed/table/depends/geo_recorder.h"
-#include "paddle/fluid/distributed/table/depends/initializers.h"
-#include "paddle/fluid/distributed/table/depends/large_scale_kv.h"
-#include "paddle/fluid/distributed/table/depends/sparse.h"
-#include "paddle/fluid/framework/rw_lock.h"
+#include "paddle/fluid/distributed/ps/table/accessor.h"
+#include "paddle/fluid/distributed/ps/table/common_sparse_table.h"
+#include "paddle/fluid/distributed/ps/table/common_table.h"
+#include "paddle/fluid/distributed/ps/table/depends/geo_recorder.h"
+#include "paddle/fluid/distributed/ps/table/depends/initializers.h"
+#include "paddle/fluid/distributed/ps/table/depends/large_scale_kv.h"
+#include "paddle/fluid/distributed/ps/table/depends/sparse.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/pten/core/utils/rw_lock.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/table/sparse_sgd_rule.cc b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc
similarity index 99%
rename from paddle/fluid/distributed/table/sparse_sgd_rule.cc
rename to paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc
index 614656a5a85d3029b82b8452b403253043bbc846..3e39d6f976d129903283060fb5111bd9eea03afc 100644
--- a/paddle/fluid/distributed/table/sparse_sgd_rule.cc
+++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/distributed/table/sparse_sgd_rule.h"
+#include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h"
 #include
 #include "glog/logging.h"
diff --git a/paddle/fluid/distributed/table/sparse_sgd_rule.h b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h
similarity index 100%
rename from paddle/fluid/distributed/table/sparse_sgd_rule.h
rename to paddle/fluid/distributed/ps/table/sparse_sgd_rule.h
diff --git a/paddle/fluid/distributed/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
similarity index 99%
rename from paddle/fluid/distributed/table/ssd_sparse_table.cc
rename to paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
index 41eca72cf80717cb5f0ad731d19a9da79009ec96..60514b4e19ffaf63f285e25f1355660fabe58d48 100644
--- a/paddle/fluid/distributed/table/ssd_sparse_table.cc
+++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #ifdef PADDLE_WITH_HETERPS
-#include "paddle/fluid/distributed/table/ssd_sparse_table.h"
+#include "paddle/fluid/distributed/ps/table/ssd_sparse_table.h"
 
 DEFINE_string(rocksdb_path, "database", "path of sparse table rocksdb file");
diff --git a/paddle/fluid/distributed/table/ssd_sparse_table.h b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h
similarity index 93%
rename from paddle/fluid/distributed/table/ssd_sparse_table.h
rename to paddle/fluid/distributed/ps/table/ssd_sparse_table.h
index 5e85fa3ce59d13c1f996f00a4b5b7dd9114ed764..f5e8a7067e0e041f9913bef8e43ad8b35bdb2783 100644
--- a/paddle/fluid/distributed/table/ssd_sparse_table.h
+++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/fluid/distributed/table/common_sparse_table.h"
-#include "paddle/fluid/distributed/table/depends/rocksdb_warpper.h"
+#include "paddle/fluid/distributed/ps/table/common_sparse_table.h"
+#include "paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h"
 #ifdef PADDLE_WITH_HETERPS
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc
similarity index 80%
rename from paddle/fluid/distributed/table/table.cc
rename to paddle/fluid/distributed/ps/table/table.cc
index ac026184b8864ddb4c0b8f9ac2dfa2cc7c4c0dc3..b9b5ff12fc97a74dc4ce7b835ba981d73ca86104 100644
--- a/paddle/fluid/distributed/table/table.cc
+++ b/paddle/fluid/distributed/ps/table/table.cc
@@ -12,22 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/distributed/table/table.h"
+#include "paddle/fluid/distributed/ps/table/table.h"
 
 #include "glog/logging.h"
 #include "paddle/fluid/distributed/common/registerer.h"
 
-#include "paddle/fluid/distributed/table/common_dense_table.h"
-#include "paddle/fluid/distributed/table/common_graph_table.h"
-#include "paddle/fluid/distributed/table/common_sparse_table.h"
-#include "paddle/fluid/distributed/table/sparse_geo_table.h"
+#include "paddle/fluid/distributed/ps/table/common_dense_table.h"
+#include "paddle/fluid/distributed/ps/table/common_graph_table.h"
+#include "paddle/fluid/distributed/ps/table/common_sparse_table.h"
+#include "paddle/fluid/distributed/ps/table/sparse_geo_table.h"
 #ifdef PADDLE_WITH_HETERPS
-#include "paddle/fluid/distributed/table/ssd_sparse_table.h"
+#include "paddle/fluid/distributed/ps/table/ssd_sparse_table.h"
 #endif
-#include "paddle/fluid/distributed/table/ctr_accessor.h"
-#include "paddle/fluid/distributed/table/memory_sparse_table.h"
-#include "paddle/fluid/distributed/table/tensor_accessor.h"
-#include "paddle/fluid/distributed/table/tensor_table.h"
+#include "paddle/fluid/distributed/ps/table/ctr_accessor.h"
+#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h"
+#include "paddle/fluid/distributed/ps/table/tensor_accessor.h"
+#include "paddle/fluid/distributed/ps/table/tensor_table.h"
 
 namespace paddle {
 namespace distributed {
@@ -83,9 +83,10 @@ int32_t Table::initialize_accessor() {
   LOG(INFO) << "accessor initializing: table_id: " << _config.table_id()
             << ", accessor_name: " << _config.accessor().accessor_class();
 
-  auto *accessor = CREATE_PSCORE_CLASS(
-      ValueAccessor,
-      _config.accessor().accessor_class()) if (accessor == NULL) {
+  auto *accessor =
+      CREATE_PSCORE_CLASS(ValueAccessor, _config.accessor().accessor_class());
+
+  if (accessor == NULL) {
     LOG(ERROR) << "accessor is unregisteg, table_id:" << _config.table_id()
                << ", accessor_name:" << _config.accessor().accessor_class();
     return -1;
diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/ps/table/table.h
similarity index 96%
rename from paddle/fluid/distributed/table/table.h
rename to paddle/fluid/distributed/ps/table/table.h
index f6568b4336fbbdee10236d4d8642cd6d1e28b2d9..da1bb668ccfa3c5f1a4f876a396847b6b3853772 100644
--- a/paddle/fluid/distributed/table/table.h
+++ b/paddle/fluid/distributed/ps/table/table.h
@@ -21,9 +21,9 @@
 #include
 #include
 #include
 #include "paddle/fluid/distributed/common/afs_warpper.h"
-#include "paddle/fluid/distributed/table/accessor.h"
-#include "paddle/fluid/distributed/table/depends/sparse_utils.h"
-#include "paddle/fluid/distributed/table/graph/graph_node.h"
+#include "paddle/fluid/distributed/ps/table/accessor.h"
+#include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h"
+#include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/distributed/table/tensor_accessor.cc b/paddle/fluid/distributed/ps/table/tensor_accessor.cc
similarity index 98%
rename from paddle/fluid/distributed/table/tensor_accessor.cc
rename to paddle/fluid/distributed/ps/table/tensor_accessor.cc
index b1ece52c133a7169273d1a2f62da4d34a01cb029..70a580c1e53a931dc2affd29db01b72691c68a39 100644
--- a/paddle/fluid/distributed/table/tensor_accessor.cc
+++ b/paddle/fluid/distributed/ps/table/tensor_accessor.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/distributed/table/tensor_accessor.h"
+#include "paddle/fluid/distributed/ps/table/tensor_accessor.h"
 #include "Eigen/Dense"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/table/tensor_accessor.h b/paddle/fluid/distributed/ps/table/tensor_accessor.h
similarity index 98%
rename from paddle/fluid/distributed/table/tensor_accessor.h
rename to paddle/fluid/distributed/ps/table/tensor_accessor.h
index 9f4e2bc0def4faf9b750e663bfda99e51b1a2347..5041b8fdf8733eff676b5fce1a972e39182df48e 100644
--- a/paddle/fluid/distributed/table/tensor_accessor.h
+++ b/paddle/fluid/distributed/ps/table/tensor_accessor.h
@@ -20,7 +20,7 @@
 #include "paddle/fluid/distributed/common/registerer.h"
 #include "paddle/fluid/distributed/ps.pb.h"
-#include "paddle/fluid/distributed/table/accessor.h"
+#include "paddle/fluid/distributed/ps/table/accessor.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/table/tensor_table.cc b/paddle/fluid/distributed/ps/table/tensor_table.cc
similarity index 98%
rename from paddle/fluid/distributed/table/tensor_table.cc
rename to paddle/fluid/distributed/ps/table/tensor_table.cc
index 0199f0528a9098b521ca11af522c6d189cc5169a..dfe778fa61e9e003ac1b3de48bf837be1d88ea22 100644
--- a/paddle/fluid/distributed/table/tensor_table.cc
+++ b/paddle/fluid/distributed/ps/table/tensor_table.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/distributed/table/tensor_table.h"
+#include "paddle/fluid/distributed/ps/table/tensor_table.h"
 
 DECLARE_double(eager_delete_tensor_gb);
 namespace paddle {
diff --git a/paddle/fluid/distributed/table/tensor_table.h b/paddle/fluid/distributed/ps/table/tensor_table.h
similarity index 99%
rename from paddle/fluid/distributed/table/tensor_table.h
rename to paddle/fluid/distributed/ps/table/tensor_table.h
index 080682d131420b5b57ce470b6b570fe24a1925b3..64d81327acc55ba0655bfc33efaa0d9d9f59649e 100644
--- a/paddle/fluid/distributed/table/tensor_table.h
+++ b/paddle/fluid/distributed/ps/table/tensor_table.h
@@ -24,7 +24,7 @@
 #include
 #include "paddle/fluid/distributed/common/utils.h"
-#include "paddle/fluid/distributed/table/table.h"
+#include "paddle/fluid/distributed/ps/table/table.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/distributed/thirdparty/round_robin.h b/paddle/fluid/distributed/ps/thirdparty/round_robin.h
similarity index 100%
rename from paddle/fluid/distributed/thirdparty/round_robin.h
rename to paddle/fluid/distributed/ps/thirdparty/round_robin.h
diff --git a/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt b/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6279b6aa95412cb282cbe6ad3e5edb7b33adf289
--- /dev/null
+++ b/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt
@@ -0,0 +1,9 @@
+
+get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
+
+set_source_files_properties(fleet.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_library(fleet
+    SRCS fleet.cc
+    DEPS framework_proto ps_framework_proto ps_service variable_helper scope op_registry fs shell ${RPC_DEPS})
+
+target_link_libraries(fleet z)
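With fleet.cc relocated under ps/wrapper and built as its own cc_library target, callers now pull the wrapper from the new path. A sketch of a consumer, under the assumption that FleetWrapper keeps a GetInstance() style singleton accessor (an assumption based on Paddle's wrapper convention; this diff does not show the class body):

    #include "paddle/fluid/distributed/ps/wrapper/fleet.h"

    void DemoInitFleet() {
      // Hypothetical sketch: the accessor name below is assumed, not taken
      // from this diff. The point is only the new include location.
      auto fleet = paddle::distributed::FleetWrapper::GetInstance();
      (void)fleet;  // initialization / pull / push calls would follow here
    }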
diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc
similarity index 99%
rename from paddle/fluid/distributed/fleet.cc
rename to paddle/fluid/distributed/ps/wrapper/fleet.cc
index 5caeab832a3e746720dae2104e6f91d325e101fd..0588dbdf0fc61298d33eeb6db5b3de91a6de8256 100644
--- a/paddle/fluid/distributed/fleet.cc
+++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/distributed/fleet.h"
-#include "paddle/fluid/distributed/service/communicator.h"
-#include "paddle/fluid/distributed/table/table.h"
+#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
+#include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
+#include "paddle/fluid/distributed/ps/table/table.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/ps/wrapper/fleet.h
similarity index 98%
rename from paddle/fluid/distributed/fleet.h
rename to paddle/fluid/distributed/ps/wrapper/fleet.h
index be7fe8ea23fac1da5c55916c5ccaa7108a2b2bf9..1ec580c4d920d45b3bf43981494fde460095bcae 100644
--- a/paddle/fluid/distributed/fleet.h
+++ b/paddle/fluid/distributed/ps/wrapper/fleet.h
@@ -23,8 +23,8 @@ limitations under the License. */
 #include
 #include
 
-#include "paddle/fluid/distributed/communicator_common.h"
-#include "paddle/fluid/distributed/service/service.h"
+#include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h"
+#include "paddle/fluid/distributed/ps/service/ps_service/service.h"
 #include "paddle/fluid/framework/archive.h"
 #include "paddle/fluid/framework/io/fs.h"
 #include "paddle/fluid/framework/io/shell.h"
@@ -49,7 +49,7 @@ class PSCore;
 
 using framework::LoDTensor;
 using framework::Scope;
-using framework::SelectedRows;
+using pten::SelectedRows;
 using framework::Variable;
 
 using RpcCtxMap = std::unordered_map;
diff --git a/paddle/fluid/distributed/test/barrier_table_test.cc b/paddle/fluid/distributed/test/barrier_table_test.cc
index 8dc2aa2299be7d72e07e98dae0352a9d791d3f99..0715f777fa5cb286ff393190a3d94dd86e74518a 100644
--- a/paddle/fluid/distributed/test/barrier_table_test.cc
+++ b/paddle/fluid/distributed/test/barrier_table_test.cc
@@ -17,8 +17,8 @@ limitations under the License. */
 #include
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/ps.pb.h"
-#include "paddle/fluid/distributed/table/common_table.h"
-#include "paddle/fluid/distributed/table/table.h"
+#include "paddle/fluid/distributed/ps/table/common_table.h"
+#include "paddle/fluid/distributed/ps/table/table.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
index f83c7bdb15fa1cad53a033f0444a6854910475e1..d7d9d1ed1bafd95e9d6db75c1e848693a3de55b1 100644
--- a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
@@ -17,8 +17,8 @@ limitations under the License. */
 #include  // NOLINT
 #include "gtest/gtest.h"
-#include "paddle/fluid/distributed/service/brpc_ps_client.h"
-#include "paddle/fluid/distributed/service/brpc_ps_server.h"
+#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
+#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/math/math_function.h"
diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
index f9c2b55eb4fee2e9bbaa49183b23192d04e61733..4f7b608c8bfb9366e010abda8fc72e68d72fa4e3 100644
--- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
@@ -18,9 +18,9 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/ps.pb.h"
-#include "paddle/fluid/distributed/service/brpc_ps_client.h"
-#include "paddle/fluid/distributed/service/brpc_ps_server.h"
-#include "paddle/fluid/distributed/service/env.h"
+#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
+#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h"
+#include "paddle/fluid/distributed/ps/service/env.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/distributed/test/brpc_utils_test.cc b/paddle/fluid/distributed/test/brpc_utils_test.cc
index 19198b4d207d157629dd3847040a19d9f30ba9b8..608f647d148e4243c6e683e5e600424dd79d8192 100644
--- a/paddle/fluid/distributed/test/brpc_utils_test.cc
+++ b/paddle/fluid/distributed/test/brpc_utils_test.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "gtest/gtest.h"
 
-#include "paddle/fluid/distributed/service/brpc_utils.h"
+#include "paddle/fluid/distributed/ps/service/brpc_utils.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
@@ -56,7 +56,7 @@ void CreateVarsOnScope(framework::Scope* scope, platform::Place* place,
   // var 3
   framework::Variable* var3 = scope->Var("x3");
-  auto* slr = var3->GetMutable<framework::SelectedRows>();
+  auto* slr = var3->GetMutable<pten::SelectedRows>();
   slr->set_height(564);
   auto* tensor3 = slr->mutable_value();
   auto* rows = slr->mutable_rows();
@@ -111,7 +111,7 @@ void RunMultiVarMsg(platform::Place place) {
   // check var3
   framework::Variable* var3 = scope_recv.FindVar("x3");
-  auto* slr = var3->GetMutable<framework::SelectedRows>();
+  auto* slr = var3->GetMutable<pten::SelectedRows>();
   EXPECT_EQ(slr->rows().size(), 564);
   for (int i = 0; i < 564; ++i) {
     EXPECT_EQ(slr->rows()[i], i);
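The test hunks above show that pten::SelectedRows is populated through the same mutators as the old framework type: set_height for the logical row count, mutable_rows for the stored row indices, and mutable_value for the backing tensor. Pulled out of the test, the construction pattern looks like this (a sketch using only calls that appear in the hunks; the allocation comment is illustrative):

    #include "paddle/fluid/framework/selected_rows_utils.h"
    #include "paddle/fluid/framework/variable.h"

    void DemoBuildSelectedRows(paddle::framework::Variable* var) {
      auto* slr = var->GetMutable<pten::SelectedRows>();
      slr->set_height(564);              // logical row count of the dense view
      auto* rows = slr->mutable_rows();  // indices of the rows actually stored
      for (int64_t i = 0; i < 564; ++i) {
        rows->push_back(i);
      }
      auto* value = slr->mutable_value();  // tensor holding the stored rows
      (void)value;  // value->mutable_data<float>(dims, place) would allocate
    }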
*/ -#include "paddle/fluid/distributed/table/ctr_accessor.h" +#include "paddle/fluid/distributed/ps/table/ctr_accessor.h" #include #include #include "gtest/gtest.h" #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/dense_table_test.cc b/paddle/fluid/distributed/test/dense_table_test.cc index 2e48b791dc8db510749aec7eed2184b8ef232381..c9a038e000e149f354db2bab72b48c04a721a5f6 100644 --- a/paddle/fluid/distributed/test/dense_table_test.cc +++ b/paddle/fluid/distributed/test/dense_table_test.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/table/common_dense_table.h" +#include "paddle/fluid/distributed/ps/table/common_dense_table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/feature_value_test.cc b/paddle/fluid/distributed/test/feature_value_test.cc index 9bd00dcc56fc2da43135d0ffc9fc36821fb59941..32e3944d35a1c69ce375db207427a535018da481 100644 --- a/paddle/fluid/distributed/test/feature_value_test.cc +++ b/paddle/fluid/distributed/test/feature_value_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/distributed/table/depends/feature_value.h" +#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" #include #include "gtest/gtest.h" diff --git a/paddle/fluid/distributed/test/geo_table_test.cc b/paddle/fluid/distributed/test/geo_table_test.cc index c9f15db3f788e13ca2f9a8279358358f1c50131b..b148c32f4968ce5a8c6b939978f7a983f15be702 100644 --- a/paddle/fluid/distributed/test/geo_table_test.cc +++ b/paddle/fluid/distributed/test/geo_table_test.cc @@ -21,11 +21,11 @@ limitations under the License. */ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/table/common_dense_table.h" -#include "paddle/fluid/distributed/table/common_sparse_table.h" -#include "paddle/fluid/distributed/table/depends/sparse_utils.h" -#include "paddle/fluid/distributed/table/sparse_geo_table.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/table/common_dense_table.h" +#include "paddle/fluid/distributed/ps/table/common_sparse_table.h" +#include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h" +#include "paddle/fluid/distributed/ps/table/sparse_geo_table.h" +#include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc index 714fbb1e4aa2d8abb10eebe464cd8ac11ad1dc18..e808d2a81539acc78a0c01155e1a63e357cead78 100644 --- a/paddle/fluid/distributed/test/graph_node_split_test.cc +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -21,16 +21,16 @@ limitations under the License. 
*/ #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/service/brpc_ps_client.h" -#include "paddle/fluid/distributed/service/brpc_ps_server.h" -#include "paddle/fluid/distributed/service/env.h" -#include "paddle/fluid/distributed/service/graph_brpc_client.h" -#include "paddle/fluid/distributed/service/graph_brpc_server.h" -#include "paddle/fluid/distributed/service/graph_py_service.h" -#include "paddle/fluid/distributed/service/ps_client.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" -#include "paddle/fluid/distributed/service/service.h" -#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" +#include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h" +#include "paddle/fluid/distributed/ps/service/ps_service/service.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 3a430d7a51068a3aa1fb341b3425830add5266cf..3243ebc389c851a2fb0c706280f2f6b8a24c1ef9 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -21,16 +21,16 @@ limitations under the License. 
*/ #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/service/brpc_ps_client.h" -#include "paddle/fluid/distributed/service/brpc_ps_server.h" -#include "paddle/fluid/distributed/service/env.h" -#include "paddle/fluid/distributed/service/graph_brpc_client.h" -#include "paddle/fluid/distributed/service/graph_brpc_server.h" -#include "paddle/fluid/distributed/service/graph_py_service.h" -#include "paddle/fluid/distributed/service/ps_client.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" -#include "paddle/fluid/distributed/service/service.h" -#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" +#include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h" +#include "paddle/fluid/distributed/ps/service/ps_service/service.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/distributed/test/large_scale_test.cc b/paddle/fluid/distributed/test/large_scale_test.cc index 6ce8723abeea1ef0cc15d197135d7d14dc2fa86f..13c1d132124ebefc45284c5ab2c47efac6ca6ed3 100644 --- a/paddle/fluid/distributed/test/large_scale_test.cc +++ b/paddle/fluid/distributed/test/large_scale_test.cc @@ -21,9 +21,9 @@ limitations under the License. */ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/table/common_sparse_table.h" -#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/table/common_sparse_table.h" +#include "paddle/fluid/distributed/ps/table/depends/large_scale_kv.h" +#include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/memory_sparse_table_test.cc b/paddle/fluid/distributed/test/memory_sparse_table_test.cc index 30a1107d64e3c4fcb8a0b091d4c11f11a81ad947..62992c74bfd23456959ce7531afd268e62ee9df3 100644 --- a/paddle/fluid/distributed/test/memory_sparse_table_test.cc +++ b/paddle/fluid/distributed/test/memory_sparse_table_test.cc @@ -21,8 +21,8 @@ limitations under the License. 
*/ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/table/memory_sparse_table.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" +#include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc index e86234f1bd9c7618eab0220cc41994b9e2855c7f..c895231d93ec5e3bb12d7d4eb2769a630016e2ef 100644 --- a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc +++ b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" #include #include #include "gtest/gtest.h" diff --git a/paddle/fluid/distributed/test/sparse_table_test.cc b/paddle/fluid/distributed/test/sparse_table_test.cc index 26bede392d6fade06dd29cf5e5a28295bb1cbc43..f13bab078a6b0c95ad580b36ad2d7c34d0b470e6 100644 --- a/paddle/fluid/distributed/test/sparse_table_test.cc +++ b/paddle/fluid/distributed/test/sparse_table_test.cc @@ -21,10 +21,10 @@ limitations under the License. */ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/table/common_dense_table.h" -#include "paddle/fluid/distributed/table/common_sparse_table.h" -#include "paddle/fluid/distributed/table/sparse_geo_table.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/table/common_dense_table.h" +#include "paddle/fluid/distributed/ps/table/common_sparse_table.h" +#include "paddle/fluid/distributed/ps/table/sparse_geo_table.h" +#include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/table_test.cc b/paddle/fluid/distributed/test/table_test.cc index 9b12717f73087751ab08b37f5232c434e14b3c31..6a29781158b838378468b1789b9eed0408c3435d 100644 --- a/paddle/fluid/distributed/test/table_test.cc +++ b/paddle/fluid/distributed/test/table_test.cc @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/table/common_sparse_table.h" -#include "paddle/fluid/distributed/table/sparse_geo_table.h" +#include "paddle/fluid/distributed/ps/table/common_sparse_table.h" +#include "paddle/fluid/distributed/ps/table/sparse_geo_table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/eager/legacy/infer_shape_context.h b/paddle/fluid/eager/legacy/infer_shape_context.h index 0979abc63d65870e1a2aabdc14116a55d786ed00..b43eda7abc345b0533cdc1bca017bc8311d90a79 100644 --- a/paddle/fluid/eager/legacy/infer_shape_context.h +++ b/paddle/fluid/eager/legacy/infer_shape_context.h @@ -197,9 +197,8 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { out_var->GetMutable(); out_lod_tensor->Resize(in_lod_tensor.dims()); } else { - auto& in_sele_rows = in_var->Get(); - auto out_sele_rows = - out_var->GetMutable(); + auto& in_sele_rows = in_var->Get(); + auto out_sele_rows = out_var->GetMutable(); out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims()); out_sele_rows->set_rows(in_sele_rows.rows()); out_sele_rows->set_height(in_sele_rows.height()); @@ -368,8 +367,8 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { "Input variable should not be null")); if (var->IsType()) { return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); } else { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Only LoDTensor/SelectedRows support 'GetDim', but Variables " @@ -385,8 +384,8 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { void SetDim(paddle::framework::Variable* var, const DDim& dim) { if (var->IsType()) { var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Variable type_id %s, expect LoDTensor/SelectedRows.")); diff --git a/paddle/fluid/eager/legacy/prepared_operator.cc b/paddle/fluid/eager/legacy/prepared_operator.cc index bd7e5c549872d32a93b7b1b303081a17ade167ff..3179b96807119eac9c200f79f4c7990c3026ad4f 100644 --- a/paddle/fluid/eager/legacy/prepared_operator.cc +++ b/paddle/fluid/eager/legacy/prepared_operator.cc @@ -32,8 +32,8 @@ const paddle::framework::Tensor* GetTensorFromVar( const paddle::framework::Variable& var) { if (var.IsType()) { return &(var.Get()); - } else if (var.IsType()) { - return &(var.Get().value()); + } else if (var.IsType()) { + return &(var.Get().value()); } else { return nullptr; } diff --git a/paddle/fluid/eager/legacy/tensor_helper.cc b/paddle/fluid/eager/legacy/tensor_helper.cc index 2ee2f9fefa9a342238e764d198124b5d74ee1dd0..fbf3205be2fe37ea5333d4295fd2d0fb0d76f811 100644 --- a/paddle/fluid/eager/legacy/tensor_helper.cc +++ b/paddle/fluid/eager/legacy/tensor_helper.cc @@ -32,7 +32,7 @@ void InitializeVariable(paddle::framework::Variable *var, if (var_type == paddle::framework::proto::VarType::LOD_TENSOR) { var->GetMutable(); } else if (var_type == paddle::framework::proto::VarType::SELECTED_ROWS) { - var->GetMutable(); + var->GetMutable(); } else if (var_type == paddle::framework::proto::VarType::FEED_MINIBATCH) { var->GetMutable(); } else if (var_type == paddle::framework::proto::VarType::FETCH_LIST) { @@ -72,9 +72,9 @@ void CopyVariable(const paddle::framework::Variable 
&src_var, auto &src_tensor = src_var.Get(); tmp_grad_tensor->set_lod(src_tensor.lod()); paddle::framework::TensorCopy(src_tensor, cpu_place, tmp_grad_tensor); - } else if (src_var.IsType()) { - auto &src_slr = src_var.Get(); - auto *tmp_grad_slr = dst_var->GetMutable(); + } else if (src_var.IsType()) { + auto &src_slr = src_var.Get(); + auto *tmp_grad_slr = dst_var->GetMutable(); tmp_grad_slr->set_rows(src_slr.rows()); tmp_grad_slr->set_height(src_slr.height()); auto &src_t = src_slr.value(); @@ -89,8 +89,8 @@ paddle::framework::proto::VarType::Type GetDtypeFromVar( const paddle::framework::Variable &var) { if (var.IsType()) { return var.Get().type(); - } else if (var.IsType()) { - return var.Get().value().type(); + } else if (var.IsType()) { + return var.Get().value().type(); } else { PADDLE_THROW(paddle::platform::errors::InvalidArgument( "Variable type is %s, expect LoDTensor or SelectedRows.", @@ -101,8 +101,8 @@ const paddle::platform::Place &GetPlaceFromVar( const paddle::framework::Variable &var) { if (var.IsType()) { return var.Get().place(); - } else if (var.IsType()) { - return var.Get().place(); + } else if (var.IsType()) { + return var.Get().place(); } else { PADDLE_THROW(paddle::platform::errors::InvalidArgument( "Variable type is %s, expect LoDTensor or SelectedRows.", diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 0220e5fd59476a836045fe0d4fcaa48bccdeb92f..ce63a58d41ae004298f239effa80fe1ce79c4eef 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -192,11 +192,11 @@ cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_va IF(WITH_XPU) cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils - pten pten_utils kernel_factory infershape_utils) + pten pten_utils kernel_factory infershape_utils op_utils) ELSE() cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils - pten pten_utils kernel_factory infershape_utils) + pten pten_utils kernel_factory infershape_utils op_utils) ENDIF() cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) @@ -383,7 +383,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto boost) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry proto_desc) -cc_library(selected_rows_utils SRCS selected_rows_utils.cc DEPS tensor) +cc_library(selected_rows_utils SRCS selected_rows_utils.cc DEPS selected_rows) cc_test(selected_rows_utils_test SRCS selected_rows_utils_test.cc DEPS selected_rows_utils) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type) @@ -393,10 +393,6 @@ cc_test(tuple_test SRCS tuple_test.cc ) cc_test(inlined_vector_test SRCS inlined_vector_test.cc) -if (NOT WIN32) -cc_test(rw_lock_test SRCS rw_lock_test.cc) -endif (NOT WIN32) - cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) @@ -408,7 +404,7 @@ cc_test(save_load_util_test SRCS 
save_load_util_test.cc DEPS save_load_util tens cc_library(generator SRCS generator.cc DEPS enforce place) cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows_utils place pten var_type_traits pten_api_utils op_info) -cc_library(infershape_utils SRCS infershape_utils.cc DEPS lod_tensor selected_rows_utils attribute place pten var_type_traits pten pten_api_utils op_info shape_inference) +cc_library(infershape_utils SRCS infershape_utils.cc DEPS pten_utils attribute shape_inference op_utils) # Get the current working branch execute_process( diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 1b5db8380514d552ed56ae3c65a338a082f02bdc..df1840794af3bbef1a2bdf8c2073c89991cdf9fd 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -24,7 +24,7 @@ #include "paddle/fluid/platform/timer.h" #ifdef PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/fleet.h" +#include "paddle/fluid/distributed/ps/wrapper/fleet.h" #endif #if defined _WIN32 || defined __APPLE__ diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index d8c372becf1b45895920c5d2783f427c2b8d352b..22a2847c1d834fee9fc3012957ddfc70130e41d3 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -120,9 +120,9 @@ void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, tran_lod_tensor->set_format(in_lod_tensor.format()); #endif tran_lod_tensor->ShareDataWith(tensor); - } else if (in_var.IsType()) { - auto &in_selected_rows = in_var.Get(); - auto *trans_selected_rows = out_var->GetMutable(); + } else if (in_var.IsType()) { + auto &in_selected_rows = in_var.Get(); + auto *trans_selected_rows = out_var->GetMutable(); trans_selected_rows->set_height(in_selected_rows.height()); trans_selected_rows->set_rows(in_selected_rows.rows()); trans_selected_rows->mutable_value()->ShareDataWith(tensor); diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 052860cd0ab40479df7672ae32ebc6e75965b97b..4511578f34ec27b31736b2a762991e52e5a66bd4 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/variable_helper.h" #if defined PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 2e82fe22dba73149e722958d6027ee6ba52f12d8..1435a82c0f528ad90a2da7958c602670a33ad1e7 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -237,7 +237,7 @@ struct TestBroadcastOpHandle { PADDLE_ENFORCE_NOT_NULL( var, platform::errors::NotFound("Variable %s is not found in scope.", varname)); - auto selected_rows = var->GetMutable(); + auto selected_rows = var->GetMutable(); auto value = selected_rows->mutable_value(); value->mutable_data(kDims, place_list_[input_scope_idx]); selected_rows->set_height(height); @@ -256,7 +256,7 @@ struct TestBroadcastOpHandle { PADDLE_ENFORCE_NOT_NULL( var, platform::errors::NotFound("Variable %s is not found in scope.", varname)); - auto& selected_rows = var->Get(); + auto& selected_rows = var->Get(); 
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h
index 2e82fe22dba73149e722958d6027ee6ba52f12d8..1435a82c0f528ad90a2da7958c602670a33ad1e7 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -237,7 +237,7 @@ struct TestBroadcastOpHandle {
     PADDLE_ENFORCE_NOT_NULL(
         var, platform::errors::NotFound("Variable %s is not found in scope.",
                                         varname));
-    auto selected_rows = var->GetMutable<framework::SelectedRows>();
+    auto selected_rows = var->GetMutable<pten::SelectedRows>();
     auto value = selected_rows->mutable_value();
     value->mutable_data(kDims, place_list_[input_scope_idx]);
     selected_rows->set_height(height);
@@ -256,7 +256,7 @@ struct TestBroadcastOpHandle {
     PADDLE_ENFORCE_NOT_NULL(
         var, platform::errors::NotFound("Variable %s is not found in scope.",
                                         varname));
-    auto& selected_rows = var->Get<framework::SelectedRows>();
+    auto& selected_rows = var->Get<pten::SelectedRows>();
     auto rt = selected_rows.value();
     PADDLE_ENFORCE_EQ(selected_rows.height(), height,
                       platform::errors::InvalidArgument(
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
index 59614e89c1344e76a1e7042e27dbff41fccb7799..42b87f3853c58ab336474773f7eeb2501b4fd971 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -129,9 +129,10 @@ void EagerDeletionOpHandle::RunImpl() {
     if (var->IsType()) {
       garbages.emplace_back(var->GetMutable()->MoveMemoryHolder());
-    } else if (var->IsType<framework::SelectedRows>()) {
-      garbages.emplace_back(
-          var->GetMutable<framework::SelectedRows>()->mutable_value()->MoveMemoryHolder());
+    } else if (var->IsType<pten::SelectedRows>()) {
+      garbages.emplace_back(var->GetMutable<pten::SelectedRows>()
+                                ->mutable_value()
+                                ->MoveMemoryHolder());
     } else if (var->IsType()) {
       auto *tensor_arr = var->GetMutable();
       for (auto &t : *tensor_arr) {
diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc
index 74f5deed45557c96d7d2e84034d5fddf05892079..430f55793b73606ec0087dd4e8823d80587da618 100644
--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -64,14 +64,14 @@ void GatherOpHandle::RunImpl() {
       platform::errors::NotFound("The variable '%s' is not found in the scope.",
                                  in_0_handle->name()));
 
-  PADDLE_ENFORCE_EQ(pre_in_var->IsType<framework::SelectedRows>(), true,
+  PADDLE_ENFORCE_EQ(pre_in_var->IsType<pten::SelectedRows>(), true,
                     platform::errors::Unimplemented(
                         "Currently, gather_op only supports SelectedRows."));
 
   // Wait input done, this Wait is asynchronous operation
   WaitInputVarGenerated();
 
-  auto &pre_in_value = pre_in_var->Get<framework::SelectedRows>();
+  auto &pre_in_value = pre_in_var->Get<pten::SelectedRows>();
   std::vector out_rows;
   std::vector in_tensors;
@@ -85,7 +85,7 @@ void GatherOpHandle::RunImpl() {
         "The variable '%s' is not found in the scope.", in_handle->name()));
     VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var);
 
-    auto &in_sr_value = in_var->Get<framework::SelectedRows>();
+    auto &in_sr_value = in_var->Get<pten::SelectedRows>();
 
     auto &in_sr_rows = in_sr_value.rows();
     out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end());
@@ -108,7 +108,7 @@ void GatherOpHandle::RunImpl() {
       out_var,
       platform::errors::NotFound("The variable '%s' is not found in the scope.",
                                  out_var_handle->name()));
-  auto out_value = out_var->GetMutable<framework::SelectedRows>();
+  auto out_value = out_var->GetMutable<pten::SelectedRows>();
   out_value->set_height(pre_in_value.height());
   out_value->set_rows(out_rows);
   size_t rows = out_rows.size();
diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc
index 38e20127f1612e74bd4dc6117680a3df8cc8244f..b46168bf8fb314eaf0234ebf5898a790fea714e1 100644
--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -146,7 +146,7 @@ struct TestGatherOpHandle {
     PADDLE_ENFORCE_NOT_NULL(
         in_var, platform::errors::NotFound(
                     "The variable '%s' is not found in the scope.", "input"));
-    auto in_selected_rows = in_var->GetMutable<framework::SelectedRows>();
+    auto in_selected_rows = in_var->GetMutable<pten::SelectedRows>();
     auto value = in_selected_rows->mutable_value();
     value->mutable_data(kDims, gpu_list_[input_scope_idx]);
@@ -162,10 +162,10 @@ struct TestGatherOpHandle {
     PADDLE_ENFORCE_NOT_NULL(
         out_var, platform::errors::NotFound(
                      "The variable '%s' is not found in the scope.", "out"));
-    auto out_selected_rows = out_var->GetMutable<framework::SelectedRows>();
+    auto out_selected_rows = out_var->GetMutable<pten::SelectedRows>();
 
     auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input");
-    auto in_selected_rows = in_var->GetMutable<framework::SelectedRows>();
+    auto in_selected_rows = in_var->GetMutable<pten::SelectedRows>();
 
     out_selected_rows->mutable_value()->ShareDataWith(
         in_selected_rows->value());
@@ -177,7 +177,7 @@ struct TestGatherOpHandle {
 
     p::CPUPlace cpu_place;
 
-    auto& out_select_rows = out_var->Get<framework::SelectedRows>();
+    auto& out_select_rows = out_var->Get<pten::SelectedRows>();
     auto rt = out_select_rows.value();
 
     PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index db3eaece3569f19cc8297cbcf94df977c4e013ce..f57136e1f0ed94b3d573a36aa8367e227f7ead24 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -321,8 +321,8 @@ void CheckVarHasNanOrInf(const std::string& op_type,
   const Tensor* tensor{nullptr};
   if (var->IsType()) {
     tensor = &var->Get();
-  } else if (var->IsType<framework::SelectedRows>()) {
-    tensor = &var->Get<framework::SelectedRows>().value();
+  } else if (var->IsType<pten::SelectedRows>()) {
+    tensor = &var->Get<pten::SelectedRows>().value();
   } else {
     VLOG(10) << var_name << " var_name need not to check";
     return;
@@ -468,8 +468,8 @@ void PrintNpuVarInfo(const std::string& op_type, const std::string& var_name,
   const Tensor* tensor{nullptr};
   if (var->IsType()) {
     tensor = &var->Get();
-  } else if (var->IsType<framework::SelectedRows>()) {
-    tensor = &var->Get<framework::SelectedRows>().value();
+  } else if (var->IsType<pten::SelectedRows>()) {
+    tensor = &var->Get<pten::SelectedRows>().value();
   } else {
     VLOG(10) << var_name << " var_name need not to check";
     return;
diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h
index 27f55e237f51689bc5dfcc1d5bcc92496aa506cb..427b981e7cda27269f9da5f007464a5fd97d28c2 100644
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -275,10 +275,8 @@ struct OpInfoFiller {
 template
 struct OpInfoFiller {
   void operator()(const char* op_type, OpInfo* info) const {
-    PADDLE_ENFORCE_EQ(
-        info->infer_shape_, nullptr,
-        platform::errors::AlreadyExists(
-            "Duplicate InferShapeFN of %s has been registered", op_type));
+    // Note: if fill InferShapeFN by this Filler, the infershape here
+    // will overwrite the op->InferShape func registered in kOperator Filler
     info->infer_shape_ = [](InferShapeContext* ctx) {
       T inference;
       inference(ctx);
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
index 583c34494bca4c64c033cde17b031851ae96f209..6d136055da7824a30a086d83a5e65f9674fa9cdb 100644
--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -20,6 +20,11 @@
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows_utils.h"
+
+namespace pten {
+class SelectedRows;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -96,10 +101,10 @@ struct ReduceBufferData {
 
 struct GatherLocalSelectedRowsFunctor {
   GatherLocalSelectedRowsFunctor(
-      const std::vector<const SelectedRows *> &src_selected_rows,
+      const std::vector<const pten::SelectedRows *> &src_selected_rows,
       const std::vector &in_places,
       const std::map &dev_ctxes,
-      const platform::Place &out_place, SelectedRows *dst_selected_rows)
+      const platform::Place &out_place, pten::SelectedRows *dst_selected_rows)
       : dev_ctxes_(dev_ctxes),
         in_places_(in_places),
         out_place_(out_place),
@@ -147,7 +152,7 @@ struct GatherLocalSelectedRowsFunctor {
   std::vector in_tensors_;
   platform::Place out_place_;
- SelectedRows *dst_selected_rows_; + pten::SelectedRows *dst_selected_rows_; }; } // namespace details diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 6493ef540ccbe0f70ea47d817907a75a001a7f94..5cf84a04958b82b91367e6fec477af6467fadd4f 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -114,10 +114,10 @@ void ReduceOpHandle::RunImpl() { t_out_p = platform::CPUPlace(); } - if (pre_in_var->IsType()) { + if (pre_in_var->IsType()) { this->RunAndRecordEvent([&] { - std::vector in_selected_rows = - GetInputValues(in_var_handles, var_scopes); + std::vector in_selected_rows = + GetInputValues(in_var_handles, var_scopes); const CollectiveContext &collective_context = *CollectiveContext::GetInstance(); @@ -130,7 +130,7 @@ void ReduceOpHandle::RunImpl() { platform::is_cpu_place(t_out_p)) { GatherLocalSelectedRowsFunctor functor( in_selected_rows, in_places, dev_ctxes_, t_out_p, - out_var->GetMutable()); + out_var->GetMutable()); WaitInputVarGenerated(); functor(); return; diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index e9c913b0c8255065f5a603560c36830c119d967a..5b1267d0970831431a91a4e8bae493594b929a6d 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -27,7 +27,6 @@ namespace paddle { namespace framework { -class SelectedRows; namespace details { struct VarHandle; @@ -131,11 +130,11 @@ struct ReduceOpHandle : public OpHandleBase { defined PADDLE_WITH_DISTRIBUTE template void GatherSelectedRows( - const std::vector &src_selecte_rows_, + const std::vector &src_selecte_rows_, const std::vector &in_places, const std::map &dev_ctxes, VarHandle *out_var_handle, const platform::Place &out_place, - SelectedRows *dst_selecte_rows); + pten::SelectedRows *dst_selecte_rows); #endif void Wait( diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 35dba488454725ddc889f62a1c7511e38bd570ff..4931c64fdf83f7577f5e7c427c384eca4b83ed5f 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -174,7 +174,7 @@ struct TestReduceOpHandle { PADDLE_ENFORCE_NOT_NULL( in_var, platform::errors::NotFound( "Variable %s is not found in scope.", "input")); - auto in_selected_rows = in_var->GetMutable(); + auto in_selected_rows = in_var->GetMutable(); auto value = in_selected_rows->mutable_value(); value->mutable_data(kDims, gpu_list_[input_scope_idx]); @@ -190,10 +190,10 @@ struct TestReduceOpHandle { PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound( "Variable %s is not found in scope.", "out")); - auto out_selected_rows = out_var->GetMutable(); + auto out_selected_rows = out_var->GetMutable(); auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); - auto in_selected_rows = in_var->GetMutable(); + auto in_selected_rows = in_var->GetMutable(); out_selected_rows->mutable_value()->ShareDataWith( in_selected_rows->value()); @@ -205,7 +205,7 @@ struct TestReduceOpHandle { p::CPUPlace cpu_place; - auto &out_select_rows = out_var->Get(); + auto &out_select_rows = out_var->Get(); auto rt = out_select_rows.value(); PADDLE_ENFORCE_EQ(out_select_rows.height(), height, diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.cc 
b/paddle/fluid/framework/details/scope_buffered_monitor.cc index 7354824aae5996da77bca2893872300f623bc91f..2efe1c9555857f6e1be27c135c3c613bb2981876 100644 --- a/paddle/fluid/framework/details/scope_buffered_monitor.cc +++ b/paddle/fluid/framework/details/scope_buffered_monitor.cc @@ -33,9 +33,9 @@ static void GetTensors(Variable *var, std::unordered_set<Tensor *> *tensor_set) { if (var->IsType<LoDTensor>() && var->Get<LoDTensor>().IsInitialized()) { tensor_set->insert(var->GetMutable<LoDTensor>()); - } else if (var->IsType<SelectedRows>() && - var->Get<SelectedRows>().value().IsInitialized()) { - tensor_set->insert(var->GetMutable<SelectedRows>()->mutable_value()); + } else if (var->IsType<pten::SelectedRows>() && + var->Get<pten::SelectedRows>().value().IsInitialized()) { + tensor_set->insert(var->GetMutable<pten::SelectedRows>()->mutable_value()); } else if (var->IsType<LoDTensorArray>()) { auto *tensor_arr = var->GetMutable<LoDTensorArray>(); for (auto &t : *tensor_arr) {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 265e346a9d8dfb0925783b812174410bb11ae86d..c8a6cd25f0fcbe9724972225d03b539285b7225f 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -18,7 +18,7 @@ #include "paddle/fluid/platform/profiler.h" #if defined PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #endif namespace paddle {
diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index 4315b6b0fc245a93f6adea9224ba45c40f0a3368..9979d2ee205311517d5047012ec52e3a1d2d9559 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -33,8 +33,8 @@ template <typename Func> static void VisitVariable(Variable* var, Func* func) { if (var->IsType<LoDTensor>()) { (*func)(var->GetMutable<LoDTensor>()); - } else if (var->IsType<SelectedRows>()) { - (*func)(var->GetMutable<SelectedRows>()); + } else if (var->IsType<pten::SelectedRows>()) { + (*func)(var->GetMutable<pten::SelectedRows>()); } else { PADDLE_THROW(platform::errors::Unimplemented( "VisitVariable is not supported for type %s.", @@ -46,8 +46,8 @@ template <typename Func> static void VisitVariable(const Variable& var, Func* func) { if (var.IsType<LoDTensor>()) { (*func)(var.Get<LoDTensor>()); - } else if (var.IsType<SelectedRows>()) { - (*func)(var.Get<SelectedRows>()); + } else if (var.IsType<pten::SelectedRows>()) { + (*func)(var.Get<pten::SelectedRows>()); } else { PADDLE_THROW(platform::errors::Unimplemented( "VisitVariable is not supported for type %s.", ToTypeName(var.Type()))); @@ -59,7 +59,7 @@ struct TensorVisitor { void operator()(LoDTensor* tensor) { result_ = tensor; } - void operator()(SelectedRows* selected_rows) { + void operator()(pten::SelectedRows* selected_rows) { result_ = selected_rows->mutable_value(); } @@ -85,8 +85,8 @@ struct ShareDimsAndLoDVisitor { tensor->Resize(val.dims()); } - void operator()(const SelectedRows& val) { - auto* selected_rows = trg_->GetMutable<SelectedRows>(); + void operator()(const pten::SelectedRows& val) { + auto* selected_rows = trg_->GetMutable<pten::SelectedRows>(); selected_rows->set_rows(val.rows()); selected_rows->set_height(val.height()); selected_rows->mutable_value()->Resize(val.value().dims()); @@ -131,8 +131,8 @@ struct EnforceShapeAndDTypeEQVisitor { "The layout of the two variables' tensors tensor is not equal.")); } - void operator()(const SelectedRows& src) { - auto& selected_rows = dst_->Get<SelectedRows>(); + void operator()(const pten::SelectedRows& src) { + auto& selected_rows = dst_->Get<pten::SelectedRows>(); PADDLE_ENFORCE_EQ( src.place().GetType(), selected_rows.place().GetType(), platform::errors::PreconditionNotMet( diff --git
a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index 83d5a2efa342e57a3124651324824fddb287cc01..bea23469f113a94489d3ec53206b9b68b433c8e9 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -815,8 +815,8 @@ void DownpourWorker::TrainFiles() { if (var->IsType()) { tensor = var->GetMutable(); len = tensor->numel(); - } else if (var->IsType()) { - auto selected_rows = var->GetMutable(); + } else if (var->IsType()) { + auto selected_rows = var->GetMutable(); tensor = selected_rows->mutable_value(); len = tensor->numel(); } diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index 6e5578a2d12b4c29445c1ee4597431a647a13c9a..00d2149cb184b3766f4e68e179a280c0c98640e5 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -147,9 +147,10 @@ void DeleteUnusedTensors(const Scope &scope, VLOG(2) << "Erase variable " << var_name; if (var->IsType()) { garbages.emplace_back(var->GetMutable()->MoveMemoryHolder()); - } else if (var->IsType()) { - garbages.emplace_back( - var->GetMutable()->mutable_value()->MoveMemoryHolder()); + } else if (var->IsType()) { + garbages.emplace_back(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder()); } else if (var->IsType()) { auto *lod_tensor_arr = var->GetMutable(); for (auto &t : *lod_tensor_arr) { diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 3e8b0cfbc31f3551bcd6101e7ba48927b9600553..a88ffbe3d9637a8c6d3de9e065bd380d0c69c280 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -26,10 +26,10 @@ limitations under the License. */ #endif #ifdef PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" +#include "paddle/fluid/distributed/ps/table/depends/large_scale_kv.h" #endif -#include "paddle/fluid/distributed/thirdparty/round_robin.h" +#include "paddle/fluid/distributed/ps/thirdparty/round_robin.h" #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 509b43431b572539608cd976f67d1cab90414856..b3173a1386582a27faccdcdc49d0c5013204901f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -21,9 +21,9 @@ limitations under the License. 
*/ #include "common_value.h" // NOLINT #endif #ifdef PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" +#include "paddle/fluid/distributed/ps/table/depends/large_scale_kv.h" #endif -#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/pten/core/utils/rw_lock.h" #include "thrust/pair.h" // #include "cudf/concurrent_unordered_map.cuh.h" #include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" @@ -81,7 +81,7 @@ class HashTable { << " push value size: " << push_grad_value_size_; } - std::unique_ptr rwlock_{nullptr}; + std::unique_ptr rwlock_{nullptr}; private: TableContainer* container_; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index dec73574685585747178bd0c2c65d39090eb6943..72e628223e31782b2dcfb74567654708ffbd2d57 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -121,7 +121,7 @@ __global__ void dy_mf_update_kernel(Table* table, template HashTable::HashTable(size_t capacity) { container_ = new TableContainer(capacity); - rwlock_.reset(new RWLock); + rwlock_.reset(new pten::RWLock); } template diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index aa01c5f769ae252ff04ef7e2526c473d6604403a..ef5cd8466f1759484f8541546235ac44dd827037 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -29,7 +29,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif -#include "paddle/fluid/distributed/thirdparty/round_robin.h" +#include "paddle/fluid/distributed/ps/thirdparty/round_robin.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/fleet/heter_context.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" @@ -43,7 +43,7 @@ limitations under the License. */ #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/heter_pipeline_trainer.cc b/paddle/fluid/framework/heter_pipeline_trainer.cc index cb939f38ff3d9678e09e5cae433317031a47d78f..13eb78874c395e8ff2baa01d2fd0bd9f2df5c42c 100644 --- a/paddle/fluid/framework/heter_pipeline_trainer.cc +++ b/paddle/fluid/framework/heter_pipeline_trainer.cc @@ -13,7 +13,7 @@ // limitations under the License. #if defined(PADDLE_WITH_PSCORE) -#include "paddle/fluid/distributed/service/heter_server.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc index a4e582c8fed13d93ec54ed29ad26ebe3d109aa09..8e94bb1d0e1498bfa69db565de0de36ffce63cb3 100644 --- a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -11,7 +11,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_PSCORE) #include -#include "paddle/fluid/distributed/service/heter_server.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/platform/cpu_helper.h" diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index f4660751b582a460f8079173a9bb859e26711344..0b4c8f4a719afcb0aee39fb369516b9b47e52a71 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/platform/lodtensor_printer.h" #if defined PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 9a91a5208ebbcb97fc1770bc3bfd5b860716c135..08b945159ad7ee201514845af2cb8d8f5876664c 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -15,11 +15,14 @@ limitations under the License. */ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/pten/core/compat/arg_map_context.h" +#include "paddle/pten/core/compat/op_utils.h" #include "paddle/pten/core/compat_utils.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/infermeta_utils.h" #include "paddle/pten/core/meta_tensor.h" namespace paddle { @@ -186,5 +189,40 @@ class CompatMetaTensor : public pten::MetaTensor { bool is_runtime_; }; +pten::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, + const std::string& op_type) { + // 1. get kernel args + InitDefaultKernelSignatureMap(); + auto arg_map_fn = pten::OpUtilsMap::Instance().GetArgumentMappingFn(op_type); + PADDLE_ENFORCE_NOT_NULL( + arg_map_fn, platform::errors::NotFound( + "The ArgumentMappingFn of %s op is not found.", op_type)); + InferShapeArgumentMappingContext arg_map_context(*ctx); + auto signature = arg_map_fn(arg_map_context); + VLOG(3) << "BuildInferMetaContext: op kernel signature - " << signature; + + // 2. 
build infermeta context + pten::InferMetaContext infer_meta_context(ctx->IsRuntime()); + + auto& input_names = std::get<0>(signature.args); + auto& output_names = std::get<2>(signature.args); + // TODO(chenweihang): support attrs in next pr + // auto& attr_names = std::get<1>(signature.args); + + // TODO(chenweihang): support multiple inputs and outputs + for (auto& in_name : input_names) { + infer_meta_context.EmplaceBackInput(std::make_shared<CompatMetaTensor>( + ctx->GetInputVarPtrs(in_name)[0], ctx->IsRuntime())); + } + for (auto& out_name : output_names) { + infer_meta_context.EmplaceBackOutput(std::make_shared<CompatMetaTensor>( + ctx->GetOutputVarPtrs(out_name)[0], ctx->IsRuntime())); + } + // TODO(chenweihang): support attrs later + + return infer_meta_context; +} + } // namespace framework } // namespace paddle
diff --git a/paddle/fluid/framework/infershape_utils.h b/paddle/fluid/framework/infershape_utils.h index f943989523e50d1361aebbdebe771811cdb358f3..fbfb44e27c8b104cfefb8256aedbb3af8a4caf8f 100644 --- a/paddle/fluid/framework/infershape_utils.h +++ b/paddle/fluid/framework/infershape_utils.h @@ -26,7 +26,6 @@ class InferMetaContext; namespace paddle { namespace framework { -// TODO(chenweihang): impl this function in next PR pten::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, const std::string& op_type);
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0d9c460628e17186152462c313937aff5490e723..323e743087ffbc0f979768bb9a8b8dd7eaec25b2 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -132,6 +132,22 @@ if(WITH_MKLDNN) pass_library(multi_gru_seq_fuse_pass inference DIR mkldnn) endif() +if(WITH_IPU) + pass_library(forward_graph_extract_pass base DIR ipu) + pass_library(optimizer_extract_pass base DIR ipu) + pass_library(optimizer_state_align_pass base DIR ipu) + pass_library(ipu_graph_builder_pass base DIR ipu) + pass_library(ipu_runtime_replacer_pass base DIR ipu) + pass_library(inference_process_pass base DIR ipu) + pass_library(inference_postprocess_pass base DIR ipu) + pass_library(popart_canonicalization_pass base DIR ipu) + pass_library(ipu_inplace_pass base DIR ipu) + pass_library(infer_shape_pass base DIR ipu) + pass_library(delete_scale_op_pass base DIR ipu) + pass_library(avg_shard_pass base DIR ipu) + pass_library(transfer_cast_op_pass base DIR ipu) +endif() + cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_bn_add_act_pass SRCS fuse_bn_add_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index b2ab6bed36c3afbe99c8debd8547784fb455475f..83bed2a97baa7453ac84039405ad43a20a12a4bd 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -603,9 +603,9 @@ static std::vector<std::vector<ir::Node::Dep>> GetOpDependencies( for (const auto *op_desc : block_ops) { size_t op_idx = op_id_to_idx.size(); PADDLE_ENFORCE_EQ( - op_id_to_idx.emplace(op_desc->Id(), op_idx).second, true, + op_id_to_idx.emplace(op_desc->OriginalId(), op_idx).second, true, platform::errors::InvalidArgument( - "There should not be duplicate op id: %d", op_desc->Id())); + "There should not be duplicate op id: %d", op_desc->OriginalId())); } std::vector<std::vector<ir::Node::Dep>> dep_matrix(op_num); @@ -624,9 +624,9 @@
static std::vector<std::vector<ir::Node::Dep>> GetOpDependencies( for (const auto &pair : all_preceding_ops) { const auto *cur_op_node = pair.first; - size_t op_idx_1 = get_op_idx_by_id(cur_op_node->Op()->Id()); + size_t op_idx_1 = get_op_idx_by_id(cur_op_node->Op()->OriginalId()); for (const auto *preceding_op_node : pair.second) { - size_t op_idx_2 = get_op_idx_by_id(preceding_op_node->Op()->Id()); + size_t op_idx_2 = get_op_idx_by_id(preceding_op_node->Op()->OriginalId()); dep_matrix[op_idx_1][op_idx_2] = ir::Node::Dep::kAfter; dep_matrix[op_idx_2][op_idx_1] = ir::Node::Dep::kBefore; }
diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 45087036b5d17dc500d59f0413dd4f7223bc9e4c..32d3cdef4512bb072820970ed9db6d2d1289652b 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/platform/lodtensor_printer.h" #if defined PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #endif namespace paddle { @@ -136,7 +136,7 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, if (!root_var) { continue; } - if (root_var->IsType<SelectedRows>()) { + if (root_var->IsType<pten::SelectedRows>()) { continue; } LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index 9230c36a0c7450dc96304e5a0f773feabe610afa..3fe9e877658dad64bfeb4737f025ea73b54840f8 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -259,7 +259,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, auto var = var_name_item.second[i]; auto& var_name = new_ins[var_name_item.first].at(i); const Tensor* tensor_in; - if (var->IsType<LoDTensor>() || var->IsType<SelectedRows>()) { + if (var->IsType<LoDTensor>() || var->IsType<pten::SelectedRows>()) { tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var); } else if (var->IsType<LoDTensorArray>()) { tensor_in =
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index aea9ad20353966f3b9491d85129bfd62269cfcb0..f71a5b2c710cea76a4d5346b14af7e69b8215f95 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -676,8 +676,9 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) { operators::reader:: OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { // do nothing - } else if (var->IsType<SelectedRows>()) { - TensorRecordStream(*(var->GetMutable<SelectedRows>()->mutable_value())); + } else if (var->IsType<pten::SelectedRows>()) { + TensorRecordStream( + *(var->GetMutable<pten::SelectedRows>()->mutable_value())); } else if (var->IsType<LoDTensorArray>()) { auto* tensor_arr = var->GetMutable<LoDTensorArray>(); for (auto& tensor : *tensor_arr) {
diff --git a/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc b/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc index 7beefec4487de31d2fa558153b7a0522545def72..ba81ee9166fd655cf1c6b2b0bf14486d5c274143 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc @@ -76,10 +76,12 @@ void InterpreterCoreEventGarbageCollector::Add( } else if (var->IsType()) { // TODO(xiongkun03) in old executor, this type of variable is not support // eager deletion. so we just leave it here ?
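// NOTE(reviewer): hedged sketch of the eager-deletion step the hunks below
// apply to pten::SelectedRows: only the value tensor's allocation is handed
// to the collector and the row index is cleared in place. The garbage
// container type is a stand-in; MoveMemoryHolder/mutable_rows are used
// exactly as in this diff.
#include "paddle/fluid/framework/variable.h"
#include "paddle/pten/core/selected_rows.h"

template <typename GarbageList>  // e.g. the collector's queue of allocations
void CollectSelectedRows(paddle::framework::Variable* var,
                         GarbageList* garbages) {
  auto* sr = var->GetMutable<pten::SelectedRows>();
  // Only the allocation moves out; the SelectedRows object itself stays alive.
  garbages->emplace_back(sr->mutable_value()->MoveMemoryHolder());
  sr->mutable_rows()->clear();
}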
- } else if (var->IsType<SelectedRows>()) { - Add(var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder(), + } else if (var->IsType<pten::SelectedRows>()) { + Add(var->GetMutable<pten::SelectedRows>() + ->mutable_value() + ->MoveMemoryHolder(), event, ctx); - var->GetMutable<SelectedRows>()->mutable_rows()->clear(); + var->GetMutable<pten::SelectedRows>()->mutable_rows()->clear(); } else if (var->IsType<LoDTensorArray>()) { auto* tensor_arr = var->GetMutable<LoDTensorArray>(); for (auto& t : *tensor_arr) { @@ -132,4 +134,4 @@ void InterpreterCoreEventGarbageCollector::Free( } } // namespace framework -} // namespace paddle \ No newline at end of file +} // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.cc b/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.cc index 784cfca943ea1d88546e5d024bbdeaece2c55849..14fb8a9819b2dc4f1356b881150983937d691af6 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.cc @@ -32,9 +32,11 @@ void InterpreterCoreFastGarbageCollector::Add(Variable* var) { } else if (var->IsType()) { // TODO(xiongkun03) in old executor, this type of variable is not support // eager deletion. so we just leave it here ? - } else if (var->IsType<SelectedRows>()) { - Add(var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder()); - var->GetMutable<SelectedRows>()->mutable_rows()->clear(); + } else if (var->IsType<pten::SelectedRows>()) { + Add(var->GetMutable<pten::SelectedRows>() + ->mutable_value() + ->MoveMemoryHolder()); + var->GetMutable<pten::SelectedRows>()->mutable_rows()->clear(); } else if (var->IsType<LoDTensorArray>()) { auto* tensor_arr = var->GetMutable<LoDTensorArray>(); for (auto& t : *tensor_arr) {
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 214a1d728266b03d122ffc5fdf36d4617612f22b..0371b12d009f3f15cfd649c143a81032484f49f2 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -468,8 +468,8 @@ void build_op_func_list(const platform::Place& place, if (var->IsType<LoDTensor>()) { garbages->emplace_back( var->GetMutable<LoDTensor>()->MoveMemoryHolder()); - } else if (var->IsType<SelectedRows>()) { - garbages->emplace_back(var->GetMutable<SelectedRows>() + } else if (var->IsType<pten::SelectedRows>()) { + garbages->emplace_back(var->GetMutable<pten::SelectedRows>() ->mutable_value() ->MoveMemoryHolder()); } else if (var->IsType<LoDTensorArray>()) {
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index fb29e18887b4ee74b448323fb1d14409212e9f71..6c5e98489ef5a8db4f163cc31d888e900bdbb582 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -18,7 +18,7 @@ #include #include "paddle/fluid/framework/new_executor/new_executor_defs.h" -#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/pten/core/utils/rw_lock.h" // When in inference scenario, the scopes will not be written by two threads in // a mean time, but a scope may be read by multiple threads concurrently, and @@ -171,9 +171,9 @@ void InterpretercoreInferShapeContext::ShareDim(const std::string& in, platform::errors::InvalidArgument( "The type of input (%s) and output (%s) are inconsistent.", in, out)); - if (in_var->IsType<SelectedRows>()) { - auto& in_sele_rows = in_var->Get<SelectedRows>(); - auto out_sele_rows = out_var->GetMutable<SelectedRows>(); + if (in_var->IsType<pten::SelectedRows>()) { + auto& in_sele_rows = in_var->Get<pten::SelectedRows>(); + auto out_sele_rows = out_var->GetMutable<pten::SelectedRows>(); out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims());
out_sele_rows->set_rows(in_sele_rows.rows()); out_sele_rows->set_height(in_sele_rows.height()); @@ -392,8 +392,8 @@ DDim InterpretercoreInferShapeContext::GetDim(Variable* var) const { var, platform::errors::InvalidArgument("Input variable is nullptr.")); if (var->IsType()) { return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Only LoDTensor or SelectedRows support 'GetDim', but input " @@ -420,8 +420,8 @@ std::vector InterpretercoreInferShapeContext::GetRepeatedDims( void InterpretercoreInferShapeContext::SetDim(Variable* var, const DDim& dim) { if (var->IsType()) { var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW(platform::errors::Unimplemented( "Variable type error, expect LoDTensor or SelectedRows, but received " diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 0ef85a25a237b5b97f4bba32dc28a436a5336174..b61b8af1e4a1b38f3db686e3b438aaf7745ed3c0 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -19,10 +19,10 @@ #include #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/device_event_base.h" #include "paddle/fluid/platform/event.h" +#include "paddle/pten/core/utils/rw_lock.h" // When in inference scenario, the scopes will not be written by two threads in // a mean time, but a scope may be read by multiple threads concurrently, and diff --git a/paddle/fluid/framework/op_call_stack.h b/paddle/fluid/framework/op_call_stack.h index f633538e700b242469bce6d76dfb58e89f9cdbe8..e4fd66fee2d732e51351b050c852aefa6cdb6001 100644 --- a/paddle/fluid/framework/op_call_stack.h +++ b/paddle/fluid/framework/op_call_stack.h @@ -19,12 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/platform/enforce.h" -namespace paddle { -namespace platform { -struct EnforceNotMet; -} // namespace platform -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc index fb2d23a5513b4fee64276ec5880ffe7729d2f500..a22adacd31a91c966fba3f77fbf914a987c409a8 100644 --- a/paddle/fluid/framework/op_proto_maker_test.cc +++ b/paddle/fluid/framework/op_proto_maker_test.cc @@ -18,12 +18,6 @@ limitations under the License. */ #include "gtest/gtest-test-part.h" #include "gtest/gtest.h" -namespace paddle { -namespace platform { -struct EnforceNotMet; -} // namespace platform -} // namespace paddle - class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { public: void Make() { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 33a4e5d2f390611a3f079bff3232a1bd5f7b3ac0..087a817d03af1c5bffd15965071dc48b4a299e9f 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -31,6 +31,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler.h" #include "paddle/pten/common/scalar.h" #include "paddle/pten/common/scalar_array.h" +#include "paddle/pten/ops/compat/signatures.h" namespace pten { class DenseTensor; @@ -77,11 +78,11 @@ static DDim GetDimsDebug(const ScopeBase& scope, const std::string& name, if (var->IsType()) { const LoDTensor& tensor = var->Get(); return tensor.dims(); - } else if (var->IsType()) { + } else if (var->IsType()) { if (get_actual_dim) { - return var->Get().value().dims(); + return var->Get().value().dims(); } else { - return var->Get().GetCompleteDims(); + return var->Get().GetCompleteDims(); } } else if (var->IsType()) { return DDim({static_cast(var->Get().size())}); @@ -108,8 +109,8 @@ static std::string GetDtype(const ScopeBase& scope, const std::string& name) { return ""; } return DataTypeToString(tensor.type()); - } else if (var->IsType()) { - auto tensor = var->Get().value(); + } else if (var->IsType()) { + auto tensor = var->Get().value(); if (UNLIKELY(!tensor.IsInitialized())) { return "uninited"; } else { @@ -139,8 +140,8 @@ static std::string GetPlace(const ScopeBase& scope, const std::string& name) { return ""; } return to_string(tensor.place()); - } else if (var->IsType()) { - auto tensor = var->Get().value(); + } else if (var->IsType()) { + auto tensor = var->Get().value(); if (UNLIKELY(!tensor.IsInitialized())) { return "uninited"; } else { @@ -157,8 +158,8 @@ static int GetRowSize(const ScopeBase& scope, const std::string& name) { return -1; } - if (var->IsType()) { - return var->Get().rows().size(); + if (var->IsType()) { + return var->Get().rows().size(); } return -1; @@ -497,8 +498,8 @@ void OperatorBase::GenerateTemporaryNames() { const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { if (var.IsType()) { return static_cast(&(var.Get())); - } else if (var.IsType()) { - return &(var.Get().value()); + } else if (var.IsType()) { + return &(var.Get().value()); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Variable type is %s, expect LoDTensor or SelectedRows.", @@ -509,8 +510,8 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) { if (var->IsType()) { return var->GetMutable(); - } else if (var->IsType()) { - return var->GetMutable()->mutable_value(); + } else if (var->IsType()) { + return var->GetMutable()->mutable_value(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Variable type is %s, expect LoDTensor or SelectedRows.", @@ -741,9 +742,9 @@ class RuntimeInferShapeContext : public InferShapeContext { "The type of input (%s) and output (%s) are inconsistent.", in, out)); - if (in_var->IsType()) { - auto& in_sele_rows = in_var->Get(); - auto out_sele_rows = out_var->GetMutable(); + if (in_var->IsType()) { + auto& in_sele_rows = in_var->Get(); + auto out_sele_rows = out_var->GetMutable(); out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims()); out_sele_rows->set_rows(in_sele_rows.rows()); out_sele_rows->set_height(in_sele_rows.height()); @@ -950,8 +951,8 @@ class RuntimeInferShapeContext : public InferShapeContext { var, platform::errors::InvalidArgument("Input variable is nullptr.")); if (var->IsType()) { return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Only LoDTensor or SelectedRows support 'GetDim', but input " @@ -976,8 
+977,8 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetDim(Variable* var, const DDim& dim) { if (var->IsType()) { var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW(platform::errors::Unimplemented( "Variable type error, expect LoDTensor or SelectedRows, but received " @@ -1086,6 +1087,13 @@ bool OperatorWithKernel::CanMKLDNNBeUsed(const framework::ExecutionContext& ctx, return use_mkldnn_ctx && this->SupportsMKLDNN(data_type); } +void OperatorWithKernel::InferShape(InferShapeContext* ctx) const { + PADDLE_THROW(platform::errors::PermissionDenied( + "The default InferShape function of OperatorWithKernel is not allowed to " + "be called, please override corresponding InferShape function in the " + "specific operator.")); +} + void OperatorWithKernel::RuntimeInferShape(const Scope& scope, const platform::Place& place, const RuntimeContext& ctx) const { @@ -1342,6 +1350,16 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { kernel_iter = kernels.find(expected_kernel_key); } #endif +#ifdef PADDLE_WITH_IPU + if (kernel_iter == kernels.end() && + platform::is_ipu_place(expected_kernel_key.place_)) { + VLOG(3) << "missing IPU kernel: " << type_ + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } +#endif #ifdef PADDLE_WITH_ASCEND_CL if (kernel_iter == kernels.end() && platform::is_npu_place(expected_kernel_key.place_)) { @@ -1646,8 +1664,8 @@ void OperatorWithKernel::ParseInputDataType( t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &(var->Get().value()); + } else if (var->IsType()) { + t = &(var->Get().value()); } else if (var->IsType()) { auto t_arr = &var->Get(); for (size_t j = 0; j < t_arr->size(); j++) { @@ -1728,8 +1746,8 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely( t = var->GetMutable(); } else if (var->IsType()) { t = var->GetMutable(); - } else if (var->IsType()) { - t = var->GetMutable()->mutable_value(); + } else if (var->IsType()) { + t = var->GetMutable()->mutable_value(); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported input variable type in complex type promotion.")); @@ -1784,8 +1802,10 @@ OpKernelType OperatorWithKernel::GetKernelTypeForVar( KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs( const ExecutionContext& ctx) const { - return KernelSignatureMap::Instance().Get( - pten::TransToPtenKernelName(Type())); + InitDefaultKernelSignatureMap(); + ExecutionArgumentMappingContext arg_mapping_ctx(ctx); + return pten::OpUtilsMap::Instance().GetArgumentMappingFn(Type())( + arg_mapping_ctx); } Scope* OperatorWithKernel::PreparePtenData( diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 40c80ec5f2d654b57a72290398e323e1ce91e156..c280eeaa0fa5713bf52679996bbe2b3f7ac22473 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -41,6 +41,7 @@ limitations under the License. 
*/ #include "paddle/utils/flat_hash_map.h" #include "paddle/pten/core/compat/arg_map_context.h" +#include "paddle/pten/core/compat/op_utils.h" #include "paddle/pten/core/kernel_context.h" #include "paddle/pten/core/kernel_factory.h" @@ -117,7 +118,7 @@ inline std::string GradOriginalVarName(const std::string& grad_var_name) { } inline bool VarIsTensor(const Variable& var) { - return var.IsType() || var.IsType(); + return var.IsType() || var.IsType(); } const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var); @@ -468,12 +469,11 @@ class ExecutionArgumentMappingContext : public pten::ArgumentMappingContext { } bool IsDenseTensorInput(const std::string& name) const override { - return ctx_.InputVar(name)->IsType() || - ctx_.InputVar(name)->IsType(); + return ctx_.InputVar(name)->IsType(); } bool IsSelectedRowsInput(const std::string& name) const override { - return ctx_.InputVar(name)->IsType(); + return ctx_.InputVar(name)->IsType(); } private: @@ -550,7 +550,7 @@ class OperatorWithKernel : public OperatorBase { bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx, proto::VarType::Type data_type) const; - virtual void InferShape(InferShapeContext* ctx) const = 0; + virtual void InferShape(InferShapeContext* ctx) const; void RuntimeInferShape(const Scope& scope, const platform::Place& place, const RuntimeContext& ctx) const override; diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index df7e3c4f6dde3b7ff8eb7d9a199f11fca45a034e..ef6c41990cd6e243cd5d7d062722ccd1555e9591 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -456,7 +456,7 @@ TEST(IndicateVarDataTypeTest, selectedrows) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); auto* var = scope.Var("selected_rows_1"); - var->GetMutable(); + var->GetMutable(); bool caught = false; try { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 54167d95899d6fab81a8657b167012b47bf950ea..535c9ab58e295fae2048bb162adfb0384745d0ae 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -38,12 +38,12 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/pten/core/utils/rw_lock.h" namespace paddle { namespace framework { @@ -75,7 +75,7 @@ const CinnCompiledObject& CinnCompiler::Compile( bool exist = false; { - AutoRDLock r_guard{&rwlock_}; + pten::AutoRDLock r_guard{&rwlock_}; exist = cache_by_address_.count(cur_key_by_address) != 0; // if cannot find graph by address, checkout whether the graph structure // have been stored in cache. 
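// NOTE(reviewer): the locking discipline kept by these CinnCompiler hunks is
// plain double-checked caching with pten's RAII guards; a minimal sketch with
// placeholder cache/key types and a hypothetical Compute() step.
#include <cstdint>
#include <map>
#include "paddle/pten/core/utils/rw_lock.h"

int Compute(int64_t key);  // hypothetical expensive step, done outside locks

int LookupOrCompute(pten::RWLock* rwlock, std::map<int64_t, int>* cache,
                    int64_t key) {
  {
    pten::AutoRDLock r_guard{rwlock};  // shared: many readers may probe
    auto it = cache->find(key);
    if (it != cache->end()) return it->second;
  }
  int result = Compute(key);
  {
    pten::AutoWRLock w_guard{rwlock};  // exclusive: publish the result
    cache->emplace(key, result);  // emplace keeps the first value on a race
  }
  return result;
}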
@@ -96,13 +96,13 @@ const CinnCompiledObject& CinnCompiler::Compile( std::int64_t compiled_num = real_compiled_num_.fetch_add(1); auto compiled_res = CompileGraph(graph, input_tensors, target, compiled_num, stream); - AutoWRLock w_guard{&rwlock_}; + pten::AutoWRLock w_guard{&rwlock_}; if (!cache_by_struct_.count(cur_key_by_struct)) { cache_by_address_[cur_key_by_address] = compiled_res.get(); cache_by_struct_[cur_key_by_struct] = std::move(compiled_res); } } - AutoRDLock guard{&rwlock_}; + pten::AutoRDLock guard{&rwlock_}; const auto& cached_boj = *cache_by_address_[cur_key_by_address]; return cached_boj; } @@ -198,7 +198,7 @@ std::string CinnCompiler::ReadableKey( void CinnCompiler::Clear() { { - AutoWRLock guard{&rwlock_}; + pten::AutoWRLock guard{&rwlock_}; graphs_.clear(); cache_by_address_.clear(); cache_by_struct_.clear(); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index 5070eb5ce5674dfc5803c61a1eb38117432fb4c1..024dd26747b8e7db9eec15fd2998cefaeeb931fb 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -26,9 +26,9 @@ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/pten/core/utils/rw_lock.h" namespace paddle { @@ -102,7 +102,7 @@ class CinnCompiler { std::unique_ptr, CinnCacheKey::Hash> cache_by_struct_; std::atomic_int64_t real_compiled_num_{0}; - mutable RWLock rwlock_; + mutable pten::RWLock rwlock_; DISABLE_COPY_AND_ASSIGN(CinnCompiler); }; diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index 2fd5b87b7f3fd2cfc655ca6112ef33bddedb59cf..dc20aaffec9ca7abce0096fe1d948d043cc5e044 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/pten_utils.h" +#include "paddle/pten/core/compat/op_utils.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/kernel_factory.h" @@ -89,48 +90,6 @@ pten::KernelKey TransOpKernelTypeToPtenKernelKey( return pten::KernelKey(backend, layout, dtype); } -KernelSignatureMap* KernelSignatureMap::kernel_signature_map_ = nullptr; -std::once_flag KernelSignatureMap::init_flag_; - -KernelSignatureMap& KernelSignatureMap::Instance() { - std::call_once(init_flag_, [] { - kernel_signature_map_ = new KernelSignatureMap(); - for (const auto& pair : OpInfoMap::Instance().map()) { - const auto& op_type = pair.first; - const auto* op_proto = pair.second.proto_; - if (pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type) && - op_proto) { - KernelArgsNameMakerByOpProto maker(op_proto); - VLOG(10) << "Register kernel signature for " << op_type; - auto success = kernel_signature_map_->map_ - .emplace(pten::TransToPtenKernelName(op_type), - std::move(maker.GetKernelSignature())) - .second; - PADDLE_ENFORCE_EQ( - success, true, - platform::errors::PermissionDenied( - "Kernel signature of the operator %s has been registered.", - op_type)); - } - } - }); - return *kernel_signature_map_; -} - -bool KernelSignatureMap::Has(const std::string& op_type) const { - return map_.find(op_type) != map_.end(); -} - -const KernelSignature& KernelSignatureMap::Get( - const std::string& op_type) const { - auto it = map_.find(op_type); - PADDLE_ENFORCE_NE( - it, map_.end(), - platform::errors::NotFound( - "Operator `%s`'s kernel signature is not registered.", op_type)); - return it->second; -} - const paddle::SmallVector& KernelArgsNameMakerByOpProto::GetInputArgsNames() { for (int i = 0; i < op_proto_->inputs_size(); ++i) { @@ -196,6 +155,24 @@ KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { GetOutputArgsNames()); } +std::once_flag kernel_sig_map_init_flag; + +void InitDefaultKernelSignatureMap() { + std::call_once(kernel_sig_map_init_flag, [] { + for (const auto& pair : paddle::framework::OpInfoMap::Instance().map()) { + const auto& op_type = pair.first; + const auto* op_proto = pair.second.proto_; + if (pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type) && + op_proto) { + paddle::framework::KernelArgsNameMakerByOpProto maker(op_proto); + VLOG(10) << "Register kernel signature for " << op_type; + pten::DefaultKernelSignatureMap::Instance().Insert( + op_type, std::move(maker.GetKernelSignature())); + } + } + }); +} + void SetAllocationForOutputTenosr(pten::DenseTensor* tensor, const platform::Place& place) { if (!tensor->IsInitialized() || !(tensor->place() == place)) { diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h index ab129c6313dabfecf3d7cd1968b66485e48ec211..9b1019f65823774d315b12c14c307b416ca9ff70 100644 --- a/paddle/fluid/framework/pten_utils.h +++ b/paddle/fluid/framework/pten_utils.h @@ -44,26 +44,6 @@ pten::KernelKey TransOpKernelTypeToPtenKernelKey( /* Kernel Args parse */ -// TODO(chenweihang): we can generate this map by proto info in compile time -class KernelSignatureMap { - public: - static KernelSignatureMap& Instance(); - - bool Has(const std::string& op_type) const; - - const KernelSignature& Get(const std::string& op_type) const; - - private: - KernelSignatureMap() = default; - DISABLE_COPY_AND_ASSIGN(KernelSignatureMap); - - private: - static KernelSignatureMap* kernel_signature_map_; - static std::once_flag init_flag_; - - paddle::flat_hash_map map_; -}; - 
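// NOTE(reviewer): net effect of this pten_utils change, sketched with only
// names that appear in the patch: the fluid-side KernelSignatureMap singleton
// is gone, and lookup goes through the idempotent initializer plus pten's
// argument-mapping registry. The mapping fn is assumed non-null here;
// BuildInferMetaContext enforces NotFound otherwise.
#include <string>
#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/pten/core/compat/op_utils.h"

auto SignatureFor(const std::string& op_type,
                  const pten::ArgumentMappingContext& ctx) {
  paddle::framework::InitDefaultKernelSignatureMap();  // std::call_once inside
  auto arg_map_fn = pten::OpUtilsMap::Instance().GetArgumentMappingFn(op_type);
  return arg_map_fn(ctx);
}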
class KernelArgsNameMaker { public: virtual ~KernelArgsNameMaker() {} @@ -72,6 +52,8 @@ class KernelArgsNameMaker { virtual const paddle::SmallVector& GetAttrsArgsNames() = 0; }; +void InitDefaultKernelSignatureMap(); + void SetAllocationForOutputTenosr(pten::DenseTensor* tensor, const platform::Place& place); @@ -86,5 +68,12 @@ struct ConvertToPtenContext { using TYPE = pten::CPUContext; }; +#ifdef PADDLE_WITH_XPU +template <> +struct ConvertToPtenContext { + using TYPE = pten::XPUContext; +}; +#endif + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h deleted file mode 100644 index 9b74a55304077c6c13a55f36ea8cf3b6dfbe5b9c..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/rw_lock.h +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#if !defined(_WIN32) -#include -#else -#include // NOLINT -#endif // !_WIN32 - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { - -#if !defined(_WIN32) -struct RWLock { - RWLock() { pthread_rwlock_init(&lock_, nullptr); } - - ~RWLock() { pthread_rwlock_destroy(&lock_); } - - inline void RDLock() { - PADDLE_ENFORCE_EQ( - pthread_rwlock_rdlock(&lock_), 0, - platform::errors::External("The pthread failed to acquire read lock.")); - } - - inline void WRLock() { - PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, - platform::errors::External( - "The pthread failed to acquire write lock.")); - } - - inline void UNLock() { - PADDLE_ENFORCE_EQ( - pthread_rwlock_unlock(&lock_), 0, - platform::errors::External("The pthread failed to unlock.")); - } - - private: - pthread_rwlock_t lock_; -}; -// TODO(paddle-dev): Support RWLock for WIN32 for correctness. -#else -// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive -// In windows, rw_lock seems like a hack. Use empty object and do nothing. 
-struct RWLock { - // FIXME(minqiyang): use mutex here to do fake lock - inline void RDLock() { mutex_.lock(); } - - inline void WRLock() { mutex_.lock(); } - - inline void UNLock() { mutex_.unlock(); } - - private: - std::mutex mutex_; -}; -#endif - -class AutoWRLock { - public: - explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } - - ~AutoWRLock() { UnLock(); } - - private: - inline void Lock() { lock_->WRLock(); } - - inline void UnLock() { lock_->UNLock(); } - - private: - RWLock* lock_; -}; - -class AutoRDLock { - public: - explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } - - ~AutoRDLock() { UnLock(); } - - private: - inline void Lock() { lock_->RDLock(); } - - inline void UnLock() { lock_->UNLock(); } - - private: - RWLock* lock_; -}; - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/rw_lock_test.cc b/paddle/fluid/framework/rw_lock_test.cc deleted file mode 100644 index d140e95a37d84fe34397e06092a3ec89c8dc8435..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/rw_lock_test.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/rw_lock.h" - -#include -#include // NOLINT - -namespace f = paddle::framework; - -void f1(f::RWLock *lock) { - lock->RDLock(); - lock->UNLock(); -} - -TEST(RWLOCK, read_read) { - f::RWLock lock; - lock.RDLock(); - std::thread t1(f1, &lock); - std::thread t2(f1, &lock); - t1.join(); - t2.join(); - lock.UNLock(); -} - -void f2(f::RWLock *lock, std::vector *result) { - lock->RDLock(); - ASSERT_EQ(result->size(), 0UL); - lock->UNLock(); -} - -void f3(f::RWLock *lock, std::vector *result) { - lock->WRLock(); - result->push_back(1); - lock->UNLock(); -} - -TEST(RWLOCK, read_write) { - f::RWLock lock; - std::vector result; - - lock.RDLock(); - std::thread t1(f2, &lock, &result); - t1.join(); - std::thread t2(f3, &lock, &result); - std::this_thread::sleep_for(std::chrono::seconds(1)); - ASSERT_EQ(result.size(), 0UL); - lock.UNLock(); - t2.join(); - ASSERT_EQ(result.size(), 1UL); -} - -void f4(f::RWLock *lock, std::vector *result) { - lock->RDLock(); - ASSERT_EQ(result->size(), 1UL); - lock->UNLock(); -} - -TEST(RWLOCK, write_read) { - f::RWLock lock; - std::vector result; - - lock.WRLock(); - std::thread t1(f4, &lock, &result); - std::this_thread::sleep_for(std::chrono::seconds(1)); - result.push_back(1); - lock.UNLock(); - t1.join(); -} diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index b2062cc51206a98889a5b584239791483b1722a4..e6a372a8e631f92bee69dfd705d23b0ea56678ac 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -34,10 +34,10 @@ PADDLE_DEFINE_EXPORTED_bool( #define SCOPE_VARS_READER_LOCK #define SCOPE_VARS_WRITER_LOCK #else -#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_); -#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_); -#define SCOPE_VARS_READER_LOCK AutoRDLock 
auto_lock(&vars_lock_); -#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); +#define SCOPE_KIDS_READER_LOCK pten::AutoRDLock auto_lock(&kids_lock_); +#define SCOPE_KIDS_WRITER_LOCK pten::AutoWRLock auto_lock(&kids_lock_); +#define SCOPE_VARS_READER_LOCK pten::AutoRDLock auto_lock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK pten::AutoWRLock auto_lock(&vars_lock_); #endif namespace paddle { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index b963c28d597bbb3614ccb00c4124123879dc0c84..7eb6082ce15fea2575c12d643329fe2a8bc555d7 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -26,9 +26,9 @@ extern "C" { #include #include -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/pten/core/utils/rw_lock.h" namespace paddle { namespace framework { @@ -194,8 +194,8 @@ class Scope : public ScopeBase { #ifndef PADDLE_ON_INFERENCE private: - mutable RWLock kids_lock_; - mutable RWLock vars_lock_; + mutable pten::RWLock kids_lock_; + mutable pten::RWLock vars_lock_; #endif }; diff --git a/paddle/fluid/framework/selected_rows_utils.cc b/paddle/fluid/framework/selected_rows_utils.cc index c33ee655c2a98b73b517c922895f494f443dfd90..a1bffcfce19f1a0a8c9eaf954f174299790f5384 100644 --- a/paddle/fluid/framework/selected_rows_utils.cc +++ b/paddle/fluid/framework/selected_rows_utils.cc @@ -17,73 +17,8 @@ limitations under the License. */ namespace paddle { namespace framework { -struct ReAllocateVisitor { - ReAllocateVisitor(const framework::DDim& dims, framework::Tensor* tensor) - : dims_(dims), tensor_(tensor) {} - - template - void operator()() const { - framework::Tensor cpu_tensor; - platform::CPUPlace cpu; - T* ptr = cpu_tensor.mutable_data(dims_, cpu); - const T* old_ptr = - tensor_->memory_size() == 0 ? 
nullptr : tensor_->data(); - if (old_ptr != nullptr) { - std::copy(old_ptr, old_ptr + tensor_->numel(), ptr); - } - tensor_->ShareDataWith(cpu_tensor); - } - - framework::DDim dims_; - framework::Tensor* tensor_; -}; - -struct TensorCopyVisitor { - TensorCopyVisitor(framework::Tensor* dst, int64_t dst_offset, - const framework::Tensor src, int64_t src_offset, - int64_t size) - : dst_(dst), - dst_offset_(dst_offset), - src_(src), - src_offset_(src_offset), - size_(size) {} - - template - void apply() const { - // TODO(Yancey1989): support other place - platform::CPUPlace cpu; - memory::Copy(cpu, dst_->mutable_data(cpu) + dst_offset_, cpu, - src_.data() + src_offset_, size_ * sizeof(T)); - } - - framework::Tensor* dst_; - int64_t dst_offset_; - framework::Tensor src_; - int64_t src_offset_; - int64_t size_; -}; - -struct TensorFillVisitor { - TensorFillVisitor(framework::Tensor* dst, int64_t dst_offset, int64_t size, - float value) - : dst_(dst), dst_offset_(dst_offset), size_(size) {} - - template - void apply() const { - // TODO(qiao): support other place - platform::CPUPlace cpu; - auto* tensor_data = dst_->mutable_data(cpu); - auto* start = tensor_data + dst_offset_; - auto* end = start + size_; - std::fill(start, end, static_cast(0.0)); - } - - framework::Tensor* dst_; - int64_t dst_offset_; - int64_t size_; -}; - -void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, +void SerializeToStream(std::ostream& os, + const pten::SelectedRows& selected_rows, const platform::DeviceContext& dev_ctx) { { // the 1st field, uint32_t version constexpr uint32_t version = 0; @@ -107,7 +42,8 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, TensorToStream(os, selected_rows.value(), dev_ctx); } -void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) { +void SerializeToStream(std::ostream& os, + const pten::SelectedRows& selected_rows) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; auto place = selected_rows.place(); @@ -115,14 +51,15 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) { SerializeToStream(os, selected_rows, *dev_ctx); } -void DeserializeFromStream(std::istream& os, SelectedRows* selected_rows) { +void DeserializeFromStream(std::istream& os, + pten::SelectedRows* selected_rows) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; dev_ctx = pool.Get(platform::CPUPlace()); DeserializeFromStream(os, selected_rows, *dev_ctx); } -void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, +void DeserializeFromStream(std::istream& is, pten::SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx) { { // the 1st field, unit32_t version for SelectedRows @@ -151,109 +88,5 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, // the 4st field, tensor which contains the data TensorFromStream(is, selected_rows->mutable_value(), dev_ctx); } - -bool SelectedRows::HasKey(int64_t key) const { - return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? 
false - : true; -} - -int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown, - bool is_test) { - if (is_test) { - auto iter = id_to_index_.find(key); - if (iter == id_to_index_.end()) { - return -1; - } else { - return iter->second; - } - } - - rwlock_->RDLock(); - auto iter = id_to_index_.find(key); - if (iter == id_to_index_.end()) { - rwlock_->UNLock(); - PADDLE_ENFORCE_EQ( - auto_grown, true, - platform::errors::NotFound("Input key(%lld) is not found.", key)); - rwlock_->WRLock(); - auto map_size = id_to_index_.size(); - auto vector_size = rows_.size(); - if (map_size != vector_size) { - rwlock_->UNLock(); - PADDLE_THROW(platform::errors::InvalidArgument( - "Row map size(%zu) should be equal to rows size(%zu).", map_size, - vector_size)); - } - auto write_iter = id_to_index_.find(key); - if (write_iter == id_to_index_.end()) { - int row_num = rows_.size(); - if (row_num == value_->dims()[0]) { - rwlock_->UNLock(); - PADDLE_THROW(platform::errors::InvalidArgument( - "Selected rows is full, then length exceed the length of first " - "dimension (%d).", - row_num)); - } - // key logic to put a key into id_to_index_ - rows_.push_back(key); - auto index = static_cast(rows_.size() - 1); - id_to_index_[key] = index; - rwlock_->UNLock(); - return index; - } else { - auto index = write_iter->second; - rwlock_->UNLock(); - return index; - } - } else { - auto index = iter->second; - rwlock_->UNLock(); - return index; - } -} - -void SelectedRows::SyncIndex() { - rwlock_->WRLock(); - id_to_index_.clear(); - for (size_t i = 0; i < rows_.size(); ++i) { - id_to_index_[rows_[i]] = i; - } - rwlock_->UNLock(); -} - -void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, - bool auto_grown, bool is_test) { - PADDLE_ENFORCE_EQ(value->IsInitialized(), true, - platform::errors::InvalidArgument( - "The value tensor is not initialized.")); - if (ids.numel() == 0) { - VLOG(3) << "keys is empty, please check data!"; - } else { - int64_t value_width = value_->numel() / value_->dims()[0]; - PADDLE_ENFORCE_EQ( - value_width, value->numel() / value->dims()[0], - platform::errors::InvalidArgument( - "Output tensor should have the same shape with table " - "except the first dimmension, excepted value width not counting " - "the first dimension is %d, actual value width is %d.", - value_width, value->numel() / value->dims()[0])); - for (int i = 0; i < ids.numel(); ++i) { - auto id = ids.data()[i]; - int64_t index = AutoGrownIndex(id, auto_grown, is_test); - if (index < 0) { - VLOG(5) << "id " << id << " not in the table, return 0"; - framework::VisitDataType( - value_->type(), - TensorFillVisitor(value, i * value_width, value_width, 0.0)); - } else { - framework::VisitDataType( - value_->type(), - TensorCopyVisitor(value, i * value_width, *value_.get(), - index * value_width, value_width)); - } - } - } -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/selected_rows_utils.h b/paddle/fluid/framework/selected_rows_utils.h index 445f446ef2f4aecac496250a1269514f1faa037b..e1b26f2bbafa3f8ed3c372010c02c44d08c81066 100644 --- a/paddle/fluid/framework/selected_rows_utils.h +++ b/paddle/fluid/framework/selected_rows_utils.h @@ -21,153 +21,28 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/rw_lock.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/place.h" +#include "paddle/pten/core/selected_rows.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace framework { - -class SelectedRows { - /* - * @brief We can use the SelectedRows structure to reproduce a sparse table. - * A sparse table is a key-value structure that the key is an `int64_t`, - * and the value is a Tensor which the first dimension is 0. - * You can use the following interface to operate the sparse table, and you - * can find - * some detail information from the comments of each interface: - * - * HasKey(key), whether the sparse table has the specified key. - * Set(key, value), set a key-value pair into the sparse table. - * Get(keys, value*), get value by given key list and apply it to the given - * value pointer - * with the specified offset. - * - */ - public: - SelectedRows(const std::vector& rows, const int64_t& height) - : rows_(rows), height_(height) { - value_.reset(new Tensor()); - rwlock_.reset(new RWLock); - } - - SelectedRows() { - height_ = 0; - value_.reset(new Tensor()); - rwlock_.reset(new RWLock); - } - - const platform::Place& place() const { return value_->place(); } - - const Tensor& value() const { return *value_; } - - Tensor* mutable_value() { return value_.get(); } - - int64_t height() const { return height_; } - - void set_height(int64_t height) { height_ = height; } - - const Vector& rows() const { return rows_; } - - Vector* mutable_rows() { return &rows_; } - - void set_rows(const Vector& rows) { rows_ = rows; } - - /* - * @brief Get the index of key in rows - * - * @return -1 if the key does not exists. - */ - int64_t Index(int64_t key) const { - auto it = std::find(rows_.begin(), rows_.end(), key); - if (it == rows_.end()) { - PADDLE_THROW(platform::errors::NotFound( - "Input id (%lld) is not in current rows table.", key)); - } - return static_cast(std::distance(rows_.begin(), it)); - } - - /* - * @brief whether has the specified key in the table. - * - * @return true if the key is exists. - */ - bool HasKey(int64_t key) const; - - /* - * @brief Get value by the key list. - * Note!!! this interface is only used when selected_rows is used as - * parameters - * for distribute lookup table. - * - * @return a list of pair which contains the non-exists key and the index in - * the value - */ - void Get(const framework::Tensor& ids, framework::Tensor* value, - bool auto_grown = false, bool is_test = false); - - /* - * @brief Get the index of the key from id_to_index_ map. If the key not - * exist, - * add the key into id_to_index_. - * - * Note!!! this interface is only used when selected_rows is used as - * parameters - * for distribute lookup table. - * - * @return index of the key. - */ - int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false); - - /* - * @brief Get the index of the key from id_to_index_ map. 
- */ - inline int64_t GetIndexFromId(int64_t key) const { - auto iter = id_to_index_.find(key); - if (iter == id_to_index_.end()) { - return -1; - } else { - return iter->second; - } - } - - void SyncIndex(); - /* - * @brief Get complete Dims before - */ - DDim GetCompleteDims() const { - std::vector dims = vectorize(value_->dims()); - dims[0] = height_; - return make_ddim(dims); - } - - private: - // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. - // SelectedRows are simply concated when adding together. Until a - // SelectedRows add a Tensor, will the duplicate rows be handled. - Vector rows_; - std::unordered_map - id_to_index_; // should not be used when rows_ has duplicate member - std::unique_ptr value_{nullptr}; - int64_t height_; // height indicates the underline tensor's height - std::unique_ptr rwlock_{nullptr}; -}; - /* * Serialize/Desiralize SelectedRows to std::ostream * You can pass ofstream or ostringstream to serilize to file * or to a in memory string. GPU tensor will be copied to CPU. */ -void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, +void SerializeToStream(std::ostream& os, + const pten::SelectedRows& selected_rows, const platform::DeviceContext& dev_ctx); -void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, +void DeserializeFromStream(std::istream& is, pten::SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx); -void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows); +void SerializeToStream(std::ostream& os, + const pten::SelectedRows& selected_rows); -void DeserializeFromStream(std::istream& os, SelectedRows* selected_rows); +void DeserializeFromStream(std::istream& os, pten::SelectedRows* selected_rows); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/selected_rows_utils_test.cc b/paddle/fluid/framework/selected_rows_utils_test.cc index 7a9f86041d996eed67d836e52d89d5e57cc740c3..9a14f4395d9a196af67598714e3679c9d11d2289 100644 --- a/paddle/fluid/framework/selected_rows_utils_test.cc +++ b/paddle/fluid/framework/selected_rows_utils_test.cc @@ -24,7 +24,7 @@ class SelectedRowsTester : public ::testing::Test { std::vector rows{0, 4, 7}; int64_t height = 10; int64_t row_numel = 100; - selected_rows_.reset(new SelectedRows(rows, height)); + selected_rows_.reset(new pten::SelectedRows(rows, height)); Tensor* value = selected_rows_->mutable_value(); auto* data = value->mutable_data( @@ -36,7 +36,7 @@ class SelectedRowsTester : public ::testing::Test { protected: platform::CPUPlace place_; - std::unique_ptr selected_rows_{nullptr}; + std::unique_ptr selected_rows_{nullptr}; }; TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); } @@ -50,7 +50,7 @@ TEST_F(SelectedRowsTester, complete_dims) { } TEST_F(SelectedRowsTester, SerializeAndDeseralize) { - SelectedRows dst_tensor; + pten::SelectedRows dst_tensor; platform::CPUDeviceContext cpu_ctx(place_); std::ostringstream oss; @@ -71,7 +71,7 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) { TEST(SelectedRows, SparseTable) { platform::CPUPlace cpu; - SelectedRows table; + pten::SelectedRows table; int64_t table_size = 100; int64_t embedding_width = 8; @@ -124,7 +124,7 @@ TEST(SelectedRows, SparseTable) { } } -void f1(SelectedRows* table, int table_size) { +void f1(pten::SelectedRows* table, int table_size) { for (int i = 1000000; i > 0; --i) { auto id = i % table_size; int64_t index1 = table->AutoGrownIndex(id, true); @@ -135,7 +135,7 @@ void 
f1(SelectedRows* table, int table_size) { } } -void f2(SelectedRows* table, int table_size) { +void f2(pten::SelectedRows* table, int table_size) { for (int i = 0; i < 1000000; ++i) { auto id = i % table_size; int64_t index1 = table->AutoGrownIndex(id, true); @@ -146,7 +146,7 @@ void f2(SelectedRows* table, int table_size) { } } -void f3(SelectedRows* table, int table_size) { +void f3(pten::SelectedRows* table, int table_size) { clock_t t1 = clock(); for (int i = 100000; i > 0; --i) { auto id1 = table->AutoGrownIndex(i % table_size, true); @@ -157,7 +157,7 @@ void f3(SelectedRows* table, int table_size) { std::cout << "f3 run time:" << t2 - t1 << std::endl; } -void f4(SelectedRows* table, int table_size) { +void f4(pten::SelectedRows* table, int table_size) { clock_t t1 = clock(); for (int i = 0; i < 100000; ++i) { auto id1 = table->AutoGrownIndex(i % table_size, true); @@ -170,7 +170,7 @@ void f4(SelectedRows* table, int table_size) { TEST(SelectedRows, MultiThreadAutoIndex) { platform::CPUPlace cpu; - SelectedRows table; + pten::SelectedRows table; int64_t table_size = 100000; int64_t embedding_width = 8; diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 86bf2d8ac413e388ebba81ccf5e08edd891224e5..fe376a5669c984e439fcb8b93de25b96462d21de 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -21,8 +21,8 @@ limitations under the License. */ #include #include #include -#include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/platform/variant.h" +#include "paddle/pten/core/type_defs.h" #include "paddle/utils/small_vector.h" namespace paddle { @@ -39,14 +39,6 @@ class InferNoNeedBufferVarsFN; using VariableNameMap = std::map>; using VariableValueMap = std::map>; -// The order should be as same as framework.proto -using Attribute = boost::variant< - boost::blank, int, float, std::string, std::vector, std::vector, - std::vector, bool, std::vector, BlockDesc*, int64_t, - std::vector, std::vector, std::vector>; - -using AttributeMap = std::unordered_map; - #ifdef PADDLE_WITH_ASCEND_CL using NPUAttribute = boost::variant, diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 5747df57c456854674515c1f653e4958fc9b57b4..dd1e329ac03231300cd63bd02f828c680203de6a 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -57,7 +57,7 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) { visitor(var.Get()); return; case proto::VarType::SELECTED_ROWS: - visitor(var.Get()); + visitor(var.Get()); return; case proto::VarType::READER: visitor(var.Get()); diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 008b6829f9fe374600f837a8f51ee82130ab1ac5..ac55abaad8d0a77d1b4decad733e32f51a994bc4 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -49,6 +49,7 @@ namespace pten { class DenseTensor; +class SelectedRows; } // namespace pten // Users should add forward declarations here @@ -76,7 +77,6 @@ class LoDRankTable; class ScopeBase; class ReaderHolder; class Scope; -class SelectedRows; } // namespace framework namespace operators { @@ -166,7 +166,7 @@ struct VarTypeRegistryImpl { // Users should add other variable types below. // Paddle would generate unique Ids for each registered variable types. 
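// A hedged sketch of what this registration guarantees (VarTypeTrait and the
// REG_PROTO_VAR_TYPE_TRAIT binding appear just below):
//
//   static_assert(VarTypeTrait<pten::SelectedRows>::kId ==
//                     proto::VarType::SELECTED_ROWS,
//                 "pten::SelectedRows must keep the SELECTED_ROWS proto id");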
 using VarTypeRegistry = detail::VarTypeRegistryImpl<
-    Tensor, SelectedRows, std::vector<Scope *>, LoDRankTable, Strings,
+    Tensor, pten::SelectedRows, std::vector<Scope *>, LoDRankTable, Strings,
     LoDTensorArray, platform::PlaceList, ReaderHolder, String, Scope *,
     operators::reader::LoDTensorBlockingQueueHolder, FetchList, FeedList,
     operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
@@ -206,7 +206,7 @@ struct VarTypeTrait {
 // Users should set some of variable type ids to be what is defined in
 // framework.proto below
 REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR);
-REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS);
+REG_PROTO_VAR_TYPE_TRAIT(pten::SelectedRows, proto::VarType::SELECTED_ROWS);
 REG_PROTO_VAR_TYPE_TRAIT(std::vector<Scope *>, proto::VarType::STEP_SCOPES);
 REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE);
 REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY);
diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc
index 812a34112a465a57687c0420edf1cef8ee760abc..bc418363bf737df2ed558a320c1b39582439815d 100644
--- a/paddle/fluid/framework/var_type_traits_test.cc
+++ b/paddle/fluid/framework/var_type_traits_test.cc
@@ -92,7 +92,7 @@ bool CheckVarId(int proto_id) {
 TEST(var_type_traits, check_proto_type_id) {
   ASSERT_TRUE(CheckVarId<LoDTensor>(proto::VarType::LOD_TENSOR));
-  ASSERT_TRUE(CheckVarId<SelectedRows>(proto::VarType::SELECTED_ROWS));
+  ASSERT_TRUE(CheckVarId<pten::SelectedRows>(proto::VarType::SELECTED_ROWS));
   ASSERT_TRUE(CheckVarId<std::vector<Scope *>>(proto::VarType::STEP_SCOPES));
   ASSERT_TRUE(CheckVarId<LoDRankTable>(proto::VarType::LOD_RANK_TABLE));
   ASSERT_TRUE(CheckVarId<LoDTensorArray>(proto::VarType::LOD_TENSOR_ARRAY));
diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h
index 188b00d818de3df8ee88790dcf681d287f85833b..52bf3a12a043f63f6d370f528ac55368aed63527 100644
--- a/paddle/fluid/framework/variable.h
+++ b/paddle/fluid/framework/variable.h
@@ -72,7 +72,7 @@ class Variable {
  private:
   // This method hides type T, so it doesn't appear as a template parameter of
   // Variable.
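// Keeping T out of the public signature matters: callers see one type-erased
// counter regardless of whether the holder is a DenseTensor or the value
// tensor inside a pten::SelectedRows, as the branches in the definition
// below show.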
- pten::TensorInplaceVersion* InplaceVersionCounter(); + pten::DenseTensor::InplaceVersion* InplaceVersionCounter(); public: void SetInplaceVersionToZero(); @@ -114,8 +114,8 @@ class Variable { std::shared_ptr holder_; }; -inline pten::TensorInplaceVersion* Variable::InplaceVersionCounter() { - pten::TensorInplaceVersion* version_counter_ptr(nullptr); +inline pten::DenseTensor::InplaceVersion* Variable::InplaceVersionCounter() { + pten::DenseTensor::InplaceVersion* version_counter_ptr(nullptr); if (IsType()) { version_counter_ptr = &GetMutable()->InplaceVersionCounter(); @@ -123,8 +123,8 @@ inline pten::TensorInplaceVersion* Variable::InplaceVersionCounter() { version_counter_ptr = &GetMutable()->InplaceVersionCounter(); - } else if (IsType()) { - version_counter_ptr = &GetMutable() + } else if (IsType()) { + version_counter_ptr = &GetMutable() ->mutable_value() ->InplaceVersionCounter(); } else { diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc index 34ab07def54c18f8377636a7990052712f215ab8..3c71987303bd40ac76f16221b9bbef134df29196 100644 --- a/paddle/fluid/framework/variable_helper.cc +++ b/paddle/fluid/framework/variable_helper.cc @@ -31,7 +31,7 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) { if (var_type == proto::VarType::LOD_TENSOR) { var->GetMutable(); } else if (var_type == proto::VarType::SELECTED_ROWS) { - var->GetMutable(); + var->GetMutable(); } else if (var_type == proto::VarType::FEED_MINIBATCH) { var->GetMutable(); } else if (var_type == proto::VarType::FETCH_LIST) { @@ -70,9 +70,9 @@ void CopyVariable(const Variable &src_var, Variable *dst_var) { auto &src_tensor = src_var.Get(); tmp_grad_tensor->set_lod(src_tensor.lod()); framework::TensorCopy(src_tensor, cpu_place, tmp_grad_tensor); - } else if (src_var.IsType()) { - auto &src_slr = src_var.Get(); - auto *tmp_grad_slr = dst_var->GetMutable(); + } else if (src_var.IsType()) { + auto &src_slr = src_var.Get(); + auto *tmp_grad_slr = dst_var->GetMutable(); tmp_grad_slr->set_rows(src_slr.rows()); tmp_grad_slr->set_height(src_slr.height()); auto &src_t = src_slr.value(); diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index d1d6a0f5adf581498ec52cf21ea7c1f762a3b446..0f105ec9a308232ad4c006c208c04981839459ed 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -39,8 +39,8 @@ static const platform::Place &GetVarPlace(const framework::Variable &src) { if (src.IsType()) { return src.Get().place(); #if NCCL_VERSION_CODE >= 2212 - } else if (src.IsType()) { - return src.Get().value().place(); + } else if (src.IsType()) { + return src.Get().value().place(); #endif } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -70,8 +70,7 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, } #if NCCL_VERSION_CODE >= 2212 -static void AllReduce(const framework::SelectedRows &src, - framework::SelectedRows *dst, +static void AllReduce(const pten::SelectedRows &src, pten::SelectedRows *dst, const ParallelStrategy &strategy, const gpuStream_t stream, const platform::NCCLComm *comm) { @@ -191,19 +190,18 @@ void AllReduce(const framework::Variable &src, framework::Variable *dst, AllReduce(src.Get(), dst->GetMutable(), stream, comm); #if NCCL_VERSION_CODE >= 2212 - } else if (src.IsType()) { + } else if (src.IsType()) { if (&src != dst) { - if (!dst->IsType()) { + if (!dst->IsType()) { dst->Clear(); } - AllReduce(src.Get(), - dst->GetMutable(), strategy, 
stream, - comm); + AllReduce(src.Get(), + dst->GetMutable(), strategy, stream, comm); } else { // SelectedRows cannot be allreduce in-place framework::Variable tmp_dst; - AllReduce(src.Get(), - tmp_dst.GetMutable(), strategy, stream, + AllReduce(src.Get(), + tmp_dst.GetMutable(), strategy, stream, comm); // stream must synchronize to ensure accuracy of the move operation platform::GpuStreamSync(stream); diff --git a/paddle/fluid/imperative/dygraph_grad_maker.h b/paddle/fluid/imperative/dygraph_grad_maker.h index c50018f8236037d344448b18321827b3004c86ed..e1931a3b0f2489798ec935f03d83502b2cdb239f 100644 --- a/paddle/fluid/imperative/dygraph_grad_maker.h +++ b/paddle/fluid/imperative/dygraph_grad_maker.h @@ -365,12 +365,12 @@ class TracedGradOp { var_wrapper->MutableVar()->CurrentInplaceVersion()) { return var_wrapper; } else if (var_wrapper->MutableVar()->IsType() || - var_wrapper->MutableVar()->IsType()) { + var_wrapper->MutableVar()->IsType()) { auto* tensor = var_wrapper->MutableVar()->IsType() ? var_wrapper->MutableVar()->GetMutable() : var_wrapper->MutableVar() - ->GetMutable() + ->GetMutable() ->mutable_value(); if (!tensor->IsInitialized()) { return var_wrapper; diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index 1eaf0c6538043ff274b8a30f8618373deea771b0..44315e267ee78d3bccbb808529269063f3a206c5 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -72,18 +72,18 @@ void GLOOParallelContext::AllReduceByStream(const framework::Variable &src, } AllReduce(src.Get(), dst->GetMutable()); - } else if (src.IsType()) { + } else if (src.IsType()) { if (&src != dst) { - if (!dst->IsType()) { + if (!dst->IsType()) { dst->Clear(); } - AllReduce(src.Get(), - dst->GetMutable()); + AllReduce(src.Get(), + dst->GetMutable()); } else { // SelectedRows cannot be allreduce in-place framework::Variable tmp_dst; - AllReduce(src.Get(), - tmp_dst.GetMutable()); + AllReduce(src.Get(), + tmp_dst.GetMutable()); *dst = std::move(tmp_dst); } } else { @@ -120,8 +120,8 @@ void GLOOParallelContext::AllReduce(const framework::Tensor &src_tensor, break; \ } -void GLOOParallelContext::AllReduce(const framework::SelectedRows &src, - framework::SelectedRows *dst) { +void GLOOParallelContext::AllReduce(const pten::SelectedRows &src, + pten::SelectedRows *dst) { // auto ; // int local_rank = strategy_.local_rank_; int nranks = strategy_.nranks_; diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h index f13bb859eee93691510df27bc3449330344773b3..d63d48eac7e02b4dadf674a314570875b436bdd4 100644 --- a/paddle/fluid/imperative/gloo_context.h +++ b/paddle/fluid/imperative/gloo_context.h @@ -59,8 +59,7 @@ class GLOOParallelContext : public ParallelContext { private: void AllReduce(const framework::Tensor& src, framework::Tensor* dst); - void AllReduce(const framework::SelectedRows& src, - framework::SelectedRows* dst); + void AllReduce(const pten::SelectedRows& src, pten::SelectedRows* dst); private: std::unique_ptr device_; diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 092872247cca56ae90bb6bcf8870de79c2535c11..9ae8b75075a1a724c31a8e03e071912ba140715a 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -55,12 +55,12 @@ static void MoveOrCopyVar(framework::Variable* dst, framework::Variable* src, auto* dst_tensor = dst->GetMutable(); 
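// TensorCopy moves only the tensor data; LoD metadata is not carried over by
// the copy itself, hence the explicit set_lod() on the next line.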
framework::TensorCopy(src_tensor, src_tensor.place(), dst_tensor); dst_tensor->set_lod(src_tensor.lod()); - } else if (src->IsType()) { - auto& src_selected_rows = src->Get(); - if (!dst->IsType()) { + } else if (src->IsType()) { + auto& src_selected_rows = src->Get(); + if (!dst->IsType()) { dst->Clear(); } - auto* dst_selected_rows = dst->GetMutable(); + auto* dst_selected_rows = dst->GetMutable(); framework::TensorCopy(src_selected_rows.value(), src_selected_rows.value().place(), dst_selected_rows->mutable_value()); @@ -243,6 +243,13 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { "should be equal, Otherwise, the calculation results " "will be incorrect.")); +#ifdef PADDLE_WITH_XPU + // if src and dst are in different place, copy dst to src's place + if (dst_tensor->place() != place) { + paddle::framework::TensorCopySync(*dst_tensor, place, dst_tensor); + } +#endif + #define PADDLE_TENSOR_ADD(cpp_type) \ if (data_type == framework::DataTypeTrait::DataType()) { \ TensorAddFunctor func( \ @@ -332,7 +339,7 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { void SelectedRowsAddToTensor(const framework::Variable& src, framework::Variable* dst) { auto* dst_tensor = dst->GetMutable(); - auto& src_selected_rows = src.Get(); + auto& src_selected_rows = src.Get(); auto place = dst_tensor->place(); auto data_type = src_selected_rows.value().type(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -371,7 +378,7 @@ static void SelectedRowsAddTensor( const framework::Variable& src_tensor_var, framework::Variable* dst_tensor_var) { const auto& src_selected_rows = - src_selected_rows_var.Get(); + src_selected_rows_var.Get(); const auto& src_tensor = src_tensor_var.Get(); const auto& place = src_tensor.place(); auto data_type = src_tensor.type(); @@ -414,18 +421,18 @@ static void SelectedRowsAddTensor( // to one then add it to a empty selected rows, the after is correct std::shared_ptr SelectedRowsMerge( const framework::Variable& src1, const framework::Variable& src2) { - auto& src_selected_rows1 = src1.Get(); - auto& src_selected_rows2 = src2.Get(); + auto& src_selected_rows1 = src1.Get(); + auto& src_selected_rows2 = src2.Get(); auto place = src_selected_rows1.value().place(); auto data_type = src_selected_rows1.value().type(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - std::vector src_selected_rows; + std::vector src_selected_rows; src_selected_rows.emplace_back(&src_selected_rows1); src_selected_rows.emplace_back(&src_selected_rows2); auto dst_var = std::make_shared("Temp"); auto* dst_selected_rows = - dst_var->MutableVar()->GetMutable(); + dst_var->MutableVar()->GetMutable(); #define PADDLE_SELECTED_ROWS_ADD(dev_ctx_type, cpp_type) \ if (data_type == framework::DataTypeTrait::DataType()) { \ @@ -463,7 +470,7 @@ void VariableWrapperAdd(std::shared_ptr var, if (dst->IsType()) { if (src.IsType()) { TensorAdd(src, dst); - } else if (src.IsType()) { + } else if (src.IsType()) { SelectedRowsAddToTensor(src, dst); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -481,7 +488,7 @@ void VariableWrapperAdd(std::shared_ptr var, SelectedRowsAddToTensor(*dst, src_mutable); *dst = std::move(*(var->MutableVar())); } - } else if (src.IsType()) { + } else if (src.IsType()) { auto temp = SelectedRowsMerge(src, *dst); *dst = std::move(*(temp->MutableVar())); } else { @@ -497,8 +504,8 @@ static platform::Place GetPlaceOfVar( platform::Place place; if (var->Var().IsType()) { place = 
var->Var().Get().place(); - } else if (var->Var().IsType()) { - place = var->Var().Get().place(); + } else if (var->Var().IsType()) { + place = var->Var().Get().place(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "only support LoDTensor and SelectedRows in dygraph")); @@ -530,14 +537,14 @@ void GradientAccumulator::AccumulateGrad() { if (dst->IsType()) { if (src->IsType()) { TensorAdd(*src, dst); - } else if (src->IsType()) { + } else if (src->IsType()) { SelectedRowsAddToTensor(*src, dst); } - } else if (dst->IsType()) { + } else if (dst->IsType()) { if (src->IsType()) { SelectedRowsAddToTensor(*dst, src); *dst = std::move(*src); - } else if (src->IsType()) { + } else if (src->IsType()) { auto temp = SelectedRowsMerge(*src, *dst); *dst = std::move(*(temp->MutableVar())); } @@ -657,7 +664,7 @@ void EagerGradientAccumulator::SumGrad(std::shared_ptr var, // so synchronous VariableWrapper with Variable. if (dst_var->Var().IsType()) { dst_var->SetType(framework::proto::VarType::LOD_TENSOR); - } else if (dst_var->Var().IsType()) { + } else if (dst_var->Var().IsType()) { dst_var->SetType(framework::proto::VarType::SELECTED_ROWS); } @@ -701,7 +708,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, if (paddle::platform::is_gpu_place(place)) { // sum selected rows firstly for (auto& var_info : tmp_grad_vars_) { - if (!var_info.var->Var().IsType()) { + if (!var_info.var->Var().IsType()) { continue; } @@ -744,7 +751,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, } PADDLE_ENFORCE_EQ( var_info.var->Var().IsType() || - var_info.var->Var().IsType(), + var_info.var->Var().IsType(), true, platform::errors::PermissionDenied("The type of Gradient " "var must be LoDTensor " "or SelectedRows")); @@ -789,7 +796,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, if (dst_var->Var().IsType()) { dst_var->SetType(framework::proto::VarType::LOD_TENSOR); - } else if (dst_var->Var().IsType()) { + } else if (dst_var->Var().IsType()) { dst_var->SetType(framework::proto::VarType::SELECTED_ROWS); } } diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index 6411dce4405c11795418fb8334e26b32079e7596..8896e5d0f406447f524bcdd9215db30d6d2ecc28 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -31,7 +31,7 @@ class GradientAccumulator { if (var && var->Var().IsInitialized()) { if (var->Var().IsType()) { var->SetType(framework::proto::VarType::LOD_TENSOR); - } else if (var->Var().IsType()) { + } else if (var->Var().IsType()) { var->SetType(framework::proto::VarType::SELECTED_ROWS); } else { PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h index 71f7fb7387effe68ae63d5a3c5236e9a9a108d2f..a39e58bba90110c122a666f97a4cf0911284e4a8 100644 --- a/paddle/fluid/imperative/infer_shape_context.h +++ b/paddle/fluid/imperative/infer_shape_context.h @@ -196,8 +196,8 @@ class DygraphInferShapeContext : public framework::InferShapeContext { auto* out_lod_tensor = out_var->GetMutable(); out_lod_tensor->Resize(in_lod_tensor.dims()); } else { - auto& in_sele_rows = in_var->Get(); - auto out_sele_rows = out_var->GetMutable(); + auto& in_sele_rows = in_var->Get(); + auto out_sele_rows = out_var->GetMutable(); out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims()); out_sele_rows->set_rows(in_sele_rows.rows()); 
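// A SelectedRows output shares more than the value tensor's shape: the rows
// set just above and the height set just below are part of its dims as well.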
out_sele_rows->set_height(in_sele_rows.height()); @@ -365,8 +365,8 @@ class DygraphInferShapeContext : public framework::InferShapeContext { "Input variable should not be null")); if (var->IsType()) { return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); } else { PADDLE_THROW(platform::errors::PermissionDenied( "Only LoDTensor/SelectedRows support 'GetDim', but Variables " @@ -382,8 +382,8 @@ class DygraphInferShapeContext : public framework::InferShapeContext { void SetDim(framework::Variable* var, const DDim& dim) { if (var->IsType()) { var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW(platform::errors::PermissionDenied( "Variable type_id %s, expect LoDTensor/SelectedRows.")); diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index f47b024973ba7899ebf5040a09702f5bab83fe32..65720c8a3cf6578f0c35a7b79be78fde14c1a9cf 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -105,9 +105,9 @@ static std::string DebugString( ss << "NOT_INITED"; } ss << ">"; - } else if (var.IsType()) { + } else if (var.IsType()) { ss << "SelectedRows<"; - auto& selected_rows = var.Get(); + auto& selected_rows = var.Get(); auto& tensor = selected_rows.value(); auto& rows = selected_rows.rows(); if (tensor.IsInitialized()) { @@ -188,9 +188,8 @@ size_t VarBase::GradOpNum() const { void VarBase::ClearGradient(bool set_to_zero) { VLOG(4) << "ClearGradient " << Name(); if (grad_var_) { - if (grad_var_->Var().IsType()) { - auto* grad_t = - grad_var_->MutableVar()->GetMutable(); + if (grad_var_->Var().IsType()) { + auto* grad_t = grad_var_->MutableVar()->GetMutable(); if (grad_t->mutable_value()->IsInitialized()) { #ifdef PADDLE_WITH_MKLDNN if (FLAGS_use_mkldnn) platform::ClearMKLDNNCache(grad_t->place()); @@ -248,7 +247,7 @@ std::shared_ptr VarBase::NewVarBase(const platform::Place& dst_place, const bool blocking) const { PADDLE_ENFORCE_EQ( Var().IsInitialized() && (Var().IsType() || - Var().IsType()), + Var().IsType()), true, platform::errors::InvalidArgument( "Variable is not initialized or Variable's type is not " "LoDTensor or SelectedRows when getting numpy tensor")); @@ -277,12 +276,12 @@ std::shared_ptr VarBase::NewVarBase(const platform::Place& dst_place, << dst_place; return new_var; } else { - auto& src_selected_rows = Var().Get(); + auto& src_selected_rows = Var().Get(); auto new_var = std::make_shared( false, "Itmp" + std::to_string(copied_counter_++)); new_var->SetType(framework::proto::VarType::SELECTED_ROWS); auto* dst_selected_rows = - new_var->MutableVar()->GetMutable(); + new_var->MutableVar()->GetMutable(); framework::TensorCopy(src_selected_rows.value(), dst_place, dst_selected_rows->mutable_value()); @@ -346,10 +345,9 @@ void VarBase::CopyFrom(const VarBase& src, const bool blocking) { dst_tensor->Resize(src_tensor.dims()); } framework::TensorCopy(src_tensor, place, dst_tensor); - } else if (src.Var().IsType()) { - auto& src_selected_rows = src.Var().Get(); - auto* dst_selected_rows = - MutableVar()->GetMutable(); + } else if (src.Var().IsType()) { + auto& src_selected_rows = src.Var().Get(); + auto* dst_selected_rows = MutableVar()->GetMutable(); dst_selected_rows->set_height(src_selected_rows.height()); dst_selected_rows->set_rows(src_selected_rows.rows()); diff --git 
a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index fe60f05e1da431dc7ed7b45acebb8cffecc12941..d9a21c9247b9363b0f1cbcdf6c8d62bb6242c183 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/profiler.h" DECLARE_bool(check_nan_inf); DECLARE_bool(run_pten_kernel); @@ -47,8 +48,8 @@ const std::shared_ptr& GetVariableWrapper( const framework::Tensor* GetTensorFromVar(const framework::Variable& var) { if (var.IsType()) { return &(var.Get()); - } else if (var.IsType()) { - return &(var.Get().value()); + } else if (var.IsType()) { + return &(var.Get().value()); } else { return nullptr; } @@ -369,6 +370,10 @@ static void BuildDygraphPtenKernelContext( size_t end_idx = start_idx + outs_vector.size(); for (size_t offset = 0; offset < outs_vector.size(); ++offset) { + if (outs_vector[offset] == nullptr) { + kernel_ctx->EmplaceBackOutputWithoutSetRange({nullptr}); + continue; + } auto* var = outs_vector[offset]->MutableVar(); framework::Tensor* tensor_out = nullptr; if (var->template IsType()) { @@ -501,12 +506,21 @@ static void PreparedOpRunImpl( // TODO(zjl): remove scope in dygraph framework::Scope scope; - DygraphInferShapeContext infer_shape_ctx( - &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); - op.Info().infer_shape_(&infer_shape_ctx); + { + platform::RecordEvent record_event(op.Type() + " infer_shape", + platform::EventRole::kInnerOp); + DygraphInferShapeContext infer_shape_ctx( + &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); + op.Info().infer_shape_(&infer_shape_ctx); + } + + { + platform::RecordEvent record_event(op.Type() + " compute", + platform::EventRole::kInnerOp); - func(DygraphExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, - attrs, default_attrs)); + func(DygraphExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, + attrs, default_attrs)); + } if (FLAGS_check_nan_inf) { framework::details::CheckOpHasNanOrInfInDygraph( @@ -547,18 +561,27 @@ static void PreparedOpRunPtImpl( const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - DygraphInferShapeContext infer_shape_ctx( - &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); - op.Info().infer_shape_(&infer_shape_ctx); + { + platform::RecordEvent record_event(op.Type() + " infer_shape", + platform::EventRole::kInnerOp); + DygraphInferShapeContext infer_shape_ctx( + &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); + op.Info().infer_shape_(&infer_shape_ctx); + } - PreparePtenData(pt_kernel, pt_kernel_signature, ins); + { + platform::RecordEvent record_event(op.Type() + " compute", + platform::EventRole::kInnerOp); - pten::KernelContext pt_kernel_context; - BuildDygraphPtenKernelContext(pt_kernel_signature, pt_kernel, ins, - outs, attrs, default_attrs, dev_ctx, - &pt_kernel_context); + PreparePtenData(pt_kernel, pt_kernel_signature, ins); - pt_kernel(&pt_kernel_context); + pten::KernelContext pt_kernel_context; + BuildDygraphPtenKernelContext(pt_kernel_signature, pt_kernel, ins, + outs, attrs, default_attrs, dev_ctx, + &pt_kernel_context); + + pt_kernel(&pt_kernel_context); + } if (FLAGS_benchmark) { dev_ctx->Wait(); diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 
ad518eb96062d29a1c8f8f9f25a5c49c48c27b04..54e27b2bd8c313eaa3df016b48ee17957fd833f2 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -36,8 +36,7 @@ namespace imperative { void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { framework::Tensor *tensor = is_sparse_ - ? sparse_contents_->GetMutable() - ->mutable_value() + ? sparse_contents_->GetMutable()->mutable_value() : dense_contents_.GetMutable(); if (platform::is_gpu_place(tensor->place())) { @@ -775,7 +774,7 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { auto var_base = vars_[var_index]->GradVarBase(); // need to check tensor type PADDLE_ENFORCE_EQ( - var_base->Var().IsType(), true, + var_base->Var().IsType(), true, platform::errors::PreconditionNotMet( "The sparse parameter[%d][%s] must have a selectedrows gradient. " "Before forward pass, the parameter type is inferred to be " @@ -995,8 +994,8 @@ bool Reducer::HasGrad(size_t var_index) { if (var.Get().IsInitialized()) { return true; } - } else if (var.IsType()) { - if (var.Get().value().IsInitialized()) { + } else if (var.IsType()) { + if (var.Get().value().IsInitialized()) { return true; } } else { diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc index 0a7df9953ad45a3d1f93a09af88d34046b0c9776..25ffab470646b3e69e02e049967f540adb776a08 100644 --- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc +++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc @@ -15,6 +15,7 @@ #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/gradient_accumulator.h" @@ -29,8 +30,8 @@ namespace imperative { void TensorAdd(const framework::Variable& src, framework::Variable* dst); -template -int TensorddTest(Place place, T t1, T t2) { +template +int TensorddTest(Place1 place1, Place2 place2, T t1, T t2) { framework::Variable var1; framework::Variable var2; std::vector src_data(10, t1); @@ -46,18 +47,25 @@ int TensorddTest(Place place, T t1, T t2) { auto* dst = var2.GetMutable(); src->Resize(framework::make_ddim(dims)); dst->Resize(framework::make_ddim(dims)); - auto* src_mutable = src->mutable_data(place); - auto* dst_mutable = dst->mutable_data(place); - if (!std::is_same::value) { - paddle::memory::Copy(place, src_mutable, src_place, src_data.data(), + auto* src_mutable = src->mutable_data(place1); + auto* dst_mutable = dst->mutable_data(place2); + + if (!std::is_same::value) { + paddle::memory::Copy(place1, src_mutable, src_place, src_data.data(), sizeof(T) * src_data.size()); - paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(), - sizeof(T) * dst_data.size()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else { - paddle::memory::Copy(place, src_mutable, src_place, src_data.data(), + paddle::memory::Copy(place1, src_mutable, src_place, src_data.data(), sizeof(T) * src_data.size(), 0); - paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(), +#endif + } + + if (!std::is_same::value) { + paddle::memory::Copy(place2, dst_mutable, src_place, dst_data.data(), + sizeof(T) * dst_data.size()); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + } else { + paddle::memory::Copy(place2, dst_mutable, src_place, dst_data.data(), sizeof(T) * dst_data.size(), 0); #endif } @@ -80,25 +88,64 @@ TEST(test_add_functor, add_functor) { platform::CPUPlace cpu_place; int cpu_res = 1; - cpu_res = 
TensorddTest(cpu_place, 1.0, 0.0); + + // float32 + cpu_res = TensorddTest(cpu_place, cpu_place, static_cast(1.0), + static_cast(2.0)); EXPECT_EQ(cpu_res, 0); - cpu_res = TensorddTest(cpu_place, static_cast(1.0), - static_cast(2.0)); + // float16 + cpu_res = + TensorddTest(cpu_place, cpu_place, static_cast(1.0), + static_cast(2.0)); EXPECT_EQ(cpu_res, 0); - cpu_res = TensorddTest(cpu_place, static_cast(1.0), - static_cast(2.0)); + +#ifndef PADDLE_WITH_XPU + // does not support double when compiled using xpu + cpu_res = TensorddTest(cpu_place, cpu_place, static_cast(1.0), + static_cast(2.0)); EXPECT_EQ(cpu_res, 0); +#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) int gpu_res = 1; - gpu_res = TensorddTest(gpu_place, 1.0, 0.0); + gpu_res = TensorddTest(gpu_place, gpu_place, 1.0, 0.0); EXPECT_EQ(gpu_res, 0); - gpu_res = TensorddTest(gpu_place, static_cast(1.0), + gpu_res = TensorddTest(gpu_place, gpu_place, static_cast(1.0), static_cast(2.0)); EXPECT_EQ(gpu_res, 0); - gpu_res = TensorddTest(gpu_place, static_cast(1.0), - static_cast(2.0)); + gpu_res = + TensorddTest(gpu_place, gpu_place, static_cast(1.0), + static_cast(2.0)); EXPECT_EQ(gpu_res, 0); #endif + +#ifdef PADDLE_WITH_XPU + platform::XPUPlace xpu_place(0); + int xpu_res = 1; + // normal + xpu_res = TensorddTest(xpu_place, xpu_place, static_cast(1.0), + static_cast(2.0)); + EXPECT_EQ(xpu_res, 0); + xpu_res = + TensorddTest(xpu_place, xpu_place, static_cast(1.0), + static_cast(2.0)); + EXPECT_EQ(xpu_res, 0); + // different places + xpu_res = TensorddTest(cpu_place, xpu_place, static_cast(1.0), + static_cast(2.0)); + EXPECT_EQ(xpu_res, 0); + xpu_res = TensorddTest(xpu_place, cpu_place, static_cast(1.0), + static_cast(2.0)); + EXPECT_EQ(xpu_res, 0); + xpu_res = + TensorddTest(cpu_place, xpu_place, static_cast(1.0), + static_cast(2.0)); + EXPECT_EQ(xpu_res, 0); + xpu_res = + TensorddTest(xpu_place, cpu_place, static_cast(1.0), + static_cast(2.0)); + EXPECT_EQ(xpu_res, 0); +#endif } TEST(test_add_functor, execption) { @@ -106,10 +153,11 @@ TEST(test_add_functor, execption) { platform::CUDAPlace cuda_place(0); platform::CPUPlace cpu_place; - ASSERT_ANY_THROW(TensorddTest(cpu_place, 1, 0)); + ASSERT_ANY_THROW(TensorddTest(cpu_place, cpu_place, 1, 0)); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, 1.0, 0.0)); - ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, + ASSERT_ANY_THROW( + TensorddTest(cuda_pinned_place, cuda_pinned_place, 1.0, 0.0)); + ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, cuda_pinned_place, static_cast(1.0), static_cast(2.0))); #endif @@ -124,8 +172,8 @@ static void CopyVar(const framework::Variable& var, auto* dst_tensor = dst.GetMutable(); framework::TensorCopySync(src_tensor, src_tensor.place(), dst_tensor); } else { - const auto& src_selected_rows = var.Get(); - auto* dst_selected_rows = dst.GetMutable(); + const auto& src_selected_rows = var.Get(); + auto* dst_selected_rows = dst.GetMutable(); dst_selected_rows->set_rows(src_selected_rows.rows()); dst_selected_rows->set_height(src_selected_rows.height()); framework::TensorCopySync(src_selected_rows.value(), @@ -148,8 +196,8 @@ static bool IsEqualVar(const framework::Variable& var1, framework::TensorCopySync(var2.Get(), platform::CPUPlace(), &t2); } else { - auto& s1 = var1.Get(); - auto& s2 = var2.Get(); + auto& s1 = var1.Get(); + auto& s2 = var2.Get(); if (s1.height() != s2.height()) { return false; @@ -166,9 +214,9 @@ static bool IsEqualVar(const framework::Variable& var1, return 
false; } - framework::TensorCopySync(var1.Get().value(), + framework::TensorCopySync(var1.Get().value(), platform::CPUPlace(), &t1); - framework::TensorCopySync(var2.Get().value(), + framework::TensorCopySync(var2.Get().value(), platform::CPUPlace(), &t2); } @@ -211,7 +259,7 @@ static framework::Variable RandomSelectedRows(framework::DDim dims, dims[0] = row_number; framework::Variable ret; - auto* sr = ret.GetMutable(); + auto* sr = ret.GetMutable(); auto tensor_var = RandomTensor(dims, place, low, high); sr->mutable_value()->ShareDataWith( tensor_var.template Get()); diff --git a/paddle/fluid/imperative/tests/test_layer.cc b/paddle/fluid/imperative/tests/test_layer.cc index 064f47f54979a135fb83f9636ebc6f5105e7c39d..c54ed34bb8108afe76459445b3ce695d73ccd0ca 100644 --- a/paddle/fluid/imperative/tests/test_layer.cc +++ b/paddle/fluid/imperative/tests/test_layer.cc @@ -237,7 +237,7 @@ TEST(test_layer, test_debug_string) { std::shared_ptr selected_rows( new imperative::VarBase(false, "selected_rows")); auto tensor_sr = selected_rows->MutableVar() - ->GetMutable() + ->GetMutable() ->mutable_value(); std::string res_ui_sr = test_func(selected_rows); ASSERT_TRUE(res_ui_sr.find("NOT_INITED") != std::string::npos); diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index 5e269d74044d24adc7baea8875ecd9eb2d6772c1..b4ff3cff38217a57c0b1091c3e003043ca4c9673 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -101,7 +101,7 @@ const framework::Tensor* GetTensorFromVar(const framework::Variable& var); TEST(test_prepare_op, test_get_tensor_from_var) { std::shared_ptr vout_error( new imperative::VarBase(false, "vout_error")); - vout_error->MutableVar()->GetMutable(); + vout_error->MutableVar()->GetMutable(); auto* ts = GetTensorFromVar(*vout_error->MutableVar()); ASSERT_TRUE(ts != nullptr); } diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index f4e535de108a6a69dddd19ad4705c1b08e749e47..e845ce104534cd57ec232957cbbcce88addb60b9 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -32,6 +32,8 @@ namespace imperative { thread_local bool Tracer::has_grad_ = true; +thread_local AmpLevel Tracer::amp_level_ = AmpLevel::O0; + static std::shared_ptr g_current_tracer(nullptr); const std::shared_ptr& GetCurrentTracer() { return g_current_tracer; } diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 93f68f2054b9a85b65639ae6ddfdc1f7fc8911f8..bd8521dabde1f43371722bd7c8b0dc9c93787cc4 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -126,7 +126,7 @@ class Tracer { platform::Place expected_place_; GarbageCollectorMap gcs_; static thread_local bool has_grad_; - AmpLevel amp_level_{AmpLevel::O0}; + static thread_local AmpLevel amp_level_; }; // To access static variable current_tracer diff --git a/paddle/fluid/imperative/type_defs.h b/paddle/fluid/imperative/type_defs.h index 74fd152e72a5752af9becf729b3fde63fa6d9d35..a0258c7a8806fb4562102f7e681d292227bee5ae 100644 --- a/paddle/fluid/imperative/type_defs.h +++ b/paddle/fluid/imperative/type_defs.h @@ -13,47 +13,4 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once - -#include -#include -#include -#include - -namespace paddle { -namespace imperative { - -class VariableWrapper; -class SavedVariableWrapperList; -class VarBase; -class OpBase; -class GradOpNode; -class Tracer; - -using WeakNameVarBaseMap = - std::map>>; - -namespace details { -template -struct NameVarMapTrait {}; - -template <> -struct NameVarMapTrait { - using Type = std::map>>; -}; - -template <> -struct NameVarMapTrait { - using Type = std::map; -}; -} // namespace details - -template -using NameVarMap = typename details::NameVarMapTrait::Type; - -using NameVarBaseMap = NameVarMap; -using NameVariableWrapperMap = NameVarMap; - -using VariableWrapperList = std::vector>; - -} // namespace imperative -} // namespace paddle +#include "paddle/pten/core/type_defs.h" diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index c257191a546e439cedee0d2075549a45a3467423..bd96cd3f1aa1781b623c665d5263eaee0a3da244 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -104,8 +104,8 @@ class VariableWrapper { const framework::Tensor* tensor = nullptr; if (var_.IsType()) { tensor = &(var_.Get()); - } else if (var_.IsType()) { - tensor = &(var_.Get().value()); + } else if (var_.IsType()) { + tensor = &(var_.Get().value()); } else { PADDLE_THROW(platform::errors::PermissionDenied( "Only support LoDTensor and SelectedRows for gradient var")); @@ -153,7 +153,7 @@ class VariableWrapper { if (type_ == framework::proto::VarType::LOD_TENSOR) { tensor = &(var_.Get()); } else if (type_ == framework::proto::VarType::SELECTED_ROWS) { - tensor = &(var_.Get().value()); + tensor = &(var_.Get().value()); } else if (type_ == framework::proto::VarType::VOCAB) { const framework::Vocab* data = nullptr; data = &(var_.Get()); @@ -193,7 +193,7 @@ class VariableWrapper { if (type_ == framework::proto::VarType::LOD_TENSOR) { tensor = &(var_.Get()); } else if (type_ == framework::proto::VarType::SELECTED_ROWS) { - tensor = &(var_.Get().value()); + tensor = &(var_.Get().value()); } else { VLOG(6) << "Variable " << name_ << " is not initialized"; return place; diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 6ff25597125c5f0b13ee603bc17329a351074a8b..d731bfe139bac58050fdf79b420744551bfd17e8 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -36,6 +36,7 @@ endif() # fluid_modules exclude API-interface of inference/api and inference/capi_exp get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(pten_modules GLOBAL PROPERTY PTEN_MODULES) +set(utils_modules stringpiece pretty_log string_helper) add_subdirectory(api) @@ -46,9 +47,9 @@ set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) #TODO(wilber, T8T9): Do we still need to support windows gpu static library? 
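# The utils targets split out above (stringpiece, pretty_log, string_helper)
# are bundled into paddle_inference below alongside fluid_modules and
# pten_modules.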
if(WIN32 AND WITH_GPU) - cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API}) + cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules}) else() - create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API}) + create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules}) if(WITH_IPU) target_link_libraries(paddle_inference -Wl,--allow-multiple-definition popart_canonicalization_utils) endif() diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 175bc55dcff17e46aa47e1d2d187e3a8c8c4b43d..febfdec0b5cf500c30d44feccf4bed7e029feef4 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -282,6 +282,10 @@ struct Argument { DECL_ARGUMENT_FIELD(ipu_batch_size, IpuBatchSize, int); DECL_ARGUMENT_FIELD(ipu_need_avg_shard, IpuNeedAvgShard, bool); + // npu related + DECL_ARGUMENT_FIELD(use_npu, UseNpu, bool); + DECL_ARGUMENT_FIELD(npu_device_id, NPUDeviceId, int); + private: std::unordered_set valid_fields_; }; diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 06a353d5622a7093760c8680bcb8c1e245496ae8..daa18d8c78bf875ebcc6571bf955a7f634948e4f 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -22,16 +22,50 @@ namespace paddle { namespace inference { namespace analysis { -void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { - PADDLE_ENFORCE_EQ( - argument->scope_valid(), true, - platform::errors::PreconditionNotMet("The scope field should be valid")); - PADDLE_ENFORCE_EQ(argument->use_gpu_valid(), true, +#ifdef PADDLE_WITH_ASCEND_CL +void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { + if (!argument->use_npu()) return; + + auto &graph = argument->main_graph(); + std::vector repetitive_params; + + if (graph.Has(framework::ir::kRepetitiveParamAttr)) + repetitive_params = graph.Get>( + framework::ir::kRepetitiveParamAttr); + + LOG(INFO) << "Sync params from CPU to NPU"; + + PADDLE_ENFORCE_EQ(argument->npu_device_id_valid(), true, platform::errors::PreconditionNotMet( - "The use_gpu field should be valid")); + "The npu_device_id field should be valid")); + platform::Place place = platform::NPUPlace(argument->npu_device_id()); + auto *scope = argument->scope_ptr(); + std::vector all_vars = scope->LocalVarNames(); - platform::Place place; + for (auto &var_name : all_vars) { + auto *var = scope->FindLocalVar(var_name); + PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet( + "The var should not be nullptr")); + + if (var->IsType() || + var->IsType()) { + auto *t = var->GetMutable(); + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(t->dims()); + temp_tensor.mutable_data(cpu_place); + + paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); + t->clear(); + paddle::framework::TensorCopySync(temp_tensor, place, t); + } + } +} + +#else + +void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. 
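// CopyParamsToGpu mirrors CopyParamsToNpu above: it walks the scope's local
// variables and copies each parameter tensor from the CPU to the device.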
if (!argument->use_gpu()) return; @@ -47,8 +81,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { PADDLE_ENFORCE_EQ(argument->gpu_device_id_valid(), true, platform::errors::PreconditionNotMet( "The gpu_device_id field should be valid")); - place = platform::CUDAPlace(argument->gpu_device_id()); - + platform::Place place = platform::CUDAPlace(argument->gpu_device_id()); auto *scope = argument->scope_ptr(); std::vector all_vars = scope->LocalVarNames(); @@ -100,6 +133,22 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { } } +#endif + +void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { + PADDLE_ENFORCE_EQ( + argument->scope_valid(), true, + platform::errors::PreconditionNotMet("The scope field should be valid")); + +#ifdef PADDLE_WITH_ASCEND_CL + if (!argument->use_npu_valid()) return; + CopyParamsToNpu(argument); +#else + if (!argument->use_gpu_valid()) return; + CopyParamsToGpu(argument); +#endif +} + std::string IrParamsSyncAmongDevicesPass::repr() const { return "ir-params-sync-among-devices-pass"; } diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index 61990150a30db147418c4301359428cf3c6db541..d5e98ec886e65f829a1496b1431f23aad6c4bc4c 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -33,6 +33,13 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { public: void RunImpl(Argument *argument) override; std::string repr() const override; + + private: +#ifdef PADDLE_WITH_ASCEND_CL + void CopyParamsToNpu(Argument *argument); +#else + void CopyParamsToGpu(Argument *argument); +#endif }; } // namespace analysis diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index a86329a2b2b25df7cb256c47200598644af84bfe..628d974c1237862c81c9e124851004c50d07d377 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -668,6 +668,9 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetIpuBatchSize(config_.ipu_batch_size_); argument_.SetIpuNeedAvgShard(config_.ipu_need_avg_shard_); + argument_.SetUseNpu(config_.use_npu_); + argument_.SetNPUDeviceId(config_.npu_device_id()); + if (config_.use_mkldnn_) { LOG(INFO) << "MKLDNN is enabled"; argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index fa5997d92dd231af221265601ba337e9291b6284..bd867ba54d235973663ed61deabc81eb34b76c18 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -43,7 +43,7 @@ struct TensorArrayBatchCleaner { constexpr auto kLoDTensorId = framework::VarTypeTrait::kId; constexpr auto kSelectedRowsId = - framework::VarTypeTrait::kId; + framework::VarTypeTrait::kId; constexpr auto kFetchListId = framework::VarTypeTrait::kId; valid_types_.insert(kTensorId); diff --git a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc index f3c4059b8e6456581aad49b944997530e67ef9af..7c5eaa309ef18a839ea97fd9aabd44434c1c903d 100644 --- a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc @@ -83,6 +83,8 @@ class 
ReduceOpConverter : public OpConverter { } auto output_name = op_desc.Output("Out")[0]; + // Ensure that the output type and input type are consistent. + layer->getOutput(0)->setType(layer->getInput(0)->getType()); RreplenishLayerAndOutput(layer, op_type, {output_name}, test_mode); } diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 73fb6c0b13b70221e5b4125846bab7820353eaf5..4a65a036191038e5e4b2692c41e7b4e201135d07 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1464,30 +1464,48 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, VLOG(3) << "the " << op_type << " does not have attr (keep_dim or dim or " "reduce_all)"; - std::cout << "attr " << desc.HasAttr("keep_dim") << " " - << desc.HasAttr("dim") << " " << desc.HasAttr("reduce_all"); + return false; + } + + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr; we can't continue the analysis. " "Developers need to check whether block_desc is passed into " "the pass."; return false; } // The batch size dimension cannot be reduced if it's not dynamic shape. + auto* x_var_desc = block->FindVar(desc.Input("X")[0]); if (!with_dynamic_shape) { if (BOOST_GET_CONST(bool, desc.GetAttr("reduce_all"))) return false; std::vector dim = BOOST_GET_CONST(std::vector, desc.GetAttr("dim")); + const auto input_shape = x_var_desc->GetShape(); for (auto x : dim) { - if (!x) return false; + if (x == 0 || (x + input_shape.size() == 0)) return false; } + } else { if (BOOST_GET_CONST(bool, desc.GetAttr("reduce_all")) && !BOOST_GET_CONST(bool, desc.GetAttr("keep_dim"))) return false; } - if (desc.HasAttr("out_dtype")) { - int out_dtype = BOOST_GET_CONST(int32_t, desc.GetAttr("out_dtype")); - if (out_dtype != -1) { - return false; - } + + auto dtype = x_var_desc->GetDataType(); +#if IS_TRT_VERSION_GE(7000) + if (dtype != framework::proto::VarType::INT32 && + dtype != framework::proto::VarType::FP32) { + VLOG(3) << "reduce op input data type must be int32 or float32"; + return false; } +#else + if (dtype != framework::proto::VarType::FP32) { + VLOG(3) + << "reduce op input data type must be float32 using TensorRT < 7.0"; + return false; + } +#endif } #if IS_TRT_VERSION_GE(7000) if (op_type == "tile") { diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 97952e4b71641e00e27592380a0fd88f2c17b1a0..023b40518edf216f76642aae1577507ee2c36486 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -34,6 +34,13 @@ if (WITH_ROCM) DEPS device_context malloc) endif() +if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + nv_test(get_base_ptr_test SRCS get_base_ptr_test.cu DEPS malloc gpu_info) + set_tests_properties(get_base_ptr_test PROPERTIES + ENVIRONMENT "FLAGS_allocator_strategy=auto_growth; + FLAGS_use_stream_safe_cuda_allocator=true;") +endif() + #if (WITH_GPU) # nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory) #endif() diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 939ad140415df45619018536520e3ffb9d681366..c0d1934a703b66a8ab8a1eab0c1d0680d73b9e17 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -125,10 +125,3 @@ if(NOT WIN32) cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator) cc_test(mmap_allocator_test SRCS 
mmap_allocator_test.cc DEPS mmap_allocator allocator) endif(NOT WIN32) - -if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") - nv_test(base_ptr_test SRCS base_ptr_test.cu DEPS malloc gpu_info) - set_tests_properties(base_ptr_test PROPERTIES - ENVIRONMENT "FLAGS_allocator_strategy=auto_growth; - FLAGS_use_stream_safe_cuda_allocator=true;") -endif() diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index 3f04d47516377251011174b1382679ba41fdca02..878633d1a62915383aa1c5306dcc7940d06282e4 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -93,14 +93,7 @@ class Allocation : public pten::Allocation { const platform::Place& place) : pten::Allocation(ptr, size, place), base_ptr_(base_ptr) {} - void* base_ptr() const { - PADDLE_ENFORCE_EQ(FLAGS_allocator_strategy, "auto_growth", - paddle::platform::errors::Unimplemented( - "base_ptr() is only implemented for auto_growth " - "strategy, not support %s strategy", - FLAGS_allocator_strategy)); - return base_ptr_; - } + void* base_ptr() const { return base_ptr_; } private: inline void RegisterDecoratedAllocator(Allocator* allocator) { diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 6615bdf4b138b483761c82312841f5887f6075c7..7cdac0de6138f13325500759c0ca2a392e2000f9 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -282,6 +282,10 @@ class AllocatorFacadePrivate { return iter->second; } + void* GetBasePtr(const std::shared_ptr& allocation) { + return static_cast(allocation.get())->base_ptr(); + } + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool HasCUDAAllocator(const platform::CUDAPlace& place, const gpuStream_t& stream) { @@ -821,6 +825,21 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } +void* AllocatorFacade::GetBasePtr( + const std::shared_ptr& allocation) { + PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth, + paddle::platform::errors::Unimplemented( + "GetBasePtr() is only implemented for the auto_growth " + "strategy and does not support allocator strategy: %d", + static_cast(GetAllocatorStrategy()))); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(allocation->place()), true, + paddle::platform::errors::Unimplemented( + "GetBasePtr() is only implemented for CUDAPlace() and does " + "not support place: %s", + allocation->place())); + return m_->GetBasePtr(allocation); +} + std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size) { return std::shared_ptr(Alloc(place, size)); diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 76e2f0b5a94f6ddae8e8fb6281bdfcf70f10b76c..a9b92e1801e4a3c74941388f864172f078d7128a 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -51,6 +51,8 @@ class AllocatorFacade { const std::shared_ptr& GetAllocator(const platform::Place& place); + void* GetBasePtr(const std::shared_ptr& allocation); + // Allocate a shared allocation. 
std::shared_ptr AllocShared(const platform::Place& place, size_t size); diff --git a/paddle/fluid/memory/allocation/base_ptr_test.cu b/paddle/fluid/memory/get_base_ptr_test.cu similarity index 80% rename from paddle/fluid/memory/allocation/base_ptr_test.cu rename to paddle/fluid/memory/get_base_ptr_test.cu index 5edabfcb9f5e7efab1242da5f5c091bebcf74c11..fe1d73b60284968d1e0022eb0383bcbcdc25856f 100644 --- a/paddle/fluid/memory/allocation/base_ptr_test.cu +++ b/paddle/fluid/memory/get_base_ptr_test.cu @@ -35,9 +35,9 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { void OneByOneAllocTest() { for (size_t i = 0; i < alloc_times_; ++i) { size_t size = dis_(random_engine_); - AllocationPtr allocation = Alloc(place_, size); + auto allocation = AllocShared(place_, size); - void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* base_ptr = GetBasePtr(allocation); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); @@ -47,21 +47,21 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { } void BatchByBatchAllocTest() { - std::vector allocations; + std::vector> allocations; allocations.reserve(batch_size_); size_t batch_num = alloc_times_ / batch_size_; for (size_t i = 0; i < batch_num; ++i) { for (size_t j = 0; j < batch_size_; ++j) { size_t size = dis_(random_engine_); - AllocationPtr allocation = Alloc(place_, size); + auto allocation = AllocShared(place_, size); - void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* base_ptr = GetBasePtr(allocation); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); - allocations.emplace_back(std::move(allocation)); + allocations.emplace_back(allocation); } allocations.clear(); } @@ -70,19 +70,19 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { } void ContinuousAllocTest() { - std::vector allocations; + std::vector> allocations; allocations.reserve(alloc_times_); for (size_t i = 0; i < alloc_times_; ++i) { size_t size = dis_(random_engine_); - AllocationPtr allocation = Alloc(place_, size); + auto allocation = AllocShared(place_, size); - void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* base_ptr = GetBasePtr(allocation); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); - allocations.emplace_back(std::move(allocation)); + allocations.emplace_back(allocation); } allocations.clear(); @@ -90,8 +90,8 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { } void ZeroSizeAllocTest() { - AllocationPtr allocation = Alloc(place_, 0); - void* base_ptr = static_cast(allocation.get())->base_ptr(); + auto allocation = AllocShared(place_, 0); + void* base_ptr = GetBasePtr(allocation); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 3e859377e98d801e775461d9cfaaa50fe9c43e8e..63c562be97fa0728b26761ac856caf755717a64d 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -47,6 +47,10 @@ bool InSameStream(const std::shared_ptr& allocation, stream); } +void* GetBasePtr(const std::shared_ptr& allocation) { + return allocation::AllocatorFacade::Instance().GetBasePtr(allocation); +} + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, const gpuStream_t& 
stream) { diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 6443e91f08cbeb7c3f504e8f4894808bffd5bbf1..855cbb775a1096ba749d93667c71268045645a15 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -44,6 +44,8 @@ extern std::shared_ptr AllocShared(const platform::Place& place, extern bool InSameStream(const std::shared_ptr& allocation, const platform::Stream& stream); +extern void* GetBasePtr(const std::shared_ptr& allocation); + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, const gpuStream_t& stream); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 6d348ceb87c83de1bb201a6b57477d764b58a2ba..d2ab438fd2946701c70ea0bebf35ac33fbfb521e 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -57,33 +57,6 @@ void Copy(platform::IPUPlace dst_place, std::memcpy(dst, src, num); } -// NOTE: only for CPUPlace and IPUPlace. -template <> -void Copy(pten::Place dst_place, void* dst, - pten::Place src_place, const void* src, - size_t num) { - if (src_place.GetType() == pten::AllocationType::CPU && - dst_place.GetType() == pten::AllocationType::CPU) { - platform::CPUPlace place_dst, place_src; - return Copy(place_dst, dst, place_src, src, num); - } else if (src_place.GetType() == pten::AllocationType::CPU && - dst_place.GetType() == pten::AllocationType::IPU) { - platform::IPUPlace place_dst(dst_place.GetDeviceId()); - platform::CPUPlace place_src; - return Copy(place_dst, dst, place_src, src, num); - } else if (src_place.GetType() == pten::AllocationType::IPU && - dst_place.GetType() == pten::AllocationType::CPU) { - platform::IPUPlace place_src(src_place.GetDeviceId()); - platform::CPUPlace place_dst; - return Copy(place_dst, dst, place_src, src, num); - } else if (src_place.GetType() == pten::AllocationType::IPU && - dst_place.GetType() == pten::AllocationType::IPU) { - platform::IPUPlace place_src(src_place.GetDeviceId()); - platform::IPUPlace place_dst(dst_place.GetDeviceId()); - return Copy(place_dst, dst, place_src, src, num); - } -} - // NOTE: only for (CPUPlace and IPUPlace) -> (IPUPlace). template <> void Copy(pten::IPUPlace dst_place, void* dst, @@ -1039,6 +1012,24 @@ void Copy(pten::Place dst_place, void* dst, return Copy(place_dst, dst, place_src, src, num); } #endif +#ifdef PADDLE_WITH_IPU + else if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::IPU) { + platform::IPUPlace place_dst(dst_place.GetDeviceId()); + platform::CPUPlace place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::IPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::IPUPlace place_src(src_place.GetDeviceId()); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::IPU && + dst_place.GetType() == pten::AllocationType::IPU) { + platform::IPUPlace place_src(src_place.GetDeviceId()); + platform::IPUPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num); + } +#endif } // NOTE: Only for (CPUPlace) -> (CPUPlace and PinnedPlace). 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index d18ff6f6bfe2f0b04966af9e80bc40f3bebfc593..cbc61fc804397b1f0e4ae28fc792959bf5cfe82e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -59,6 +59,10 @@ if(WITH_CINN) add_subdirectory(cinn) endif() +if(WITH_IPU) + add_subdirectory(ipu) +endif() + SET(OP_HEADER_DEPS xxhash executor) if (WITH_GPU) diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc index 979ae5c508c6b6685848b8eff4944aa5461a1daa..5d769214df4d15823066d6a0c2b5a5af0e06261d 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc @@ -94,11 +94,11 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { inverse_scale = 0.0; } - paddle::platform::XPUVersion version = dev_ctx.xpu_version(); + auto version = dev_ctx.xpu_version(); framework::Tensor float_x; framework::Tensor float_out; if (std::is_same::value && - (version == paddle::platform::XPUVersion::XPU1)) { + (version == pten::backends::xpu::XPUVersion::XPU1)) { float_x.mutable_data(dev_ctx.GetPlace(), x->numel() * sizeof(MPDType)); float_out.mutable_data(dev_ctx.GetPlace(), diff --git a/paddle/fluid/operators/assign_op.h b/paddle/fluid/operators/assign_op.h index 5fe2ebb20745b28a2c5a34b2257d741f2ca49d05..1125bbe93c37a99966d49e5da623903d6ba9bf19 100644 --- a/paddle/fluid/operators/assign_op.h +++ b/paddle/fluid/operators/assign_op.h @@ -50,9 +50,8 @@ class AssignFunctor { } } - void operator()(const framework::SelectedRows &rows) const { - framework::SelectedRows &out_rows = - *out_->GetMutable(); + void operator()(const pten::SelectedRows &rows) const { + pten::SelectedRows &out_rows = *out_->GetMutable(); out_rows.set_rows(rows.rows()); out_rows.set_height(rows.height()); auto &t = rows.value(); diff --git a/paddle/fluid/operators/assign_op_test.cc b/paddle/fluid/operators/assign_op_test.cc index 3504ec37d6670b73e93a416ca2d9244b94b46b91..efc1ed9e2ee6045870d1201d686df5a145574bd8 100644 --- a/paddle/fluid/operators/assign_op_test.cc +++ b/paddle/fluid/operators/assign_op_test.cc @@ -87,7 +87,7 @@ TEST(AssignOp, AssignSelectedRows) { std::vector rows{0, 4, 7}; int64_t height = 10; - paddle::framework::SelectedRows input(rows, height); + pten::SelectedRows input(rows, height); paddle::framework::Tensor* input_tensor = input.mutable_value(); paddle::framework::DDim in_dims = paddle::framework::make_ddim({3, 4}); @@ -98,7 +98,7 @@ TEST(AssignOp, AssignSelectedRows) { assign_functor(input); - auto& out_selected_row = output.Get(); + auto& out_selected_row = output.Get(); const paddle::framework::Vector& out_rows = out_selected_row.rows(); EXPECT_EQ(rows.size(), out_rows.size()); for (size_t i = 0; i < rows.size(); ++i) { diff --git a/paddle/fluid/operators/batch_norm_op_mlu.cc b/paddle/fluid/operators/batch_norm_op_mlu.cc index 534af63d2a03fb0fe71769e32e3e9377be5ba68b..0e64b461786cce845f7388a520c09101dcba9c09 100644 --- a/paddle/fluid/operators/batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/batch_norm_op_mlu.cc @@ -106,7 +106,7 @@ class MLUBatchNormOpKernel : public framework::OpKernel { if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); Tensor mom_cpu; - TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); + framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); momentum = mom_cpu.data()[0]; 
} diff --git a/paddle/fluid/operators/clip_by_norm_op.cu b/paddle/fluid/operators/clip_by_norm_op.cu index 368fbe836c266c8835f544b7d739797faf019a81..4d04fdc8ce2d2c658d7e39535dbd9ff2d31c216e 100644 --- a/paddle/fluid/operators/clip_by_norm_op.cu +++ b/paddle/fluid/operators/clip_by_norm_op.cu @@ -36,21 +36,22 @@ class ClipByNormKernel output = context.Output("Out"); output->mutable_data(context.GetPlace()); - } else if (in_var->IsType()) { - auto* x = context.Input("X"); + } else if (in_var->IsType()) { + auto* x = context.Input("X"); // merge ids in selected rows first math::scatter::MergeAdd merge_func; - SelectedRows* merged_input = + pten::SelectedRows* merged_input = const_cast(context.scope()) .Var() - ->GetMutable(); + ->GetMutable(); merge_func(context.template device_context(), *x, merged_input); input = &(merged_input->value()); - SelectedRows* output_selected_rows = context.Output("Out"); + pten::SelectedRows* output_selected_rows = + context.Output("Out"); output_selected_rows->set_rows(merged_input->rows()); output_selected_rows->set_height(merged_input->height()); output = output_selected_rows->mutable_value(); diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index adb2a2fcfa3a7050ad8fd80dcdd4acb04ce49d2d..fb21e98efec2c732b8abeb88343982f62ad07712 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -24,7 +24,7 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using SelectedRows = framework::SelectedRows; +// using SelectedRows = pten::SelectedRows; template using EigenVector = framework::EigenVector; @@ -43,20 +43,21 @@ class ClipByNormKernel : public framework::OpKernel { output = context.Output("Out"); output->mutable_data(context.GetPlace()); - } else if (in_var->IsType()) { - auto* x = context.Input("X"); + } else if (in_var->IsType()) { + auto* x = context.Input("X"); // merge ids in selected rows first math::scatter::MergeAdd merge_func; - SelectedRows* merged_input = + pten::SelectedRows* merged_input = const_cast(context.scope()) .Var() - ->GetMutable(); + ->GetMutable(); merge_func(context.template device_context(), *x, merged_input); input = &(merged_input->value()); - SelectedRows* output_selected_rows = context.Output("Out"); + pten::SelectedRows* output_selected_rows = + context.Output("Out"); output_selected_rows->set_rows(merged_input->rows()); output_selected_rows->set_height(merged_input->height()); output = output_selected_rows->mutable_value(); diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h index fb41dc16d65129e84df693ab9aed6af4607c0db8..5aff62656fb0f4ba0b0044e8c4a6dcabe42181d5 100644 --- a/paddle/fluid/operators/clip_op.h +++ b/paddle/fluid/operators/clip_op.h @@ -113,9 +113,9 @@ class ClipKernel : public framework::OpKernel { trans(context.template device_context(), x_data, x_data + numel, out_data, ClipFunctor(min, max)); } - } else if (x_var->IsType()) { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); + } else if (x_var->IsType()) { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); PADDLE_ENFORCE_NE(x, out, platform::errors::InvalidArgument( "Inplace clip is not allowed " "when x is SelectedRows")); diff --git a/paddle/fluid/operators/concat_op_mlu.cc b/paddle/fluid/operators/concat_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f7a1cae72be5a42d15d2e89663010489f529962a --- /dev/null +++ 
b/paddle/fluid/operators/concat_op_mlu.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class ConcatMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + framework::LoDTensor* out = ctx.Output("Out"); + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initialized.")); + auto axis = ctx.Attr("axis"); + auto ins_size = ins.size(); + bool need_resize_out_dims = false; + if (ctx.HasInput("AxisTensor")) { + auto* axis_tensor = ctx.Input("AxisTensor"); + axis = GetDataFromTensor(axis_tensor)[0]; + need_resize_out_dims = true; + } + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + + if (need_resize_out_dims) { + const size_t n = ins.size(); + std::vector ins_dims(n); + for (size_t i = 0; i < n; i++) { + ins_dims[i] = ins[i]->dims(); + } + + framework::DDim out_dims = ComputeAndCheckShape(true, ins_dims, axis); + out->Resize(out_dims); + } + const int axis_t = axis; + const int ins_size_t = ins_size; + auto place = ctx.GetPlace(); + out->mutable_data(place); + + // prepare the inputs for CNNL: + // build a tensor descriptor and a data pointer for each input + std::vector inputs; + std::vector input_descs; + std::vector desc_vector; + for (size_t i = 0; i < ins_size; i++) { + input_descs.emplace_back(MLUCnnlTensorDesc( + *ins[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(ins[i]->type()))); + desc_vector.push_back(input_descs.back().get()); + inputs.push_back(GetBasePtr(ins[i])); + } + // build the tensor descriptor for the output + MLUCnnlTensorDesc output_desc(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->type())); + + // launch the CNNL concat on the MLU device + MLUCnnl::Concat(ctx, ins_size_t, axis_t, desc_vector.data(), inputs.data(), + output_desc.get(), GetBasePtr(out)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(concat, ops::ConcatMLUKernel, + ops::ConcatMLUKernel, + ops::ConcatMLUKernel, + ops::ConcatMLUKernel, ops::ConcatMLUKernel, + ops::ConcatMLUKernel); diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 4c9727391759b0c1865e9fc51288458e7786c878..7ad49de4eed5e26cdc24a7444ead9a50abf54453 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -251,7 +251,7 @@ struct SearchAlgorithm { args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, &perf_count, perf_results.get())); algo = (perf_results.get())[best_algo_idx].algo; - workspace_size = GetWorkspaceSize(args, algo); + workspace_size = (perf_results.get())[best_algo_idx].memory; if (workspace_size > workspace_size_limit) { #if CUDNN_VERSION >= 8000 @@ -502,7 +502,8 @@ struct SearchAlgorithm { args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS, 
&perf_count, perf_results.get())); algo = (perf_results.get())[best_algo_idx].algo; - workspace_size = GetWorkspaceSize(args, algo); + workspace_size = (perf_results.get())[best_algo_idx].memory; + if (workspace_size > workspace_size_limit) { workspace_size = workspace_size_limit; #if CUDNN_VERSION >= 8000 diff --git a/paddle/fluid/operators/conv_op_mlu.cc b/paddle/fluid/operators/conv_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..88698c02dd5daf11d6c5b7d68446d292696977ec --- /dev/null +++ b/paddle/fluid/operators/conv_op_mlu.cc @@ -0,0 +1,251 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/conv_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; + +template +class MLUConvOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + output->mutable_data(ctx.GetPlace()); + const std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + const std::string padding_algorithm = + ctx.Attr("padding_algorithm"); + const std::string data_format = ctx.Attr("data_format"); + + const bool channel_last = data_format == "NHWC"; + + // update padding and dilation + auto in_dims = input->dims(); + auto filter_dims = filter->dims(); + auto in_dims_size = in_dims.size(); + framework::DDim in_data_dims; + framework::DDim filter_data_dims; + + if (channel_last) { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } + filter_data_dims = framework::slice_ddim(filter_dims, 2, in_dims.size()); + std::vector ksize = framework::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + Tensor input_tensor(input->type()); + Tensor output_tensor(output->type()); + const std::vector perm_to_nhwc = {0, 2, 3, 1}; + if (channel_last) { + input_tensor.ShareDataWith(*input); + output_tensor.ShareDataWith(*output); + } else { + // transpose input from NCHW to NHWC + TransposeFromMLUTensor(ctx, perm_to_nhwc, input, &input_tensor, + true /*need_reshape_or_alloc*/); + auto output_dims = output->dims(); + output_tensor.mutable_data( + {output_dims[0], output_dims[2], output_dims[3], output_dims[1]}, + ctx.GetPlace()); + } + input_tensor.set_layout(DataLayout::kNHWC); + output_tensor.set_layout(DataLayout::kNHWC); + + // transpose filter from MCHW to MHWC + Tensor trans_filter(filter->type()); + TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, &trans_filter, + true 
/*need_reshape_or_alloc*/); + + cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC; + MLUCnnlTensorDesc input_desc(input_tensor, data_layout, + ToCnnlDataType(input_tensor.type())); + MLUCnnlTensorDesc filter_desc(trans_filter, data_layout, + ToCnnlDataType(trans_filter.type())); + MLUCnnlTensorDesc output_desc(output_tensor, data_layout, + ToCnnlDataType(output_tensor.type())); + + MLUCnnlConvolutionDesc conv_desc(in_dims_size, paddings.data(), + strides.data(), dilations.data(), groups, + ToCnnlDataType()); + + MLUCnnl::ConvolutionForward( + ctx, conv_desc.get(), nullptr /*alpha*/, nullptr /*beta*/, + nullptr /*bias_desc*/, nullptr /*bias_ptr*/, input_desc.get(), + GetBasePtr(&input_tensor), filter_desc.get(), GetBasePtr(&trans_filter), + output_desc.get(), GetBasePtr(&output_tensor)); + + if (!channel_last) { + // transpose output from NHWC to NCHW + const std::vector perm_to_nchw = {0, 3, 1, 2}; + TransposeFromMLUTensor(ctx, perm_to_nchw, &output_tensor, output, + false /*need_reshape_or_alloc*/); + } + } +}; + +template +class MLUConvGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto input = ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + + const std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + const std::string padding_algorithm = + ctx.Attr("padding_algorithm"); + const std::string data_format = ctx.Attr("data_format"); + + const bool channel_last = data_format == "NHWC"; + + // update padding and dilation + auto in_dims = input->dims(); + auto filter_dims = filter->dims(); + auto in_dims_size = in_dims.size(); + framework::DDim in_data_dims; + framework::DDim filter_data_dims; + + if (channel_last) { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } + filter_data_dims = framework::slice_ddim(filter_dims, 2, in_dims.size()); + + std::vector ksize = framework::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + Tensor input_tensor(input->type()); + Tensor output_grad_tensor(output_grad->type()); + const std::vector perm_to_nhwc = {0, 2, 3, 1}; + const std::vector perm_to_nchw = {0, 3, 1, 2}; + if (channel_last) { + input_tensor.ShareDataWith(*input); + output_grad_tensor.ShareDataWith(*output_grad); + } else { + // transpose input and output_grad from NCHW to NHWC + TransposeFromMLUTensor(ctx, perm_to_nhwc, input, &input_tensor, + true /*need_reshape_or_alloc*/); + TransposeFromMLUTensor(ctx, perm_to_nhwc, output_grad, + &output_grad_tensor, + true /*need_reshape_or_alloc*/); + } + input_tensor.set_layout(DataLayout::kNHWC); + output_grad_tensor.set_layout(DataLayout::kNHWC); + + if (filter_grad) { + filter_grad->mutable_data(ctx.GetPlace()); + + auto filter_grad_dims = filter_grad->dims(); + Tensor temp_filter_grad(filter_grad->type()); + temp_filter_grad.mutable_data( + {filter_grad_dims[0], filter_grad_dims[2], filter_grad_dims[3], + filter_grad_dims[1]}, + ctx.GetPlace()); + + cnnlDataType_t tensor_dtype = ToCnnlDataType(); + cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC; + 
MLUCnnlTensorDesc input_desc(input_tensor, data_layout, tensor_dtype); + MLUCnnlTensorDesc out_grad_desc(output_grad_tensor, data_layout, + tensor_dtype); + MLUCnnlTensorDesc temp_filter_grad_desc(temp_filter_grad, data_layout, + tensor_dtype); + + MLUCnnlConvolutionDesc conv_desc(in_dims_size, paddings.data(), + strides.data(), dilations.data(), groups, + tensor_dtype); + + MLUCnnl::ConvBackpropFilter( + ctx, conv_desc.get(), input_desc.get(), GetBasePtr(&input_tensor), + out_grad_desc.get(), GetBasePtr(&output_grad_tensor), + temp_filter_grad_desc.get(), GetBasePtr(&temp_filter_grad)); + + // transpose filter_grad from MHWC to MCHW + TransposeFromMLUTensor(ctx, perm_to_nchw, &temp_filter_grad, + filter_grad, false /*need_reshape_or_alloc*/); + } + if (input_grad) { + input_grad->mutable_data(ctx.GetPlace()); + + Tensor input_grad_tensor(input_grad->type()); + if (channel_last) { + input_grad_tensor.ShareDataWith(*input_grad); + } else { + auto input_grad_dims = input_grad->dims(); + input_grad_tensor.mutable_data( + {input_grad_dims[0], input_grad_dims[2], input_grad_dims[3], + input_grad_dims[1]}, + ctx.GetPlace()); + } + input_grad_tensor.set_layout(DataLayout::kNHWC); + + // transpose filter from MCHW to MHWC + Tensor trans_filter(filter->type()); + TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, &trans_filter, + true /*need_reshape_or_alloc*/); + + cnnlDataType_t tensor_dtype = ToCnnlDataType(); + cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC; + MLUCnnlTensorDesc filter_desc(trans_filter, data_layout, tensor_dtype); + MLUCnnlTensorDesc out_grad_desc(output_grad_tensor, data_layout, + tensor_dtype); + MLUCnnlTensorDesc in_grad_desc(input_grad_tensor, data_layout, + tensor_dtype); + + MLUCnnlConvolutionDesc conv_desc(in_dims_size, paddings.data(), + strides.data(), dilations.data(), groups, + tensor_dtype); + + MLUCnnl::ConvBackpropInput( + ctx, conv_desc.get(), filter_desc.get(), GetBasePtr(&trans_filter), + out_grad_desc.get(), GetBasePtr(&output_grad_tensor), + in_grad_desc.get(), GetBasePtr(&input_grad_tensor)); + + if (!channel_last) { + // transpose input_grad from NHWC to NCHW + TransposeFromMLUTensor(ctx, perm_to_nchw, &input_grad_tensor, + input_grad, false /*need_reshape_or_alloc*/); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(conv2d, ops::MLUConvOpKernel, + ops::MLUConvOpKernel); + +REGISTER_OP_MLU_KERNEL(conv2d_grad, ops::MLUConvGradOpKernel, + ops::MLUConvGradOpKernel); diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc index cded525b030d8d88774d01488e3575195381bba4..e80797bd9b971a210efa423d4797984fb1dacf7d 100644 --- a/paddle/fluid/operators/dropout_op_xpu.cc +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -107,8 +107,8 @@ class DropoutGradXPUKernel : public framework::OpKernel { return; } - paddle::platform::XPUVersion version = dev_ctx.xpu_version(); - if (version == paddle::platform::XPUVersion::XPU1) { + auto version = dev_ctx.xpu_version(); + if (version == pten::backends::xpu::XPUVersion::XPU1) { xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); XPUType* mask_new = RAII_GUARD.alloc_l3_or_gm(mask->numel()); float scale = diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 0c2476fde05c2c3226105707fd2686bc61d15bc7..f462c2ea0720b600f238109704e9606a2f7d627c 100644 --- 
a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -33,34 +33,6 @@ class CPUDeviceContext; namespace paddle { namespace operators { -template -struct SameDimsElemwiseAdd< - platform::CPUDeviceContext, T, - typename std::enable_if::value>::type> { - void operator()(const framework::ExecutionContext &ctx, - const framework::Tensor *x, const framework::Tensor *y, - framework::Tensor *z) { - auto blas = math::GetBlas(ctx); - blas.VADD(x->numel(), x->data(), y->data(), z->data()); - } -}; - -template -struct SameDimsElemwiseAdd< - platform::CPUDeviceContext, T, - typename std::enable_if::value>::type> { - void operator()(const framework::ExecutionContext &ctx, - const framework::Tensor *x, const framework::Tensor *y, - framework::Tensor *z) { - auto eigen_x = framework::EigenVector::Flatten(*x); - auto eigen_y = framework::EigenVector::Flatten(*y); - auto eigen_z = framework::EigenVector::Flatten(*z); - auto &place = *ctx.template device_context() - .eigen_device(); - eigen_z.device(place) = eigen_x + eigen_y; - } -}; - class ElementwiseAddOpMaker : public ElementwiseOpMaker { protected: std::string GetName() const override { return "Add"; } diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 779779b44da8d1df275b057bbb9d37828c6904ed..2326aa561eaa05986c6e58bc1f2f2c93334cf893 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -13,139 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/pten/kernels/gpu/elementwise.h" namespace ops = paddle::operators; namespace plat = paddle::platform; namespace paddle { -namespace operators { - -template -static __global__ void SimpleElemwiseAddGradCUDAKernel( - const T* __restrict__ dout, int size, int vec_size, T* dx, T* dy) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = gridDim.x * blockDim.x; - int loop = size / vec_size; - int remainder = size % vec_size; - const float4* dout_vec = reinterpret_cast(dout); - float4* dx_vec = reinterpret_cast(dx); - float4* dy_vec = reinterpret_cast(dy); - float4 tmp_loop; - - for (int i = tid; i < loop; i += stride) { - tmp_loop = dout_vec[i]; - dx_vec[i] = tmp_loop; - dy_vec[i] = tmp_loop; - } - - if (tid == loop && remainder != 0) { - T tmp_rem; - while (remainder) { - int idx = size - remainder; - remainder--; - tmp_rem = dout[idx]; - dx[idx] = tmp_rem; - dy[idx] = tmp_rem; - } - } -} - -template -typename std::enable_if< - std::is_same::value>::type -default_elementwise_add_grad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, - const framework::Tensor* out, - const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - auto* dout_data = dout->data(); - - // dx - if (dx != nullptr) { - auto* dx_data = dx->mutable_data(ctx.GetPlace()); - if (dx->dims() == dout->dims()) { - if (dx_data != dout_data) { - framework::TensorCopy( - *dout, ctx.GetPlace(), - ctx.template device_context(), dx); - } - } else { - // For inplace strategy, dx will be stored in addr of dout, which makes - // the result of dy wrong. 
- if (dx->IsSharedBufferWith(*dout)) { - dx->clear(); - dx->mutable_data(x->dims(), ctx.GetPlace()); - } - std::vector reduce_dims = GetReduceDim(x->dims(), out->dims(), axis); - gpuStream_t stream = ctx.cuda_device_context().stream(); - TensorReduceFunctorImpl>( - *dout, dx, kps::IdentityFunctor(), reduce_dims, stream); - } - } - // dy - if (dy != nullptr) { - auto* dy_data = dy->mutable_data(ctx.GetPlace()); - if (dy->dims() == dout->dims()) { - if (dy_data != dout_data) { - framework::TensorCopy( - *dout, ctx.GetPlace(), - ctx.template device_context(), dy); - } - } else { - std::vector reduce_dims = GetReduceDim(y->dims(), out->dims(), axis); - gpuStream_t stream = ctx.cuda_device_context().stream(); - TensorReduceFunctorImpl>( - *dout, dy, kps::IdentityFunctor(), reduce_dims, stream); - } - } -} - -template -typename std::enable_if< - std::is_same::value>::type -elementwise_add_grad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, - const framework::Tensor* dout, framework::Tensor* dx, - framework::Tensor* dy) { - auto* dx_data = dx->mutable_data(ctx.GetPlace()); - auto* dy_data = dy->mutable_data(ctx.GetPlace()); - auto* dout_data = dout->data(); - if (dx_data == dout_data && dy_data != dout_data) { - VLOG(4) << "Special case when dx_data is the same as dout_data, " - "only need copy dout to dy"; - framework::TensorCopy( - *dout, ctx.GetPlace(), - ctx.template device_context(), dy); - } else if (dx_data != dout_data && dy_data == dout_data) { - VLOG(4) << "Special case when dy_data is the same as dout_data, " - "only need copy dout to dx"; - framework::TensorCopy( - *dout, ctx.GetPlace(), - ctx.template device_context(), dx); - } else if (dx_data != dout_data && dy_data != dout_data) { - auto size = x->numel(); - int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); - dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); - dim3 grid_size = - dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) / - PREDEFINED_BLOCK_SIZE, - 1); - SimpleElemwiseAddGradCUDAKernel< - T><<().stream()>>>( - dout->data(), size, vec_size, dx->mutable_data(ctx.GetPlace()), - dy->mutable_data(ctx.GetPlace())); - } else { - VLOG(4) << "Special case when dy_data is the same as dout_data, " - "and dx_data is the same as dout_data, do not need " - "any operator"; - } -} - -} // namespace operators +namespace operators {} // namespace operators } // namespace paddle REGISTER_OP_CUDA_KERNEL( elementwise_add, ops::ElementwiseAddKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 5c4f791b2270c2d45909c24868e56d0bc62f86c3..73415d3fdb5c83cac1c0a8afb67548d7fa09b3c3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -18,35 +18,13 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" +// only can include the headers in paddle/pten/include dirs +#include "paddle/pten/kernels/elementwise_grad_kernel.h" #include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { -template -void LaunchBroadcastElementwiseCpuKernel(const framework::ExecutionContext &ctx, - const framework::Tensor *x, - const framework::Tensor *y, - framework::Tensor *z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - AddFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseAddFunctor(), z); - } -} - -template -struct SameDimsElemwiseAdd { - void operator()(const framework::ExecutionContext &ctx, - const framework::Tensor *x, const framework::Tensor *y, - framework::Tensor *z); -}; - template class ElementwiseAddKernel : public framework::OpKernel { public: @@ -58,128 +36,29 @@ class ElementwiseAddKernel : public framework::OpKernel { auto &dev_ctx = ctx.device_context(); int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); - auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); - auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); pten::AddRawKernel( static_cast::TYPE &>(dev_ctx), - *pt_x.get(), *pt_y.get(), axis, pt_z.get()); + *x, *y, axis, z); } }; -template -struct IdentityGrad { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } -}; - -template -typename std::enable_if< - std::is_same::value>::type -default_elementwise_add_grad(const framework::ExecutionContext &ctx, - const framework::Tensor *x, - const framework::Tensor *y, - const framework::Tensor *out, - const framework::Tensor *dout, - framework::Tensor *dx, framework::Tensor *dy) { - int axis = ctx.Attr("axis"); - - ElemwiseExplicitGradCompute, - IdentityGrad>(ctx, *x, *y, *out, *dout, axis, - dx, dy, IdentityGrad(), - IdentityGrad()); -} - -template -typename std::enable_if< - std::is_floating_point::value && - std::is_same::value>::type -elementwise_add_grad(const framework::ExecutionContext &ctx, - const framework::Tensor *x, const framework::Tensor *y, - const framework::Tensor *out, - const framework::Tensor *dout, framework::Tensor *dx, - framework::Tensor *dy) { - auto blas = math::GetBlas(ctx); - if (dx) { - blas.VCOPY(dout->numel(), dout->data(), - dx->mutable_data(ctx.GetPlace())); - } - - if (dy) { - blas.VCOPY(dout->numel(), dout->data(), - dy->mutable_data(ctx.GetPlace())); - } -} - -template -typename std::enable_if< - !std::is_floating_point::value && - std::is_same::value>::type -elementwise_add_grad(const framework::ExecutionContext &ctx, - const framework::Tensor *x, const framework::Tensor *y, - const framework::Tensor *out, - const framework::Tensor *dout, framework::Tensor *dx, - framework::Tensor *dy) { - default_elementwise_add_grad(ctx, x, y, out, dout, dx, dy); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -// cuda definition -template -typename std::enable_if< - std::is_same::value>::type -elementwise_add_grad(const framework::ExecutionContext &ctx, - const framework::Tensor *x, const framework::Tensor *y, - const framework::Tensor *out, - const framework::Tensor *dout, framework::Tensor *dx, - framework::Tensor *dy); - -template -typename std::enable_if< - std::is_same::value>::type -default_elementwise_add_grad(const framework::ExecutionContext &ctx, - 
const framework::Tensor *x, - const framework::Tensor *y, - const framework::Tensor *out, - const framework::Tensor *dout, - framework::Tensor *dx, framework::Tensor *dy); -#endif - template class ElementwiseAddGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - auto *x = ctx.Input("X"); auto *y = ctx.Input("Y"); auto *dout = ctx.Input(framework::GradVarName("Out")); auto *dx = ctx.Output(framework::GradVarName("X")); auto *dy = ctx.Output(framework::GradVarName("Y")); - // skip out - auto *out = dout; - - // Special case when dy is not needed and dx doesn't reduce - if (dx != nullptr && dy == nullptr && dx->dims() == dout->dims()) { - VLOG(4) << "Special case when dy is not needed and dx doesn't " - "reduce"; - framework::TensorCopy( - *dout, ctx.GetPlace(), - ctx.template device_context(), dx); - } else if (dx == nullptr && dy != nullptr && dy->dims() == dout->dims()) { - VLOG(4) << "Special case when dx is not needed and dy doesn't " - "reduce"; - framework::TensorCopy( - *dout, ctx.GetPlace(), - ctx.template device_context(), dy); - } else if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - elementwise_add_grad(ctx, x, y, out, dout, dx, dy); - } else { - default_elementwise_add_grad(ctx, x, y, out, dout, dx, - dy); - } + const auto &dev_ctx = ctx.template device_context(); + int axis = ctx.Attr("axis"); + pten::AddGradKernel( + static_cast::TYPE &>(dev_ctx), + *x, *y, *dout, axis, dx, dy); } }; @@ -195,17 +74,20 @@ class ElementwiseAddDoubleGradKernel : public framework::OpKernel { auto *ddy = ctx.Input("DDY"); auto *ddout = ctx.Output("DDOut"); - - // ddOut = ddx + ddy - if (ddout) { - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, dout, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - ddout->mutable_data(ctx.GetPlace()); - LaunchBroadcastElementwiseCpuKernel(ctx, &ddx_safe, - &ddy_safe, ddout); + const auto &dev_ctx = ctx.template device_context(); + int axis = ctx.Attr("axis"); + paddle::optional ddx_optional = paddle::none; + paddle::optional ddy_optional = paddle::none; + if (ddx != nullptr) { + ddx_optional = *ddx; + } + if (ddy != nullptr) { + ddy_optional = *ddy; } + pten::AddDoubleGradKernel( + static_cast::TYPE &>(dev_ctx), + *y, ddx_optional, ddy_optional, *dout, axis, ddout); } }; @@ -219,32 +101,13 @@ class ElementwiseAddTripleGradKernel : public framework::OpKernel { auto *d_ddout = ctx.Input("D_DDOut"); auto *d_ddx = ctx.Output("D_DDX"); auto *d_ddy = ctx.Output("D_DDY"); - // skip out - auto *out = d_ddout; - - // Special case when d_ddy is not needed and d_ddx doesn't reduce - if (d_ddx != nullptr && d_ddy == nullptr && - d_ddx->dims() == d_ddout->dims()) { - VLOG(4) << "Special case when d_ddy is not needed and d_ddx doesn't " - "reduce"; - framework::TensorCopy( - *d_ddout, ctx.GetPlace(), - ctx.template device_context(), d_ddx); - } else if (d_ddx == nullptr && d_ddy != nullptr && - d_ddy->dims() == d_ddout->dims()) { - VLOG(4) << "Special case when d_ddx is not needed and d_ddy doesn't " - "reduce"; - framework::TensorCopy( - *d_ddout, ctx.GetPlace(), - ctx.template device_context(), d_ddy); - } else if (d_ddx != nullptr && d_ddy != nullptr && - (d_ddx->dims() == d_ddy->dims())) { - elementwise_add_grad(ctx, ddx, ddy, out, d_ddout, d_ddx, - d_ddy); - } else { - default_elementwise_add_grad(ctx, ddx, ddy, out, - d_ddout, d_ddx, d_ddy); - } + + const auto &dev_ctx = 
ctx.template device_context(); + int axis = ctx.Attr("axis"); + pten::AddTripleGradKernel( + static_cast::TYPE &>(dev_ctx), + *ddx, *ddy, *d_ddout, axis, d_ddx, d_ddy); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 0c7d12ae0ad55cedfced38705ae40d7394c07158..8923f1fd4b866252ec8048729c717e79230f1f7b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -32,7 +32,7 @@ class ElementwiseMulKernel ctx.InputName("X"))); const auto& cuda_ctx = ctx.template device_context(); - if (x_var->IsType()) { + if (x_var->IsType()) { framework::Tensor x_for_selectedrows; std::vector ins; std::vector outs; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index e7a5e48b1f1b5570d8a4c32b44aac4d8f0705d9a..40faf7cbbe8cd8b30891f5b5865a6eb17f5e27ed 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -92,20 +92,20 @@ class ElementwiseMulKernel : public framework::OpKernel { auto* y = ctx.Input("Y"); framework::Tensor x, *z; - if (x_var->IsType()) { + if (x_var->IsType()) { PADDLE_ENFORCE_EQ(y->dims().size() == 1 && y->dims()[0] == 1, true, platform::errors::InvalidArgument( "For elementwise_op, if X is Sparse, Y must be " "scalar. But reveived the size of Y = %s.", y->dims().size())); - auto& x_sele = x_var->Get(); - auto out_sele = ctx.Output("Out"); + auto& x_sele = x_var->Get(); + auto out_sele = ctx.Output("Out"); x = x_sele.value(); out_sele->set_rows(x_sele.rows()); out_sele->set_height(x_sele.height()); out_sele->mutable_value()->Resize(x_sele.value().dims()); out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x.type()); - z = ctx.Output("Out")->mutable_value(); + z = ctx.Output("Out")->mutable_value(); z->mutable_data(ctx.GetPlace()); auto dims_equal = x.dims() == y->dims(); if (dims_equal) { diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index aaf33ca67448865abd172d7fdb9f10728ec5766d..64beac0804d0f650a65fe218d2a68495da2303f1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -354,6 +354,18 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { tensor.place(), tensor.layout()); } } + framework::KernelSignature GetExpectedPtenKernelArgs( + const framework::ExecutionContext &ctx) const override { + if (Type() == "elementwise_add_grad") { + if (ctx.InputVar("X")->IsType()) { + return framework::KernelSignature( + "add_grad", {"X", "Y", framework::GradVarName("Out")}, {"axis"}, + {framework::GradVarName("X"), framework::GradVarName("Y")}); + } + } + + return framework::KernelSignature("None", {"X"}, {}, {"Out"}); + } }; class ElementwiseOpDoubleGrad : public framework::OperatorWithKernel { @@ -522,11 +534,9 @@ class ElemwiseGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext &context) const override { auto *dx = context.Output(framework::GradVarName("X")); - if (dx != nullptr) { - auto &dout = - *context.Input(framework::GradVarName("Out")); - dx->set_lod(dout.lod()); - } + auto &dout = + *context.Input(framework::GradVarName("Out")); + pten::funcs::ElementwiseGradPreProcess(dout, dx); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h 
b/paddle/fluid/operators/elementwise/elementwise_op_function.h index f0641dd97d87f448021fc7f7a8e02be4cb44d2ba..fdf04181de76c64ba239ce8fbd83bf9f5d1c5124 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -84,7 +84,7 @@ int PackTensorsIntoVector(const framework::ExecutionContext &ctx, auto *x = ctx.Input("X"); z = ctx.Output("Out"); ins->emplace_back(x); - } else if (x_var->IsType()) { + } else if (x_var->IsType()) { PADDLE_ENFORCE_EQ(y->dims().size() == 1 && y->dims()[0] == 1, true, platform::errors::InvalidArgument( "For elementwise_op, if X is Sparse, Y must be " @@ -96,15 +96,15 @@ int PackTensorsIntoVector(const framework::ExecutionContext &ctx, "The parameter x_for_selectedrows is excepted to " "be valid, once input varible X`s class type is " "SelectedRows.\n")); - auto &x_sele = x_var->Get(); - auto out_sele = ctx.Output("Out"); + auto &x_sele = x_var->Get(); + auto out_sele = ctx.Output("Out"); *x_for_selectedrows = x_sele.value(); out_sele->set_rows(x_sele.rows()); out_sele->set_height(x_sele.height()); out_sele->mutable_value()->Resize(x_sele.value().dims()); out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x_for_selectedrows->type()); - z = ctx.Output("Out")->mutable_value(); + z = ctx.Output("Out")->mutable_value(); ins->emplace_back(x_for_selectedrows); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -158,32 +158,6 @@ void ElemwiseGradCompute(const framework::ExecutionContext &ctx, } } -// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub. -// explicit gradient can cut off X, Y, Out from gradient op -// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse -// elementwise code. -template -void ElemwiseExplicitGradCompute(const framework::ExecutionContext &ctx, - const framework::Tensor &x, - const framework::Tensor &y, - const framework::Tensor &out, - const framework::Tensor &dout, int axis, - framework::Tensor *dx, framework::Tensor *dy, - DX_OP dx_op, DY_OP dy_op) { - const framework::DDim &x_dim = x.dims(); - const framework::DDim &y_dim = y.dims(); - const auto &dev_ctx = ctx.template device_context(); - if (x.dims() == y.dims()) { - pten::funcs::ElemwiseGradComputeNoBroadcast( - dev_ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, - dy_op); - } else { - pten::ElemwiseGradComputeWithBroadcast( - dev_ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, - dy_op); - } -} - // It is a common implementation to compute binary calculation with the support // of broadcast, supporting both CPU and GPU. 
// - CPU implementation cannot support the case when x needs broadcast, thus @@ -199,30 +173,20 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *y, int axis, Functor func, framework::Tensor *z) { + z->mutable_data(ctx.GetPlace()); if (platform::is_gpu_place(ctx.GetPlace())) { #if defined(__NVCC__) || defined(__HIPCC__) - std::vector ins = {x, y}; - std::vector outs = {z}; - z->mutable_data(ctx.GetPlace()); - const auto &dev_ctx = ctx.template device_context(); - paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, - axis, func); + pten::ElementwiseCompute(dev_ctx, *x, *y, axis, func, + z); + #endif return; } - - z->mutable_data(ctx.GetPlace()); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); - auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); - auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); - const auto &dev_ctx = ctx.template device_context(); - pten::ElementwiseCompute( - dev_ctx, *pt_x.get(), *pt_y.get(), axis, func, pt_z.get()); + pten::ElementwiseCompute(dev_ctx, *x, *y, axis, func, z); } // FusedElemwiseAndAct @@ -1207,36 +1171,16 @@ template static inline void GetDoubleGradSafeTensor( const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *ddx, framework::Tensor *ddx_safe) { - if (ddx) { - *ddx_safe = *ddx; - } else { - auto &dev_ctx = ctx.template device_context(); - *ddx_safe = ctx.AllocateTmpTensor(x->dims(), dev_ctx); - math::SetConstant set_zero; - set_zero(ctx.template device_context(), ddx_safe, - static_cast(0)); - } + const auto &dev_ctx = ctx.template device_context(); + pten::funcs::GetDoubleGradSafeTensor(dev_ctx, *x, ddx, + ddx_safe); } // for broadcast backwards static inline std::vector GetReduceDim(const framework::DDim &in, const framework::DDim &out, int axis) { - axis = - (axis == -1 ? 
std::abs(static_cast(out.size() - in.size())) : axis); - std::vector dims; - for (int i = 0; i < axis; ++i) { - dims.push_back(i); - } - for (int i = 0; i < in.size(); ++i) { - if (out[i + axis] != in[i]) { - dims.push_back(i + axis); - } - } - for (int i = axis + in.size(); i < out.size(); ++i) { - dims.push_back(i); - } - return dims; + return pten::funcs::GetReduceDim(in, out, axis); } #if defined(__NVCC__) || defined(__HIPCC__) diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 7d1749f20abf29f155e0d05931902a63aa9a1837..8fc6038ab65819dbf6e108b0f3df1a4478e915c4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -78,9 +78,11 @@ default_elementwise_sub_grad(const framework::ExecutionContext& ctx, const framework::Tensor* dout, framework::Tensor* dx, framework::Tensor* dy) { int axis = ctx.Attr("axis"); - - ElemwiseExplicitGradCompute, SubGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), SubGradDY()); + const auto& dev_ctx = + ctx.template device_context(); + pten::ElemwiseExplicitGradCompute, SubGradDY>( + dev_ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), + SubGradDY()); } template diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index ee7c0eb96eae5c14e68023a3ebbdc2ef4ea9ca04..c0e2b4584d0260e221b2fc45d3e7e46415a9b7b5 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -117,7 +117,7 @@ class FillConstantOp : public framework::OperatorWithKernel { const auto& str_value = ctx.Attr("str_value"); value = str_value.empty() ? "value" : "str_value"; } - if (!ctx.OutputVar("Out")->IsType()) { + if (!ctx.OutputVar("Out")->IsType()) { return framework::KernelSignature("full", {}, {shape, value}, {"Out"}); } return framework::KernelSignature("fill_constant.unregistered", {}, {}, {}); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 9e9bd2e0fbbc94c1aaa85018de3b4ed96a8f686c..c74cf2a824c830a7a3b00f90e31b8508c23aba68 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -92,8 +92,8 @@ class FillConstantKernel : public framework::OpKernel { if (out_var->IsType()) { tensor = out_var->GetMutable(); tensor->Resize(shape); - } else if (out_var->IsType()) { - tensor = out_var->GetMutable()->mutable_value(); + } else if (out_var->IsType()) { + tensor = out_var->GetMutable()->mutable_value(); tensor->Resize(shape); } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/operators/fill_constant_op_mlu.cc b/paddle/fluid/operators/fill_constant_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..d161a857d6c1778e8136702564dae2582c8a0465 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op_mlu.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fill_constant_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
+namespace paddle {
+namespace operators {
+
+template
+class FillConstantMLUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto str_value = ctx.Attr("str_value");
+    auto float_value = ctx.Attr("value");
+
+    auto *out_var = ctx.Output("Out");
+
+    T value;
+    if (str_value.empty()) {
+      value = static_cast(float_value);
+    } else {
+      // handle NaN/Inf first, which cannot be read from stream.
+      if (str_value == "inf") {
+        value = static_cast(std::numeric_limits::infinity());
+      } else if (str_value == "-inf") {
+        value = static_cast(-std::numeric_limits::infinity());
+      } else if (str_value == "nan") {
+        value = static_cast(std::numeric_limits::quiet_NaN());
+      } else {
+        std::stringstream convert_stream(str_value);
+        if (std::is_same::value) {
+          int64_t tmp_value;
+          convert_stream >> tmp_value;
+          value = static_cast(tmp_value);
+        } else {
+          double tmp_value;
+          convert_stream >> tmp_value;
+          value = static_cast(tmp_value);
+        }
+      }
+    }
+    if (ctx.HasInput("ValueTensor")) {
+      auto *value_tensor = ctx.Input("ValueTensor");
+      PADDLE_ENFORCE_EQ(
+          value_tensor->numel(), 1,
+          platform::errors::InvalidArgument(
+              "When a Tensor is used as the value in fill_constant, "
+              "the value input (ValueTensor) size must be 1, but got %d",
+              value_tensor->numel()));
+      const T *tensor_data = value_tensor->data();
+      framework::Tensor mlu_tensor;
+      auto tmp_place = value_tensor->place();
+      if (platform::is_mlu_place(tmp_place)) {
+        TensorCopySync(*value_tensor, platform::CPUPlace(), &mlu_tensor);
+        tensor_data = mlu_tensor.data();
+      }
+      value = tensor_data[0];
+    }
+
+    auto shape = GetShape(ctx);
+    out_var->mutable_data(shape, ctx.GetPlace());
+    MLUCnnlTensorDesc output_desc(*out_var, CNNL_LAYOUT_ARRAY,
+                                  ToCnnlDataType(out_var->type()));
+    MLUCnnl::Fill(ctx, value, output_desc.get(), GetBasePtr(out_var));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_MLU_KERNEL(
+    fill_constant, paddle::operators::FillConstantMLUKernel,
+    paddle::operators::FillConstantMLUKernel,
+    paddle::operators::FillConstantMLUKernel,
+    paddle::operators::FillConstantMLUKernel,
+    paddle::operators::FillConstantMLUKernel,
+    paddle::operators::FillConstantMLUKernel,
+    paddle::operators::FillConstantMLUKernel);
diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h
index fa0cab04168d1e3ea48fc3cf7397e976a39eac2a..1402f3404fd6de57a244703305db7361879f7bf7 100644
--- a/paddle/fluid/operators/filter_by_instag_op.h
+++ b/paddle/fluid/operators/filter_by_instag_op.h
@@ -29,7 +29,7 @@ namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
-using SelectedRows = framework::SelectedRows;
+using SelectedRows = pten::SelectedRows;
 using LoDTensor = framework::LoDTensor;
 
 template
diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
index 4e4322947a8571284202a0fb89af8b167b1a58b9..fc782dc55117519494cb8d527672b01e5654f384 100644
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
@@ -30,7 +30,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
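The special cases in FillConstantMLUKernel above exist because stream extraction does not reliably parse the tokens "inf", "-inf", or "nan", so they must be matched before falling back to numeric extraction. A minimal standalone sketch of the same scheme (ParseConstantValue is a hypothetical helper, not part of the patch):

```cpp
#include <cstdint>
#include <limits>
#include <sstream>
#include <string>
#include <type_traits>

// Hypothetical helper mirroring the str_value parsing in the MLU kernel above.
template <typename T>
T ParseConstantValue(const std::string &str_value, float float_value) {
  if (str_value.empty()) return static_cast<T>(float_value);
  // NaN/Inf cannot be read back through operator>>, so match them first.
  if (str_value == "inf")
    return static_cast<T>(std::numeric_limits<double>::infinity());
  if (str_value == "-inf")
    return static_cast<T>(-std::numeric_limits<double>::infinity());
  if (str_value == "nan")
    return static_cast<T>(std::numeric_limits<double>::quiet_NaN());
  std::stringstream convert_stream(str_value);
  if (std::is_same<int64_t, T>::value) {
    int64_t tmp_value;  // integral attributes round-trip through int64_t
    convert_stream >> tmp_value;
    return static_cast<T>(tmp_value);
  }
  double tmp_value;  // floating attributes round-trip through double
  convert_stream >> tmp_value;
  return static_cast<T>(tmp_value);
}
```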
-using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using DDim = framework::DDim; constexpr int64_t kNoPadding = -1; @@ -200,8 +200,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { DDim table_dim; if (table_var->IsType()) { table_dim = context.Input("W")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("W"); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); table_dim = table_t->value().dims(); } else { PADDLE_THROW(platform::errors::PermissionDenied( @@ -215,7 +215,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { if (is_sparse) { auto *ids = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); + auto *d_table = + context.Output(framework::GradVarName("W")); // runtime shape d_table->set_height(table_dim[0]); diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index c6205863103ff99e3d850c5acc739a400cdb5696..babf1c657f232d8316df924487a925c6b6162cf9 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -19,6 +19,8 @@ limitations under the License. */ namespace paddle { namespace operators { +#define LN_NUM_COLS 1024 + template using CudnnDataType = platform::CudnnDataType; template @@ -153,6 +155,191 @@ __global__ void FusedLayernormResidualDropoutBias( invvar); } +/* +* @brief layernorm(residual + dropout(x)); + * Conditions: + * (1) The number of cols is 1024; + * (2) layer_norm scale and bias is not null; + * (3) linear bias is null; + * @param + * rows: batch_size * seq_len + * cols: 1024 + * x_: [rows, cols], inputs + * residual_:[rows, cols] + * gamma_: [cols]: layernorm scale, not null + * beta_: [cols], layernorm bias, not null + * mask_out_: [rows, cols], dropout result + * residual_out_: [rows, cols], residual + dropout(src) + * y_: [rows, cols], layernorm result + * mean_out_: [rows]: layernorm means + * var_out_: [rows]: layernorm vars +*/ +template < + typename T, typename U, typename ScaleT = U, typename MaskType = uint8_t, + int VecSize = 8, int WARPS_M = 4, int WARPS_N = 1, int BYTES_PER_LDG = 16, + int ELTS_PER_ROW = 1024, int THREADS_PER_WARP = 32, + int THREADS_PER_ROW = WARPS_N *THREADS_PER_WARP, + int THREADS_PER_CTA = WARPS_M *THREADS_PER_ROW, int ROWS_PER_CTA = WARPS_M, + int ELTS_PER_ROW_PER_CTA = THREADS_PER_ROW *VecSize, + int LDGS = ELTS_PER_ROW / ELTS_PER_ROW_PER_CTA> +__global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( + int rows, int cols, uint64_t seed, const float dropout_prob, + const bool is_upscale_in_train, const bool is_test, + const uint64_t increment, const float epsilon, const T *__restrict__ x_ptr, + const T *__restrict__ residual_ptr, const ScaleT *__restrict__ gamma_ptr, + const ScaleT *__restrict__ beta_ptr, MaskType *__restrict__ mask_out_ptr, + U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, + T *__restrict__ residual_out_ptr, T *__restrict__ y_ptr) { + using Vec = platform::AlignedVector; + using Vec_scale = platform::AlignedVector; + using MaskStoreT = platform::AlignedVector; + + const int tidx = threadIdx.x; + const int bidx = blockIdx.x; + const int lane = tidx % THREADS_PER_WARP; // 0, 1, ..., 31 + const int warp = tidx / THREADS_PER_WARP; // 0, 1, 2, 3 + const int warp_n = 
warp % WARPS_N; // 0 + const int warp_m = warp / WARPS_N; // 0, 1, 2, 3 + + const int c = warp_n * THREADS_PER_WARP + lane; // lane + const int r = bidx * ROWS_PER_CTA + warp_m; // row id + + int idx = r * LN_NUM_COLS + c; + curandStatePhilox4_32_10_t state; + curand_init(seed, idx, increment, &state); + + T factor = GetFactor(dropout_prob, is_upscale_in_train, is_test); + + Vec_scale gamma[LDGS]; + Vec_scale beta[LDGS]; +#pragma unroll + for (int it = 0, col = c; it < LDGS; it++) { + platform::Load(gamma_ptr + col * VecSize, &gamma[it]); + platform::Load(beta_ptr + col * VecSize, &beta[it]); + col += THREADS_PER_ROW; + } + + constexpr U rn = 1.f / U(LN_NUM_COLS); + for (int row = r; row < rows; row += gridDim.x * ROWS_PER_CTA) { + Vec x[LDGS]; + Vec residual[LDGS]; +#pragma unroll + for (int it = 0, col = c; it < LDGS; it++) { + platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, + &x[it]); + platform::Load( + residual_ptr + row * LN_NUM_COLS + col * VecSize, &residual[it]); + col += THREADS_PER_ROW; + } + + MaskStoreT mask_vec[LDGS]; + if (!is_test) { +#pragma unroll + for (int it = 0; it < LDGS; it++) { + float rand[VecSize]; + RandVec(&state, rand); +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { +#pragma unroll + mask_vec[it][jt] = static_cast(rand[jt] >= dropout_prob); + } + } + } else { +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + mask_vec[it][jt] = static_cast(1); + } + } + } + + // 4 * 8 + U xf[LDGS * VecSize]; +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + // dropout(x) + residual + x[it][jt] = x[it][jt] * static_cast(mask_vec[it][jt]) * factor + + residual[it][jt]; + xf[it * VecSize + jt] = U(x[it][jt]); + } + } + +// store dropout_residual_out and mask_out +#pragma unroll + for (int it = 0, col = c; it < LDGS; it++) { + platform::Store( + x[it], residual_out_ptr + row * LN_NUM_COLS + col * VecSize); + platform::Store( + mask_vec[it], mask_out_ptr + row * LN_NUM_COLS + col * VecSize); + col += THREADS_PER_ROW; + } + + U mu_local = 0.f; +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + mu_local += xf[it * VecSize + jt]; + } + } + +#pragma unroll + for (int it = 1; it < THREADS_PER_WARP; it *= 2) { + mu_local += __shfl_xor_sync(uint32_t(-1), mu_local, it); + } + mu_local *= rn; + if (lane == 0) { + mean_out_ptr[row] = mu_local; + } + U var_local = 0.f; + +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + U diff = xf[it * VecSize + jt] - mu_local; + var_local += diff * diff; + } + } + +#pragma unroll + for (int it = 1; it < THREADS_PER_WARP; it *= 2) { + var_local += __shfl_xor_sync(uint32_t(-1), var_local, it); + } + U rsigma = rsqrtf(var_local * rn + epsilon); + if (lane == 0) { + // Note: the stored var is different for paddle(ln) and apex (fast ln). 
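+    // The two __shfl_xor_sync loops above are warp-level butterfly
+    // reductions: after log2(32) = 5 exchange rounds every lane holds the
+    // full 32-lane sum, so lane 0 can publish the row statistics.
+    // Worked example (values assumed): with var_local * rn = 0.25 and
+    // epsilon = 1e-5, apex's fast ln would store rsigma = rsqrtf(0.25 +
+    // 1e-5) ~= 2.0, while the store below keeps the biased variance 0.25.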
+ // var_out_ptr[row] = rsigma; + var_out_ptr[row] = var_local * rn; + } + +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + // use fp16 to compute + // ScaleT tmp = static_cast(rsigma * (xf[it * VecSize + jt] - + // mu_local)); + // x[it][jt] = gamma[it][jt] * tmp + beta[it][jt]; + // cast to fp32 to compute + U tmp = rsigma * (static_cast(xf[it * VecSize + jt]) - mu_local); + x[it][jt] = static_cast(static_cast(gamma[it][jt]) * tmp + + static_cast(beta[it][jt])); + } + } + +#pragma unroll + for (int it = 0, col = c; it < LDGS; it++) { + platform::Store(x[it], + y_ptr + row * LN_NUM_COLS + col * VecSize); + col += THREADS_PER_ROW; + } + } +} + /** * @brief layernorm(residual + dropout(src + bias)); * @param @@ -205,6 +392,13 @@ void LaunchLayernormResidualDropoutBias( return; } + bool can_call_1024_kernel = false; + if (cols == 1024 && scale != nullptr && layernorm_bias != nullptr && + bias == nullptr) { + can_call_1024_kernel = true; + } + VLOG(6) << "can_call_1024_kernel = " << can_call_1024_kernel; + const int VecSize = MAX_CACHE_BYTES / sizeof(T); if (cols % VecSize != 0) { int blockDim = GetDesiredBlockDim(cols); @@ -215,13 +409,35 @@ void LaunchLayernormResidualDropoutBias( epsilon, src, residual, bias, scale, layernorm_bias, mask_data, dst, layernorm_dst, mean, var); } else { - int blockDim = GetDesiredBlockDim(cols / VecSize); - FusedLayernormResidualDropoutBias< - T, uint8_t, VecSize, U, - ScaleBiasWithSameTypeX><<>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, increment, - epsilon, src, residual, bias, scale, layernorm_bias, mask_data, dst, - layernorm_dst, mean, var); + if (can_call_1024_kernel) { + const int WARPS_M = 4; + const int WARPS_N = 1; + const int THREADS_PER_WARP = 32; + const int BYTES_PER_LDG = 16; + const int VecSize = BYTES_PER_LDG / sizeof(T); + + const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; + const int ROWS_PER_CTA = WARPS_M; + + // Note: the grid can not exceed max_grid of the gpu. 
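+      // Worked example (shapes assumed): rows = batch_size * seq_len = 512
+      // and ROWS_PER_CTA = WARPS_M = 4 give grid = ceil(512 / 4) = 128 CTAs;
+      // for T = fp16, VecSize = BYTES_PER_LDG / sizeof(T) = 16 / 2 = 8, so
+      // each thread issues LDGS = 1024 / (THREADS_PER_ROW * VecSize) = 4
+      // vector loads per row.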
+ const int grid = + static_cast(std::ceil(rows / static_cast(ROWS_PER_CTA))); + fused_ln_fwd_1024_kernel< + T, U, LayerNormScaleBiasT, uint8_t, + VecSize, WARPS_M, WARPS_N, + BYTES_PER_LDG><<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, + increment, epsilon, src, residual, scale, layernorm_bias, mask_data, + mean, var, dst, layernorm_dst); + } else { + int blockDim = GetDesiredBlockDim(cols / VecSize); + FusedLayernormResidualDropoutBias< + T, uint8_t, VecSize, U, + ScaleBiasWithSameTypeX><<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, + increment, epsilon, src, residual, bias, scale, layernorm_bias, + mask_data, dst, layernorm_dst, mean, var); + } } } diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index 57d3fc94dc88a0699b103c081642757798719332..cc14d0680d381ff2bbe73ee712e218c9c4d79185 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -66,12 +66,10 @@ struct TestFusedLayernormResidualDropoutBias { ctx = reinterpret_cast(devicectx); } - TestFusedLayernormResidualDropoutBias(int _rows, int _cols, - uint64_t _seed = 0, - float _dropout_prob = 0.0, - float _epsilon = 0.00001f, - bool _is_upscale_in_train = false, - bool _is_test = false) { + TestFusedLayernormResidualDropoutBias( + int _rows, int _cols, uint64_t _seed = 0, float _dropout_prob = 0.0, + float _epsilon = 0.00001f, bool _is_upscale_in_train = false, + bool _is_test = false, bool _has_bias = true) { rows = _rows; cols = _cols; seed = _seed; @@ -79,7 +77,7 @@ struct TestFusedLayernormResidualDropoutBias { epsilon = _epsilon; is_upscale_in_train = _is_upscale_in_train; is_test = _is_test; - has_bias = true; + has_bias = _has_bias; has_scale = true; has_layernorm_bias = true; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); @@ -283,7 +281,6 @@ static void BaseTest(const bool is_fp16 = false) { } } } - TEST(FusedDropout, GPUFusedLayernormResidualDropoutBias) { BaseTest(); } TEST(FusedDropout, GPUFusedLayernormResidualDropoutBiasDouble) { @@ -330,3 +327,12 @@ TEST(FusedDropout, GPUFusedLayernormResidualDropoutLargeShape) { test.Run(); test.CheckOut(static_cast(1e-4)); } + +TEST(FusedDropout, GPUFusedLayernormResidualDropoutFp16MLperf) { + const int rows = 512; + const int cols = 1024; + TestFusedLayernormResidualDropoutBias test( + rows, cols, 0, 0, 0.00001f, false, false, false); + test.Run(); + test.CheckOut(static_cast(1e-2)); +} diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc index 8ce7df7eec15ead25ffb590454dd11228ffdadfc..67c265c97e46160fd824db1a8201b917d3414260 100644 --- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -57,7 +57,7 @@ class GetTensorFromSelectedRowsOp : public framework::OperatorWithKernel { class GetTensorFromSelectedRowsKernel { public: void operator()(const framework::ExecutionContext &ctx) const { - auto *x = ctx.Input("X"); + auto *x = ctx.Input("X"); auto *out = ctx.Output("Out"); out->Resize(x->value().dims()); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index a6f5fb017a752ee15fe70a3b57d0dabce3854f50..17734b9c542c830b9aab3498cabac5a8a1c8beca 100644 --- 
a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -204,7 +204,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel {
           "Custom tree must be set for sparse mode!"));
     framework::Vector real_rows = PathToRows(*path);
     auto* w_grad =
-        ctx.Output(framework::GradVarName("W"));
+        ctx.Output(framework::GradVarName("W"));
     w_grad->set_rows(real_rows);
     // Build a map of id -> row_index to speed up finding the index of one id
     w_grad->set_height(w.dims()[0]);
diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu
index 3db0fdf5e6da4e7b5ed7f0a8dbc2b96b7265cd83..72dd0fc743247116e7b9060676955dbd0ba31c76 100644
--- a/paddle/fluid/operators/interpolate_v2_op.cu
+++ b/paddle/fluid/operators/interpolate_v2_op.cu
@@ -16,39 +16,121 @@
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/fluid/platform/fast_divmod.h"
 
 namespace paddle {
 namespace operators {
 
 using framework::Tensor;
+using platform::FastDivMod;
 using DataLayout = framework::DataLayout;
 
+static inline int GetLastPow2(int n) {
+  n |= (n >> 1);
+  n |= (n >> 2);
+  n |= (n >> 4);
+  n |= (n >> 8);
+  n |= (n >> 16);
+  return std::max(1, n - (n >> 1));
+}
+
+inline platform::GpuLaunchConfig GetGpuLaunchConfig3D(
+    const platform::CUDADeviceContext& context, int num_img, int height,
+    int width) {
+  const int kThreadsPerBlock = 256;
+  int max_threads_per_block = context.GetMaxThreadsPerBlock();  // 1024
+  int max_threads = std::min(kThreadsPerBlock, max_threads_per_block);
+
+  int block_x = std::min(GetLastPow2(width), max_threads);
+  int block_y = std::min(GetLastPow2(height), max_threads / block_x);
+  int block_z = std::min(num_img, max_threads / block_x / block_y);
+
+  dim3 max_grid_dim = context.GetCUDAMaxGridDimSize();
+  int grid_x = std::min(max_grid_dim.x, platform::DivUp(width, block_x));
+  int grid_y = std::min(max_grid_dim.y, platform::DivUp(height, block_y));
+  int grid_z =
+      std::min(max_grid_dim.z, platform::DivUp(num_img, block_z * 4));
+
+  const int capability = context.GetComputeCapability();
+  platform::GpuLaunchConfig config;
+  config.compute_capability = capability;
+  config.thread_per_block = dim3(block_x, block_y, block_z);
+  config.block_per_grid = dim3(grid_x, grid_y, grid_z);
+  return config;
+}
+
+struct FastDivModForInterpolate {
+ public:
+  FastDivMod channels_div;
+  FastDivMod output_w_div;
+  FastDivMod output_wc_div;
+
+  explicit HOSTDEVICE FastDivModForInterpolate(const int channels,
+                                               const int output_w,
+                                               const int output_wc)
+      : channels_div(FastDivMod(channels)),
+        output_w_div(FastDivMod(output_w)),
+        output_wc_div(FastDivMod(output_wc)) {}
+};
+
+template
+__global__ void KeNearestNeighborInterpNCHWFw(
+    const T* in, const size_t in_img_h, const size_t in_img_w, T* out,
+    const size_t out_img_h, const size_t out_img_w, const size_t nc,
+    const float ratio_h, const float ratio_w, const bool align_corners) {
+  int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y;
+  int nc_id = threadIdx.z + blockIdx.z * blockDim.z;
+  int nc_stride = blockDim.z * gridDim.z;
+
+  // nearest sampling: map each output (x, y) to an input pixel, then walk
+  // the NC dimension, reading from in and writing to out
+  int in_img_idx = (align_corners)
+                       ? static_cast(ratio_w * out_img_idx + 0.5)
+                       : static_cast(ratio_w * out_img_idx);
+  int in_img_idy = (align_corners)
+                       ? 
static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + + int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; + int in_index_stride = nc_stride * in_img_h * in_img_w; + + int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; + int out_index_stride = nc_stride * out_img_h * out_img_w; + + // prevent from multiple threads writing + if (out_img_idx < out_img_w && out_img_idy < out_img_h) { + while (nc_id < nc) { + out[out_index] = in[in_index]; + in_index += in_index_stride; + out_index += out_index_stride; + nc_id += nc_stride; + } + } +} + template __global__ void KeNearestNeighborInterpFw( const T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners, const DataLayout data_layout) { + const bool align_corners, FastDivModForInterpolate divmods) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; + int in_img_size = in_img_h * in_img_w; + int out_img_size = out_img_h * out_img_w; + for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; - int channel_id, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idy = (out_id_w % out_img_size) / out_img_w; - out_img_idx = tid % out_img_w; - } else { - out_img_idy = out_id_w / (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; int in_img_idy = (align_corners) ? static_cast(ratio_h * out_img_idy + 0.5) @@ -57,13 +139,8 @@ __global__ void KeNearestNeighborInterpFw( ? 
static_cast(ratio_w * out_img_idx + 0.5) : static_cast(ratio_w * out_img_idx); - if (data_layout == DataLayout::kNCHW) { - out[tid] = in[out_id_h * input_w + channel_id * in_img_size + - in_img_idy * in_img_w + in_img_idx]; - } else { - out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; - } + out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; } } @@ -1292,11 +1369,25 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("nearest" == interp_method) { - KeNearestNeighborInterpFw< - T><<>>( - input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + if (data_layout == DataLayout::kNCHW) { + // get launch 3D config + int nc = n * c; + platform::GpuLaunchConfig config_3d = + GetGpuLaunchConfig3D(ctx.cuda_device_context(), nc, out_h, out_w); + KeNearestNeighborInterpNCHWFw< + T><<>>( + input_data, in_h, in_w, output_data, out_h, out_w, nc, ratio_h, + ratio_w, align_corners); + } else { + int64_t cw = c * out_w; + auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); + KeNearestNeighborInterpFw< + T><<>>( + input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_corners, interp_divmods); + } } else if ("bilinear" == interp_method) { dim3 thread_num = config.thread_per_block; #ifdef WITH_NV_JETSON diff --git a/paddle/fluid/operators/ipu/CMakeLists.txt b/paddle/fluid/operators/ipu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..66373d4b5f6b91914e9bb1f3ed5b7fdd5dec37ea --- /dev/null +++ b/paddle/fluid/operators/ipu/CMakeLists.txt @@ -0,0 +1,3 @@ +if(WITH_IPU) + op_library(ipu_runtime_op DEPS ipu_backend) +endif(WITH_IPU) diff --git a/paddle/fluid/operators/ipu_runtime_op.h b/paddle/fluid/operators/ipu/ipu_runtime_op.cc similarity index 55% rename from paddle/fluid/operators/ipu_runtime_op.h rename to paddle/fluid/operators/ipu/ipu_runtime_op.cc index b6fc9ae98895d40d2e2d1c9eb02a63d200b0b1f8..3b6982d4b2b8e3fe29587e2e6cbbc16107326f78 100644 --- a/paddle/fluid/operators/ipu_runtime_op.h +++ b/paddle/fluid/operators/ipu/ipu_runtime_op.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,32 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. 
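Returning to the interpolate_v2 change above: the NHWC fast path replaces per-element `/` and `%` with precomputed FastDivMod divisions. A rough sketch of the index decomposition it performs, written with plain division for clarity (FastDivMod's magic-number internals are omitted, and DecomposeNHWC is an illustrative name):

```cpp
// How a flat output index splits into (n, h, w, c) for NHWC interpolation.
// Plain '/' and '%' stand in for FastDivMod::Divmod, which precomputes a
// magic multiplier so the GPU kernel avoids hardware integer division.
struct NHWCIndex {
  int n, h, w, c;
};

inline NHWCIndex DecomposeNHWC(int tid, int channels, int out_w, int out_chw) {
  NHWCIndex idx;
  idx.n = tid / out_chw;         // image id (out_id_h in the kernel)
  int out_id_w = tid % out_chw;  // offset inside one image (h * w * c)
  idx.c = tid % channels;        // channel is the fastest-moving axis
  const int wc = out_w * channels;
  idx.h = out_id_w / wc;               // row inside the image
  idx.w = (out_id_w % wc) / channels;  // column inside the row
  return idx;
}
```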
-#pragma once -#include -#include +#ifdef PADDLE_WITH_IPU #include "paddle/fluid/framework/op_registry.h" -#ifdef PADDLE_WITH_IPU -#include "paddle/fluid/framework/ipu/ipu_backend.h" -#include "paddle/fluid/framework/tensor.h" -#endif +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" namespace paddle { namespace operators { -template -class IpuRuntimeKernel : public framework::OpKernel { +class IpuRuntimeOp : public framework::OperatorBase { public: - void Compute(const framework::ExecutionContext& ctx) const override { -#ifdef PADDLE_WITH_IPU - auto ipu_backend = framework::ipu::IpuBackend::GetInstance(); - if (!ipu_backend->DeviceIsAttached()) { - const platform::IPUDeviceContext& ipu_ctx = - reinterpret_cast( - ctx.device_context()); - ipu_backend->AttachDevice(ipu_ctx.DeviceId()); - } + IpuRuntimeOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const { + auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); + auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); + framework::RuntimeContext runtime_ctx(inputs_, outputs_, scope); + framework::ExecutionContext ctx(*this, scope, *dev_ctx, runtime_ctx); auto inputs = ctx.MultiInput("FeedList"); auto outputs = ctx.MultiOutput("FetchList"); auto output_names = ctx.OutputNames("FetchList"); @@ -58,12 +55,24 @@ class IpuRuntimeKernel : public framework::OpKernel { << "(" << dim << ")"; } } -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Please compile WITH_IPU option to enable ipu_runtime op")); -#endif + } +}; + +class IpuRuntimeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("FeedList", "FeedList of Graph").AsDuplicable(); + AddOutput("FetchList", "FetchList of Graph").AsDuplicable(); + AddComment(R"DOC( +Run graph by PopART runtime. +)DOC"); } }; } // namespace operators } // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(ipu_runtime, ops::IpuRuntimeOp, ops::IpuRuntimeOpMaker); + +#endif // PADDLE_WITH_IPU diff --git a/paddle/fluid/operators/ipu_runtime_op.cc b/paddle/fluid/operators/ipu_runtime_op.cc deleted file mode 100644 index 4b473da00f3318135f194dd90151fbfb39315fee..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/ipu_runtime_op.cc +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
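A note on the design choice above: the old kernel-based op carried a `dtype` attribute purely so GetExpectedKernelType could pick a kernel, while the rewritten op derives from OperatorBase and builds its own ExecutionContext, since the IPU executes a whole compiled graph rather than one per-dtype kernel. The shape of the pattern, abbreviated (GraphRuntimeOp is an illustrative name, not the patch's class):

```cpp
// Abbreviated outline of the OperatorBase pattern used by the new
// IpuRuntimeOp above; the body that hands FeedList/FetchList to the
// backend runtime is elided.
class GraphRuntimeOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;  // inherit the 4-arg ctor

 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& place) const override {
    auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
    framework::RuntimeContext runtime_ctx(inputs_, outputs_, scope);
    framework::ExecutionContext ctx(*this, scope, *dev_ctx, runtime_ctx);
    // ... feed ctx's inputs to the graph runtime, then fetch its outputs ...
  }
};
```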
- -#include "paddle/fluid/operators/ipu_runtime_op.h" - -namespace paddle { -namespace operators { - -class IpuRuntimeOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override {} - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::proto::VarType::Type(ctx.Attr("dtype")), - ctx.device_context()); - } -}; - -class IpuRuntimeOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("FeedList", "FeedList of Graph").AsDuplicable(); - AddOutput("FetchList", "FetchList of Graph").AsDuplicable(); - AddAttr("dtype", - "(int, default 5 (FP32)) " - "Output data type") - .SetDefault(framework::proto::VarType::FP32); - AddComment(R"DOC( -Run graph by PopART runtime. - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(ipu_runtime, ops::IpuRuntimeOp, ops::IpuRuntimeOpMaker); - -REGISTER_OP_IPU_KERNEL(ipu_runtime, ops::IpuRuntimeKernel, - ops::IpuRuntimeKernel, - ops::IpuRuntimeKernel, - ops::IpuRuntimeKernel, - ops::IpuRuntimeKernel, - ops::IpuRuntimeKernel, - ops::IpuRuntimeKernel); diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 753b34484e41128f4c38332f4c0dd077fd42776b..c4bc3a7fda154f42a07e10b453bba70afa41c629 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -55,8 +55,8 @@ class OverflowOp : public framework::OperatorWithKernel { auto *x_var = ctx.InputVar("X"); if (x_var->IsType()) { dtype = x_var->Get().type(); - } else if (x_var->IsType()) { - dtype = x_var->Get().value().type(); + } else if (x_var->IsType()) { + dtype = x_var->Get().value().type(); } else { PADDLE_ENFORCE_EQ( true, false, diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index 99db1c7e081dade476e0012275071719d4281b78..abed0e6903dd39d2b3447455f8982e3df24e73fd 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -58,8 +58,8 @@ class OverflowKernel : public framework::OpKernel { if (x->IsType()) { auto* in = ctx.Input("X"); functor(*in, out); - } else if (x->IsType()) { - auto& in = ctx.Input("X")->value(); + } else if (x->IsType()) { + auto& in = ctx.Input("X")->value(); functor(in, out); } else { PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index 316197ac23c850fde85b65659fe988e61d4b2173..3b48a41ed4f75eba33788f3139a3ff5ae85e300d 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -62,8 +62,8 @@ class OverflowV2Op : public framework::OperatorWithKernel { auto *x_var = ctx.InputVar("X"); if (x_var->IsType()) { dtype = x_var->Get().type(); - } else if (x_var->IsType()) { - dtype = x_var->Get().value().type(); + } else if (x_var->IsType()) { + dtype = x_var->Get().value().type(); } else { PADDLE_THROW(plat::errors::InvalidArgument( "Cannot find the input data type by all input data")); diff --git a/paddle/fluid/operators/kernel_primitives/functor_primitives.h b/paddle/fluid/operators/kernel_primitives/functor_primitives.h index 03610d4589058e074f64940741df34bd8f66e379..15bb01a865d402f8da3fb7ed4178548c8da46b40 100644 --- a/paddle/fluid/operators/kernel_primitives/functor_primitives.h +++ 
b/paddle/fluid/operators/kernel_primitives/functor_primitives.h @@ -13,241 +13,10 @@ // limitations under the License. #pragma once - -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/pten/kernels/funcs/eigen/extensions.h" +#include "paddle/pten/kernels/primitive/functor_primitives.h" namespace paddle { namespace operators { -namespace kernel_primitives { -namespace details { - -static __device__ __forceinline__ platform::float16 Exp(platform::float16 x) { - return ::Eigen::numext::exp(x); -} - -static __device__ __forceinline__ float Exp(float x) { return expf(x); } - -static __device__ __forceinline__ double Exp(double x) { return exp(x); } - -static __device__ __forceinline__ platform::float16 Log(platform::float16 x) { - return ::Eigen::numext::log(x); -} - -static __device__ __forceinline__ float Log(float x) { return logf(x); } - -static __device__ __forceinline__ double Log(double x) { return log(x); } - -} // namespace details - -/******************************** Unary Functor *******************************/ - -/** - * @brief Default unary exp functor - */ -template -struct ExpFunctor { - HOSTDEVICE inline ExpFunctor() {} - - HOSTDEVICE explicit inline ExpFunctor(int n) {} - - HOSTDEVICE inline Ty operator()(const Tx x) const { - return static_cast(details::Exp(x)); - } -}; - -/** - * @brief Default unary identity functor - */ -template -struct IdentityFunctor { - HOSTDEVICE inline IdentityFunctor() {} - - HOSTDEVICE explicit inline IdentityFunctor(int n) {} - - HOSTDEVICE inline Ty operator()(const Tx x) const { - return static_cast(x); - } -}; - -/** - * @brief Default unary div functor. Divide by a constant - */ -template -struct DivideFunctor { - private: - using MPType = typename ::paddle::operators::details::MPTypeTrait::Type; - - public: - HOSTDEVICE inline DivideFunctor() { n_inv = static_cast(1.0f); } - - HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((MPType)(1.0 / n)) {} - - HOSTDEVICE inline Ty operator()(const Tx x) const { - return static_cast(static_cast(x) * n_inv); - } - - private: - MPType n_inv; -}; - -/** - * @brief Default inverse functor - */ -template -struct InverseFunctor { - HOSTDEVICE inline InverseFunctor() {} - - HOSTDEVICE explicit inline InverseFunctor(int n) {} - - HOSTDEVICE inline Ty operator()(const Tx x) const { - return static_cast(-x); - } -}; - -/** - * @brief Default unary square functor - */ -template -struct SquareFunctor { - HOSTDEVICE inline SquareFunctor() {} - - HOSTDEVICE explicit inline SquareFunctor(int n) {} - - HOSTDEVICE inline Ty operator()(const Tx x) const { - return static_cast(x) * static_cast(x); - } -}; - -/****************************** Binary Functor ********************************/ - -/** - * @brief Default binary min functor - */ -template -struct MinFunctor { - inline T initial() { return static_cast(std::numeric_limits::max()); } - - __device__ __forceinline__ T operator()(const T a, const T b) const { - return (b < a) ? b : a; - } -}; - -/** - * @brief Default binary max functor - */ -template -struct MaxFunctor { - inline T initial() { - return static_cast(std::numeric_limits::lowest()); - } - - __device__ __forceinline__ T operator()(const T a, const T b) const { - return (b > a) ? 
b : a; - } -}; - -/** - * @brief Default binary add functor - */ -template -struct AddFunctor { - inline T initial() { return static_cast(0.0f); } - - __device__ __forceinline__ T operator()(const T a, const T b) const { - return b + a; - } -}; - -/** - * @brief Default binary add functor - */ -template -struct MulFunctor { - inline T initial() { return static_cast(1.0f); } - - __device__ __forceinline__ T operator()(const T a, const T b) const { - return b * a; - } -}; - -/** - * @brief Default binary logic or functor - */ -template -struct LogicalOrFunctor { - inline T initial() { return static_cast(false); } - - __device__ __forceinline__ T operator()(const T a, const T b) const { - return b || a; - } -}; - -/** - * @brief Default binary logic and functor - */ -template -struct LogicalAndFunctor { - inline T initial() { return static_cast(true); } - - __device__ __forceinline__ T operator()(const T a, const T b) const { - return b && a; - } -}; - -/** - * @brief Default binary sub functor - */ -template -struct SubFunctor { - inline T initial() { return static_cast(0.0f); } - - inline HOSTDEVICE T operator()(const T a, const T b) const { return a - b; } -}; - -/** - * @brief Default binary div functor - */ -template -struct DivFunctor { - inline T initial() { return static_cast(1.0f); } - - inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } -}; - -template -struct DivFunctor::value>::type> { - inline T initial() { return static_cast(1.0f); } - - inline HOSTDEVICE T operator()(const T a, const T b) const { - // For int32/int64, need to check whether the divison is zero. - PADDLE_ENFORCE_NE(b, 0, - platform::errors::InvalidArgument( - "Integer division by zero encountered " - "in (floor) divide. Please check the input value.")); - return a / b; - } -}; - -/** - * @brief Default binary floor divide functor - */ -template -struct FloorDivFunctor { - inline T initial() { return static_cast(1.0f); } - - inline HOSTDEVICE T operator()(const T a, const T b) const { - PADDLE_ENFORCE_NE(b, 0, - platform::errors::InvalidArgument( - "Integer division by zero encountered " - "in (floor) divide. Please check the input value.")); - return static_cast(std::trunc(a / b)); - } -}; - -} // namespace kernel_primitives +namespace kernel_primitives = pten::kps; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h index 558f8c81c66428ca0561806b8021f09261e32e3b..4ec3741bc91bb58a183ee9a2ff106461c6d71d05 100644 --- a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h @@ -13,61 +13,10 @@ // limitations under the License. 
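The one-line replacement above keeps every existing call site compiling: a namespace alias makes `paddle::operators::kernel_primitives::Foo` resolve to `pten::kps::Foo`. A self-contained illustration of the mechanism (the functor below is a stand-in, not the real pten code):

```cpp
#include <cstdio>

namespace pten {
namespace kps {
// Stand-in for one of the functors that moved to pten.
template <typename T>
struct IdentityFunctor {
  T operator()(const T x) const { return x; }
};
}  // namespace kps
}  // namespace pten

namespace paddle {
namespace operators {
// The old spelling keeps working after the migration.
namespace kernel_primitives = pten::kps;
}  // namespace operators
}  // namespace paddle

int main() {
  paddle::operators::kernel_primitives::IdentityFunctor<int> f;
  std::printf("%d\n", f(42));  // prints 42
}
```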
#pragma once -#include "paddle/fluid/operators/kernel_primitives/helper_primitives.h" -#ifdef PADDLE_WITH_XPU2 -#include "paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h" -#include "paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h" -#include "paddle/fluid/operators/kernel_primitives/functor_primitives_xpu2.h" - -#define KPStream XPUStream -#define KPDevice paddle::platform::XPUDeviceContext -#define _ptr_ _global_ptr_ -#define __forceinline__ __inline__ -#define __restrict__ - -#define THREAD_ID_X core_id() -#define THREAD_ID_Y 0 -#define THREAD_ID_Z 0 - -#define BLOCK_NUM_X core_num() -#define BLOCK_NUM_Y 0 -#define BLOCK_NUM_Z 0 - -#define BLOCK_ID_X cluster_id() -#define BLOCK_ID_Y 0 -#define BLOCK_ID_Z 0 - -#define GRID_NUM_X cluster_num() -#define GRID_NUM_Y 0 -#define GRID_NUM_Z 0 -#else -#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h" -#include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h" -#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h" - -#define KPStream gpuStream_t -#define KPDevice paddle::platform::CUDADeviceContext -#define _ptr_ - -#define THREAD_ID_X threadIdx.x -#define THREAD_ID_Y threadIdx.y -#define THREAD_ID_Z threadIdx.z - -#define BLOCK_NUM_X blockDim.x -#define BLOCK_NUM_Y blockDim.y -#define BLOCK_NUM_Z blockDim.z - -#define BLOCK_ID_X blockIdx.x -#define BLOCK_ID_Y blockIdx.y -#define BLOCK_ID_Z blockIdx.z - -#define GRID_NUM_X gridDim.x -#define GRID_NUM_Y gridDim.y -#define GRID_NUM_Z gridDim.z -#endif +#include "paddle/pten/kernels/primitive/kernel_primitives.h" namespace paddle { namespace operators { -namespace kernel_primitives {} +namespace kernel_primitives = pten::kps; } } diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 0c1f58a2f30f68c184906a0cebd78da98a83d952..bc00d875cd1dd37b64ae8a38c6949054bc168c7c 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -23,6 +23,7 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" @@ -35,6 +36,8 @@ using CudnnDataType = platform::CudnnDataType; template using LayerNormParamType = typename CudnnDataType::BatchNormParamType; +#define LN_NUM_COLS 1024 + inline static int GetDesiredBlockDim(int64_t block_dim) { #ifdef __HIPCC__ const int kMaxBlockDim = 256; @@ -169,6 +172,118 @@ __inline__ __device__ half rsqrt_(const half val) { } #endif +#ifdef PADDLE_WITH_CUDA +template +__global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( + int rows, int cols, const float epsilon, const T *__restrict__ x_ptr, + const ScaleT *__restrict__ gamma_ptr, const ScaleT *__restrict__ beta_ptr, + U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, + T *__restrict__ y_ptr) { + using Vec = platform::AlignedVector; + using Vec_scale = platform::AlignedVector; + + const int tidx = threadIdx.x; + const int bidx = blockIdx.x; + const int lane = tidx % THREADS_PER_WARP; // 0, 1, ..., 31 + const int warp = tidx / THREADS_PER_WARP; // 0, 1, 2, 3 + const int warp_n = warp % WARPS_N; // 0 + const int warp_m = warp / WARPS_N; // 0, 1, 2, 3 + + const int c = warp_n * THREADS_PER_WARP + lane; // lane + const int r = bidx * ROWS_PER_CTA + warp_m; // row id + + Vec_scale gamma[LDGS]; + Vec_scale beta[LDGS]; +#pragma 
unroll + for (int it = 0, col = c; it < LDGS; it++) { + platform::Load(gamma_ptr + col * VecSize, &gamma[it]); + platform::Load(beta_ptr + col * VecSize, &beta[it]); + col += THREADS_PER_ROW; + } + + constexpr U rn = 1.f / U(LN_NUM_COLS); + for (int row = r; row < rows; row += gridDim.x * ROWS_PER_CTA) { + Vec x[LDGS]; +#pragma unroll + for (int it = 0, col = c; it < LDGS; it++) { + platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, + &x[it]); + col += THREADS_PER_ROW; + } + U xf[LDGS * VecSize]; + + U mu_local = 0.f; + +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + xf[it * VecSize + jt] = U(x[it][jt]); + mu_local += xf[it * VecSize + jt]; + } + } + +#pragma unroll + for (int it = 1; it < THREADS_PER_WARP; it *= 2) { + mu_local += __shfl_xor_sync(uint32_t(-1), mu_local, it); + } + mu_local *= rn; + if (lane == 0) { + mean_out_ptr[row] = mu_local; + } + U var_local = 0.f; + +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + U diff = xf[it * VecSize + jt] - mu_local; + var_local += diff * diff; + } + } + +#pragma unroll + for (int it = 1; it < THREADS_PER_WARP; it *= 2) { + var_local += __shfl_xor_sync(uint32_t(-1), var_local, it); + } + // Note: to assure if it is right for double + U rsigma = rsqrtf(var_local * rn + epsilon); + if (lane == 0) { + var_out_ptr[row] = var_local * rn; + } + +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + // use fp16 to compute + // ScaleT tmp = static_cast(rsigma * (xf[it * VecSize + jt] - + // mu_local)); + // x[it][jt] = gamma[it][jt] * tmp + beta[it][jt]; + // cast to fp32 to compute + U tmp = (rsigma * (static_cast(xf[it * VecSize + jt]) - mu_local)); + x[it][jt] = static_cast(static_cast(gamma[it][jt]) * tmp + + static_cast(beta[it][jt])); + } + } + +#pragma unroll + for (int it = 0, col = c; it < LDGS; it++) { + platform::Store(x[it], + y_ptr + row * LN_NUM_COLS + col * VecSize); + col += THREADS_PER_ROW; + } + } +} +#endif + template using LayerNormScaleBiasT = typename std::conditional::type; diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index 7725f336416dbb80e0f65a38b6a4f16c88fb799f..ef4f0c6ba7063d4ff39732aed85ab5bbe007e7ca 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -112,11 +112,49 @@ class LayerNormKernel } \ } while (0) - if (is_scale_bias_same_dtype_with_x) { - PADDLE_LAUNCH_LAYERNORM_FWD(T, true); +#ifdef PADDLE_WITH_CUDA + bool can_call_1024_kernel = false; + if (feature_size == 1024 && scale != nullptr && bias != nullptr) { + can_call_1024_kernel = true; + } + if (can_call_1024_kernel) { + const int WARPS_M = 4; + const int WARPS_N = 1; + const int THREADS_PER_WARP = 32; + const int BYTES_PER_LDG = 16; + const int VecSize = BYTES_PER_LDG / sizeof(T); + + const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; + const int ROWS_PER_CTA = WARPS_M; + + const int grid = static_cast( + std::ceil(batch_size / static_cast(ROWS_PER_CTA))); + if (is_scale_bias_same_dtype_with_x) { + ln_fwd_1024_kernel<<>>( + batch_size, feature_size, epsilon, x_data, + static_cast(void_scale_data), + static_cast(void_bias_data), mean_data, var_data, + y_data); + } else { + ln_fwd_1024_kernel<<>>( + batch_size, feature_size, epsilon, x_data, + static_cast(void_scale_data), + static_cast(void_bias_data), mean_data, var_data, + y_data); + } } else { - 
PADDLE_LAUNCH_LAYERNORM_FWD(U, false); +#endif + if (is_scale_bias_same_dtype_with_x) { + PADDLE_LAUNCH_LAYERNORM_FWD(T, true); + } else { + PADDLE_LAUNCH_LAYERNORM_FWD(U, false); + } +#ifdef PADDLE_WITH_CUDA } +#endif + #undef PADDLE_LAUNCH_LAYERNORM_FWD } }; diff --git a/paddle/fluid/operators/load_op.h b/paddle/fluid/operators/load_op.h index 66160695c3d5aa9f7b18ea84156236752e42ae8e..89ad4325a5a534bb246c7017cafab3b96239b463 100644 --- a/paddle/fluid/operators/load_op.h +++ b/paddle/fluid/operators/load_op.h @@ -50,7 +50,7 @@ class LoadOpKernel : public framework::OpKernel { if (out_var->IsType()) { LoadLodTensor(fin, place, out_var, ctx); - } else if (out_var->IsType()) { + } else if (out_var->IsType()) { LoadSelectedRows(fin, place, out_var); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -105,7 +105,7 @@ class LoadOpKernel : public framework::OpKernel { void LoadSelectedRows(std::istream &fin, const platform::Place &place, framework::Variable *var) const { - auto *selectedRows = var->GetMutable(); + auto *selectedRows = var->GetMutable(); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); diff --git a/paddle/fluid/operators/lookup_table_dequant_op.h b/paddle/fluid/operators/lookup_table_dequant_op.h index 70aad1d3238f2f8fe65c9a3e8bedeb1fd0762e1a..475d0922ccc693bab14000c24413c55b626833e1 100644 --- a/paddle/fluid/operators/lookup_table_dequant_op.h +++ b/paddle/fluid/operators/lookup_table_dequant_op.h @@ -29,7 +29,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using DDim = framework::DDim; template diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 89c84d9e14377315659efc1f3b8a5a9d0406b336..7a32e13122852c7c8e4f00cb03a7d0c85e727e05 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -151,7 +151,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto *ids = context.Input("Ids"); auto *table = context.Input("W"); auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); + auto *d_table = + context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index a89d5fb7cb6e5db4546e0ff2e90bf9d722e7cd82..91b7f91c8e3bc5319db28a9585c029230d7a33e8 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -28,7 +28,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using DDim = framework::DDim; constexpr int64_t kNoPadding = -1; @@ -82,8 +82,8 @@ class LookupTableKernel : public framework::OpKernel { } } - } else if (table_var->IsType()) { - const auto &table_t = table_var->Get(); + } else if (table_var->IsType()) { + const auto &table_t = table_var->Get(); int64_t row_width = table_t.value().dims()[1]; const auto *table = table_t.value().data(); auto *output = output_t->mutable_data(context.GetPlace()); @@ -155,8 +155,8 @@ class LookupTableGradKernel : public framework::OpKernel { DDim table_dim; if (table_var->IsType()) { table_dim = 
context.Input("W")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("W"); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); table_dim = table_t->value().dims(); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -171,7 +171,8 @@ class LookupTableGradKernel : public framework::OpKernel { if (is_sparse) { auto *ids = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); + auto *d_table = + context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 44a6151f1b6ce665932a5c9b5f84c9cd2c817ab3..74ad0e4978b4ec6b3aa5553fc0a6202286ea6ffd 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -152,7 +152,8 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel { auto *ids = context.Input("Ids"); auto *table = context.Input("W"); auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); + auto *d_table = + context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h index 54564395c6d04cebd8861c378c5aa34c899ffd7f..6ea9e58198fbffff5729ed7799a38f5dfece4b35 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.h +++ b/paddle/fluid/operators/lookup_table_v2_op.h @@ -29,7 +29,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using DDim = framework::DDim; constexpr int64_t kNoPadding = -1; @@ -86,8 +86,8 @@ class LookupTableV2Kernel : public framework::OpKernel { row_width * sizeof(T)); } } - } else if (table_var->IsType()) { - const auto &table_t = table_var->Get(); + } else if (table_var->IsType()) { + const auto &table_t = table_var->Get(); int64_t row_width = table_t.value().dims()[1]; const auto *table = table_t.value().data(); auto *output = output_t->mutable_data(context.GetPlace()); @@ -132,8 +132,8 @@ class LookupTableV2GradKernel : public framework::OpKernel { DDim table_dim; if (table_var->IsType()) { table_dim = context.Input("W")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("W"); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); table_dim = table_t->value().dims(); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -148,7 +148,8 @@ class LookupTableV2GradKernel : public framework::OpKernel { if (is_sparse) { auto *ids_t = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); + auto *d_table = + context.Output(framework::GradVarName("W")); int64_t ids_num = ids_t->numel(); std::vector ids; diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index f2d1e79f03524a8cd7d20ec9aefb205c5e12bb0b..2672d02db008e7aadd00d79669e4ab07c36011b5 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -29,6 +29,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/kernels/funcs/eigen/common.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -52,6 +53,18 @@ template struct SetConstant>; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + #ifdef PADDLE_WITH_XPU template struct SetConstant; template struct SetConstant; diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 7c50ba630dbd91ef8c6d51cbde862336b5ab83cb..a94bb594be5f9d8c3c5eeb58f524161287dc0607 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -227,11 +227,11 @@ template struct MatrixBitCodeFunctorMulGradWeightSR : public boost::static_visitor { const framework::Tensor &tmat_; - framework::SelectedRows *weight_; + pten::SelectedRows *weight_; const framework::Tensor &input_; MatrixBitCodeFunctorMulGradWeightSR(const framework::Tensor &tmat, - framework::SelectedRows *weight, + pten::SelectedRows *weight, const framework::Tensor &input) : tmat_(tmat), weight_(weight), input_(input) {} @@ -274,7 +274,7 @@ struct MatrixBitCodeFunctorMulGradWeightSR template void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, - framework::SelectedRows *weight, + pten::SelectedRows *weight, const framework::Tensor &input) { MatrixBitCodeFunctorMulGradWeightSR func(tmat, weight, input); code_table_.apply_visitor(func); diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 71d905214ab9f57013bb553179ab6e75116af76d..13ddd27cbf0d7bd1dc1adbd8bfa278827107787a 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -252,8 +252,7 @@ class MatrixBitCodeFunctor { /* For SelectedRows Weight, For index(i, j) >= 0: weight.row(index(i, j)) += tmat(i, j) * input.row(i) */ - void MulGradWeight(const framework::Tensor& tmat, - framework::SelectedRows* weight, + void MulGradWeight(const framework::Tensor& tmat, pten::SelectedRows* weight, const framework::Tensor& input); /* For j < code_length input.row(i) += tmat(i, j) * weight.row(index(i, j)) diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index f6178eb0a1eb6e8a4d1886443ec77b945c3b182f..8cd3e1367d86d9bc31e4b12af8baa25144cd14f2 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -24,9 +24,9 @@ namespace math { template struct SelectedRowsAdd { void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input1, - const framework::SelectedRows& input2, - framework::SelectedRows* output) { + const pten::SelectedRows& input1, + const pten::SelectedRows& input2, + pten::SelectedRows* output) { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ( in1_height, input2.height(), @@ -94,7 +94,7 @@ template struct SelectedRowsAdd; template struct SelectedRowsAddTensor { void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& 
input1, + const pten::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); auto in2_dims = input2.dims(); @@ -154,9 +154,8 @@ template struct SelectedRowsAddTensor; template struct SelectedRowsAddTo { void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input1, - const int64_t input2_offset, - framework::SelectedRows* input2) { + const pten::SelectedRows& input1, const int64_t input2_offset, + pten::SelectedRows* input2) { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ( in1_height, input2->height(), @@ -198,9 +197,9 @@ template struct SelectedRowsAddTo; template struct SelectedRowsSumTo { void operator()(const platform::CPUDeviceContext& context, - const std::vector& input1, + const std::vector& input1, const std::vector& input2_offsets, - framework::SelectedRows* input2) { + pten::SelectedRows* input2) { // Ensure all selected rows have the same height size_t size = 0u; for (auto iter = input1.begin(); iter != input1.end(); ++iter) { @@ -242,8 +241,7 @@ template struct SelectedRowsSumTo; template struct SelectedRowsAddToTensor { void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input1, - framework::Tensor* input2) { + const pten::SelectedRows& input1, framework::Tensor* input2) { if (UNLIKELY(input1.rows().size() == 0)) { LOG(WARNING) << "input selected rows is empty!"; return; @@ -313,7 +311,7 @@ typename std::enable_if::value>::type elementwise_add_to( template typename std::enable_if::value>::type -add_sparse_inputs(const std::vector& inputs, +add_sparse_inputs(const std::vector& inputs, const std::unordered_map& rows_to_id, int64_t input_width, const platform::CPUDeviceContext& context, T* out_data) { @@ -347,7 +345,7 @@ add_sparse_inputs(const std::vector& inputs, template typename std::enable_if::value>::type -add_sparse_inputs(const std::vector& inputs, +add_sparse_inputs(const std::vector& inputs, const std::unordered_map& rows_to_id, int64_t input_width, const platform::CPUDeviceContext& context, T* out_data) { @@ -371,32 +369,31 @@ add_sparse_inputs(const std::vector& inputs, template struct MergeAdd { - framework::SelectedRows operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input, - const bool sorted_result = false) { - framework::SelectedRows out; + pten::SelectedRows operator()(const platform::CPUDeviceContext& context, + const pten::SelectedRows& input, + const bool sorted_result = false) { + pten::SelectedRows out; (*this)(context, input, &out, sorted_result); return out; } void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output, + const pten::SelectedRows& input, pten::SelectedRows* output, const bool sorted_result = false) { - std::vector inputs; + std::vector inputs; inputs.push_back(&input); (*this)(context, inputs, output, sorted_result); } void operator()(const platform::CPUDeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output, + const std::vector& inputs, + pten::SelectedRows* output, const bool sorted_result = false) { if (inputs.size() == 0) { VLOG(3) << "no input! 
return"; return; } - const framework::SelectedRows* has_value_input = nullptr; + const pten::SelectedRows* has_value_input = nullptr; for (auto* in : inputs) { if (in->rows().size() > 0) { has_value_input = in; @@ -409,7 +406,7 @@ struct MergeAdd { } auto input_width = has_value_input->value().dims()[1]; auto input_height = has_value_input->height(); - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set merged_row_set; size_t row_num = 0; for (auto* input : inputs) { @@ -480,24 +477,23 @@ struct MergeAdd { #ifdef PADDLE_WITH_XPU template struct MergeAdd { - framework::SelectedRows operator()(const platform::XPUDeviceContext& context, - const framework::SelectedRows& input, - const bool sorted_result = false) { - framework::SelectedRows out; + pten::SelectedRows operator()(const platform::XPUDeviceContext& context, + const pten::SelectedRows& input, + const bool sorted_result = false) { + pten::SelectedRows out; (*this)(context, input, &out, sorted_result); return out; } void operator()(const platform::XPUDeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output, + const pten::SelectedRows& input, pten::SelectedRows* output, const bool sorted_result = false) { framework::Vector input_rows(input.rows()); if (input_rows.size() == 0) { return; } - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set row_set(input_rows.begin(), input_rows.end()); std::vector merge_rows(row_set.begin(), row_set.end()); auto input_width = input.value().dims()[1]; @@ -537,14 +533,14 @@ struct MergeAdd { } void operator()(const platform::XPUDeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output, + const std::vector& inputs, + pten::SelectedRows* output, const bool sorted_result = false) { if (inputs.size() == 0) { VLOG(3) << "no input! return"; return; } - const framework::SelectedRows* has_value_input = nullptr; + const pten::SelectedRows* has_value_input = nullptr; for (auto* in : inputs) { if (in->rows().size() > 0) { has_value_input = in; @@ -557,7 +553,7 @@ struct MergeAdd { } auto input_width = has_value_input->value().dims()[1]; auto input_height = has_value_input->height(); - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set merged_row_set; size_t row_num = 0; for (auto* input : inputs) { @@ -628,29 +624,28 @@ struct MergeAdd { #endif template struct MergeAverage { - framework::SelectedRows operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input) { - framework::SelectedRows out; + pten::SelectedRows operator()(const platform::CPUDeviceContext& context, + const pten::SelectedRows& input) { + pten::SelectedRows out; (*this)(context, input, &out); return out; } void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output) { - std::vector inputs; + const pten::SelectedRows& input, pten::SelectedRows* output) { + std::vector inputs; inputs.push_back(&input); (*this)(context, inputs, output); } void operator()(const platform::CPUDeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output) { + const std::vector& inputs, + pten::SelectedRows* output) { if (inputs.size() == 0) { VLOG(3) << "no input! 
return"; return; } - const framework::SelectedRows* has_value_input = nullptr; + const pten::SelectedRows* has_value_input = nullptr; for (auto* in : inputs) { if (in->rows().size() > 0) { has_value_input = in; @@ -663,7 +658,7 @@ struct MergeAverage { } auto input_width = has_value_input->value().dims()[1]; auto input_height = has_value_input->height(); - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set merged_row_set; size_t row_num = 0; for (auto* input : inputs) { @@ -750,7 +745,7 @@ template struct MergeAverage; template struct UpdateToTensor { void operator()(const platform::CPUDeviceContext& context, - const ScatterOps& op, const framework::SelectedRows& input1, + const ScatterOps& op, const pten::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 654a5653cbed1f5595c7e24e0f3da0516d582926..2ae2aaebb6c5324b82e1347d464835c3f0bc4068 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -26,9 +26,9 @@ namespace math { template struct SelectedRowsAdd { void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input1, - const framework::SelectedRows& input2, - framework::SelectedRows* output) { + const pten::SelectedRows& input1, + const pten::SelectedRows& input2, + pten::SelectedRows* output) { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ( in1_height, input2.height(), @@ -117,7 +117,7 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows, template struct SelectedRowsAddTensor { void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input1, + const pten::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); auto in2_dims = input2.dims(); @@ -182,9 +182,8 @@ template struct SelectedRowsAddTensor struct SelectedRowsAddTo { void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input1, - const int64_t input2_offset, - framework::SelectedRows* input2) { + const pten::SelectedRows& input1, const int64_t input2_offset, + pten::SelectedRows* input2) { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ( in1_height, input2->height(), @@ -250,8 +249,7 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows, template struct SelectedRowsAddToTensor { void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input1, - framework::Tensor* input2) { + const pten::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); PADDLE_ENFORCE_EQ( @@ -320,24 +318,23 @@ __global__ void MergeAddKernel(const T* input, const int64_t* input_rows, template struct MergeAdd { - framework::SelectedRows operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input, - const bool sorted_result = false) { - framework::SelectedRows out; + pten::SelectedRows operator()(const platform::CUDADeviceContext& context, + const pten::SelectedRows& input, + const bool sorted_result = false) { + pten::SelectedRows out; (*this)(context, input, &out); return out; } void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output, + 
const pten::SelectedRows& input, pten::SelectedRows* output, const bool sorted_result = false) { framework::Vector input_rows(input.rows()); if (input_rows.size() == 0) { return; } - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set row_set(input_rows.begin(), input_rows.end()); std::vector merge_rows_cpu(row_set.begin(), row_set.end()); framework::Vector merge_rows(merge_rows_cpu); @@ -368,14 +365,14 @@ struct MergeAdd { } void operator()(const platform::CUDADeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output, + const std::vector& inputs, + pten::SelectedRows* output, const bool sorted_result = false) { if (inputs.size() == 0) { VLOG(3) << "no input! return"; return; } - const framework::SelectedRows* has_value_input = nullptr; + const pten::SelectedRows* has_value_input = nullptr; for (auto* in : inputs) { if (in->rows().size() > 0) { has_value_input = in; @@ -388,7 +385,7 @@ struct MergeAdd { } auto input_width = has_value_input->value().dims()[1]; auto input_height = has_value_input->height(); - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set merged_row_set; for (auto* input : inputs) { if (input->rows().size() == 0) { @@ -499,7 +496,7 @@ __global__ void UpdateToTensorKernel(const T* selected_rows, template struct UpdateToTensor { void operator()(const platform::CUDADeviceContext& context, - const ScatterOps& op, const framework::SelectedRows& input1, + const ScatterOps& op, const pten::SelectedRows& input1, framework::Tensor* input2) { // NOTE: Use SelectedRowsAddToTensor for better performance // no additional MergeAdd called. diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index 8ba7851d7b979aec33318a237a6c74a15d296e1a..690082036c5e0a4b8da99abc2a4aae588ab6fe31 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -35,15 +35,14 @@ namespace math { template struct SelectedRowsAdd { void operator()(const DeviceContext& context, - const framework::SelectedRows& input1, - const framework::SelectedRows& input2, - framework::SelectedRows* output); + const pten::SelectedRows& input1, + const pten::SelectedRows& input2, pten::SelectedRows* output); }; template struct SelectedRowsAddTensor { void operator()(const DeviceContext& context, - const framework::SelectedRows& input1, + const pten::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output); }; @@ -51,17 +50,17 @@ struct SelectedRowsAddTensor { template struct SelectedRowsAddTo { void operator()(const DeviceContext& context, - const framework::SelectedRows& input1, - const int64_t input2_offset, framework::SelectedRows* input2); + const pten::SelectedRows& input1, const int64_t input2_offset, + pten::SelectedRows* input2); }; // input2 = [all input in input1] + input2 template struct SelectedRowsSumTo { void operator()(const DeviceContext& context, - const std::vector& input1, + const std::vector& input1, const std::vector& input2_offsets, - framework::SelectedRows* input2); + pten::SelectedRows* input2); }; // FIXME: The result of SelectedRowsAddToTensor maybe non deterministic, @@ -70,8 +69,7 @@ struct SelectedRowsSumTo { template struct SelectedRowsAddToTensor { void operator()(const DeviceContext& context, - const framework::SelectedRows& input1, - framework::Tensor* input2); + const pten::SelectedRows& input1, framework::Tensor* input2); }; namespace scatter { 
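The scatter::MergeAdd declarations in the next hunk are the most widely re-typed API in this patch, so it is worth pinning down what the functor computes: duplicated row indices in the input SelectedRows are collapsed and their value rows summed. A minimal reference sketch using only STL types (MergeAddReference is a hypothetical name, not code from this patch):

#include <cstdint>
#include <map>
#include <utility>
#include <vector>

// Row-merge semantics of scatter::MergeAdd, shown for row_numel == 1:
// rows {0, 4, 4, 7} with values {1, 2, 3, 4} merge to rows {0, 4, 7}
// with values {1, 5, 4}.
std::vector<std::pair<int64_t, float>> MergeAddReference(
    const std::vector<int64_t>& rows, const std::vector<float>& values) {
  std::map<int64_t, float> merged;  // ordered keys give sorted output rows
  for (size_t i = 0; i < rows.size(); ++i) {
    merged[rows[i]] += values[i];  // duplicated row indices accumulate
  }
  return {merged.begin(), merged.end()};
}

The sorted_result flag in the real interface controls whether the merged rows must come back ordered; the std::map above always sorts, which corresponds to the sorted_result == true case.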
@@ -80,29 +78,25 @@ template struct MergeAdd { // unary functor, merge by adding duplicated rows in // the input SelectedRows object. - framework::SelectedRows operator()(const DeviceContext& context, - const framework::SelectedRows& input, - const bool sorted_result = false); + pten::SelectedRows operator()(const DeviceContext& context, + const pten::SelectedRows& input, + const bool sorted_result = false); + void operator()(const DeviceContext& context, const pten::SelectedRows& input, + pten::SelectedRows* output, const bool sorted_result = false); void operator()(const DeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output, - const bool sorted_result = false); - void operator()(const DeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output, - const bool sorted_result = false); + const std::vector& inputs, + pten::SelectedRows* output, const bool sorted_result = false); }; template struct MergeAverage { - framework::SelectedRows operator()(const DeviceContext& context, - const framework::SelectedRows& input); - void operator()(const DeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output); + pten::SelectedRows operator()(const DeviceContext& context, + const pten::SelectedRows& input); + void operator()(const DeviceContext& context, const pten::SelectedRows& input, + pten::SelectedRows* output); void operator()(const DeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output); + const std::vector& inputs, + pten::SelectedRows* output); }; enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; @@ -111,8 +105,7 @@ enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; template struct UpdateToTensor { void operator()(const DeviceContext& context, const ScatterOps& op, - const framework::SelectedRows& input1, - framework::Tensor* input2); + const pten::SelectedRows& input1, framework::Tensor* input2); }; } // namespace scatter diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index e0b368164906626e99a35cffc406a2f30edcc388..19e70f924f15e7d2a7d33a17911b711fc812b501 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -27,8 +27,8 @@ TEST(selected_rows_functor, cpu_add) { int64_t row_numel = 10; std::vector rows1{0, 4, 7}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -37,8 +37,8 @@ functor(ctx, in1_value, 1.0); std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -46,8 +46,7 @@ cpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; auto* out_value = output->mutable_value(); // simply concat two SelectedRows @@ -130,8 +129,8 @@ TEST(selected_rows_functor, cpu_add_to) { int64_t row_numel = 10; std::vector
rows1{0, 4, 7}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -140,8 +139,8 @@ TEST(selected_rows_functor, cpu_add_to) { functor(ctx, in1_value, 1.0); std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -149,8 +148,7 @@ TEST(selected_rows_functor, cpu_add_to) { cpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); auto* out_value = output->mutable_value(); @@ -230,8 +228,8 @@ TEST(selected_rows_functor, cpu_merge_average_float) { int64_t row_numel = 10; std::vector rows{0, 4, 4, 7}; - std::unique_ptr selected_rows{ - new paddle::framework::SelectedRows(rows, height)}; + std::unique_ptr selected_rows{ + new pten::SelectedRows(rows, height)}; auto* in_value = selected_rows->mutable_value(); in_value->mutable_data( paddle::framework::make_ddim( @@ -242,8 +240,7 @@ TEST(selected_rows_functor, cpu_merge_average_float) { paddle::operators::math::scatter::MergeAverage< paddle::platform::CPUDeviceContext, float> merge_average_functor; - paddle::framework::SelectedRows output = - merge_average_functor(ctx, *selected_rows); + pten::SelectedRows output = merge_average_functor(ctx, *selected_rows); auto out_height = output.height(); EXPECT_EQ(out_height, height); @@ -270,8 +267,8 @@ TEST(selected_rows_functor, cpu_merge_add_float) { int64_t row_numel = 10; std::vector rows{0, 4, 4, 7}; - std::unique_ptr selected_rows{ - new paddle::framework::SelectedRows(rows, height)}; + std::unique_ptr selected_rows{ + new pten::SelectedRows(rows, height)}; auto* in_value = selected_rows->mutable_value(); in_value->mutable_data( paddle::framework::make_ddim( @@ -279,8 +276,7 @@ TEST(selected_rows_functor, cpu_merge_add_float) { cpu_place); functor(ctx, in_value, 1.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; paddle::operators::math::scatter::MergeAdd @@ -311,8 +307,8 @@ TEST(selected_rows_functor, cpu_merge_add_int) { int64_t row_numel = 10; std::vector rows{0, 4, 4, 7}; - std::unique_ptr selected_rows{ - new paddle::framework::SelectedRows(rows, height)}; + std::unique_ptr selected_rows{ + new pten::SelectedRows(rows, height)}; auto* in_value = selected_rows->mutable_value(); in_value->mutable_data( paddle::framework::make_ddim( @@ -320,8 +316,7 @@ TEST(selected_rows_functor, cpu_merge_add_int) { cpu_place); functor(ctx, in_value, 1); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; paddle::operators::math::scatter::MergeAdd @@ -354,8 +349,8 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { int64_t row_numel = 8; std::vector rows1{5, 2, 5, 3, 5}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -364,8 +359,8 @@ 
TEST(selected_rows_functor, cpu_merge_add_multi) { set_const(ctx, in1_value, 1.0); std::vector rows2{2, 5, 3, 5, 3}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -373,14 +368,13 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { cpu_place); set_const(ctx, in2_value, 1.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); paddle::operators::math::scatter::MergeAdd merge_add_functor; - std::vector inputs; + std::vector inputs; inputs.push_back(selected_rows1.get()); inputs.push_back(selected_rows2.get()); merge_add_functor(ctx, inputs, output.get()); @@ -411,8 +405,8 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { int64_t row_numel = 8; std::vector rows1{1, 3, 5, 7, 9}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -421,8 +415,8 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { set_const(ctx, in1_value, 1.0); std::vector rows2{0, 2, 4, 6, 8}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -430,14 +424,13 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { cpu_place); set_const(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); paddle::operators::math::scatter::MergeAdd merge_add_functor; - std::vector inputs; + std::vector inputs; inputs.push_back(selected_rows1.get()); inputs.push_back(selected_rows2.get()); merge_add_functor(ctx, inputs, output.get()); @@ -472,8 +465,8 @@ TEST(selected_rows_functor, cpu_sum_to) { int64_t height = 10; int64_t row_numel = 10; std::vector rows1{0, 4, 7}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -482,8 +475,8 @@ TEST(selected_rows_functor, cpu_sum_to) { functor(ctx, in1_value, 1.0); std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -491,8 +484,7 @@ TEST(selected_rows_functor, cpu_sum_to) { cpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); auto* out_value = output->mutable_value(); // simply concat two SelectedRows @@ -501,7 +493,7 @@ TEST(selected_rows_functor, cpu_sum_to) { paddle::operators::math::SelectedRowsSumTo sum_to_functor; - sum_to_functor(ctx, std::vector( + sum_to_functor(ctx, std::vector(
{selected_rows1.get(), selected_rows2.get()}), std::vector({0, in1_value->numel()}), output.get()); auto out_height = output->height(); diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index ebcd97b32c4a30d76b844546b4e5cd7d177be192..e826c2a7244f719df28ea57a074093d211fe5e6e 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -29,8 +29,8 @@ TEST(selected_rows_functor, gpu_add) { int64_t row_numel = 10; std::vector rows1{0, 4, 7}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -48,8 +48,8 @@ TEST(selected_rows_functor, gpu_add) { #endif std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -57,8 +57,7 @@ TEST(selected_rows_functor, gpu_add) { gpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; auto* out_value = output->mutable_value(); // simply concat two SelectedRows @@ -152,8 +151,8 @@ TEST(selected_rows_functor, gpu_add_to) { int64_t row_numel = 10; std::vector rows1{0, 4, 7}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -162,8 +161,8 @@ TEST(selected_rows_functor, gpu_add_to) { functor(ctx, in1_value, 1.0); std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -171,8 +170,7 @@ TEST(selected_rows_functor, gpu_add_to) { gpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); auto* out_value = output->mutable_value(); @@ -264,8 +262,8 @@ TEST(selected_rows_functor, gpu_merge_add) { int64_t row_numel = 8; std::vector rows1{5, 2, 5, 3, 5}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -274,8 +272,8 @@ TEST(selected_rows_functor, gpu_merge_add) { set_const(ctx, in1_value, 1.0); std::vector rows2{2, 5, 3, 5, 3}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -283,14 +281,13 @@ TEST(selected_rows_functor, gpu_merge_add) { gpu_place); set_const(ctx, in2_value, 1.0); - std::unique_ptr output{ - 
new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); paddle::operators::math::scatter::MergeAdd< paddle::platform::CUDADeviceContext, float> merge_add_functor; - std::vector inputs; + std::vector inputs; inputs.push_back(selected_rows1.get()); inputs.push_back(selected_rows2.get()); merge_add_functor(ctx, inputs, output.get()); diff --git a/paddle/fluid/operators/memcpy_d2h_op.h b/paddle/fluid/operators/memcpy_d2h_op.h index e1b81c0c59241a9e0fbc4c5615d74aba47074764..bdedb8e7d29458fec231865879a4aa706bdbedbb 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.h +++ b/paddle/fluid/operators/memcpy_d2h_op.h @@ -51,7 +51,7 @@ class MemcpyD2HFunctor { } } - void operator()(const framework::SelectedRows &rows) const { + void operator()(const pten::SelectedRows &rows) const { // (JZ-LIANG) to support SelectedRows PADDLE_THROW(platform::errors::Unimplemented( "Memcpy for SelectedRows is NOT supported yet.")); diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 7f4870010403070b7b07e44c80f32e3162179795..c9995eeca16cd42aaf8d69229a15dde7d949ea72 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -59,7 +59,7 @@ class MemcpyH2DFunctor { out_tensor.set_lod(lod_tensor.lod()); } - void operator()(const framework::SelectedRows &rows) const { + void operator()(const pten::SelectedRows &rows) const { // (JZ-LIANG) to support SelectedRows PADDLE_THROW(platform::errors::Unimplemented( "Memcpy for SelectedRows is NOT supported yet.")); diff --git a/paddle/fluid/operators/memcpy_op.h b/paddle/fluid/operators/memcpy_op.h index ac4a0d1ab111ed250edf620faefe0e98a28ea78d..40c7aceda5116075d9498903fd788f3a10e080ad 100644 --- a/paddle/fluid/operators/memcpy_op.h +++ b/paddle/fluid/operators/memcpy_op.h @@ -75,7 +75,7 @@ class MemcpyFunctor { out_tensor.set_lod(lod_tensor.lod()); } - void operator()(const framework::SelectedRows &rows) const { + void operator()(const pten::SelectedRows &rows) const { // (JZ-LIANG) to support SelectedRows PADDLE_THROW(platform::errors::Unimplemented( "Memcpy for SelectedRows is NOT supported yet.")); diff --git a/paddle/fluid/operators/merge_selected_rows_op.h b/paddle/fluid/operators/merge_selected_rows_op.h index 4c977e94b175c988e4253b273365b0cabc4b87aa..0fe262dea3b1352e590f9018b9152e7537299108 100644 --- a/paddle/fluid/operators/merge_selected_rows_op.h +++ b/paddle/fluid/operators/merge_selected_rows_op.h @@ -24,8 +24,8 @@ template class MergeSelectedRowsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); math::scatter::MergeAdd merge_func; merge_func(context.template device_context(), *x, out); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 67b6b3ec1614dd51adc62cf418d9eadadf276ca9..82d7c56aea1234bec8c1d22cc10717c100fbe369 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -1137,5 +1137,28 @@ class MLUCnnl { void* output); }; +template +inline void TransposeFromMLUTensor(const ExecutionContext& ctx, + const std::vector perm, + const Tensor* transformed_input, + Tensor* transformed_output, + bool need_reshape_or_alloc) { + auto in_dims_vec = framework::vectorize(transformed_input->dims()); + if (need_reshape_or_alloc) {
transformed_output->mutable_data( + {in_dims_vec[perm[0]], in_dims_vec[perm[1]], in_dims_vec[perm[2]], + in_dims_vec[perm[3]]}, + ctx.GetPlace()); + } + MLUCnnlTensorDesc trans_in_desc(*transformed_input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + MLUCnnlTensorDesc trans_out_desc(*transformed_output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + + MLUCnnl::Transpose(ctx, perm, in_dims_vec.size(), trans_in_desc.get(), + GetBasePtr(transformed_input), trans_out_desc.get(), + GetBasePtr(transformed_output)); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 55f684b66485bb3d23b443000fdbe35c35332486..edd2ae4ca9c87c0c06e41913a4829a5ff057c82a 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -31,7 +31,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using Sampler = math::Sampler; using DDim = framework::DDim; @@ -364,8 +364,8 @@ class NCEGradKernel : public framework::OpKernel { DDim table_dim; if (table_var->IsType()) { table_dim = context.Input("Weight")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("Weight"); + } else if (table_var->IsType()) { + auto *table_t = context.Input("Weight"); table_dim = table_t->value().dims(); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -373,7 +373,8 @@ class NCEGradKernel : public framework::OpKernel { "must be either LoDTensor or SelectedRows")); } - auto d_w = context.Output(framework::GradVarName("Weight")); + auto d_w = + context.Output(framework::GradVarName("Weight")); d_w->set_rows(labels); d_w->set_height(table_dim[0]); diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc index 255dc5bb083114c4bc85739c621f3558d153cc93..31d3e1208dadb72ed9add4d90ad68ca189411f8f 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/adagrad_op.cc @@ -111,7 +111,7 @@ size_t FindPos(const std::vector& rows, int64_t value) { template struct SparseAdagradFunctor { void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& grad, + const pten::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { // 1. g_m.rows = set(g.rows) diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cu b/paddle/fluid/operators/optimizers/adagrad_op.cu index 8b939b7c6b3ba275ef050abc08636ea9c8740621..a7c32255bd1ee060435abf1e4d80cf05e4d979ed 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/adagrad_op.cu @@ -72,7 +72,7 @@ __global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows, template struct SparseAdagradFunctor { void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& grad, + const pten::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { // 1. 
g_m.rows = set(g.rows) diff --git a/paddle/fluid/operators/optimizers/adagrad_op.h b/paddle/fluid/operators/optimizers/adagrad_op.h index 057bd4e863ddf7ae27b54ee784174e1452619395..c2dc3f095ed99de2917f70aed27dc1a6b2a8bb4c 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.h +++ b/paddle/fluid/operators/optimizers/adagrad_op.h @@ -22,16 +22,15 @@ namespace operators { template struct SparseAdagradFunctor { - void operator()(const DeviceContext &context, - const framework::SelectedRows &grad, + void operator()(const DeviceContext &context, const pten::SelectedRows &grad, const framework::Tensor &learning_rate, T epsilon, framework::Tensor *moment, framework::Tensor *param); }; template -framework::SelectedRows SquareSelectedRows( - const DeviceContext &context, const framework::SelectedRows &input) { - framework::SelectedRows out; +pten::SelectedRows SquareSelectedRows(const DeviceContext &context, + const pten::SelectedRows &input) { + pten::SelectedRows out; out.set_rows(input.rows()); out.set_height(input.height()); out.mutable_value()->mutable_data(input.value().dims(), @@ -88,7 +87,7 @@ class AdagradOpKernel : public framework::OpKernel { param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); } - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { auto *param_tensor = ctx.Input("Param"); PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor, platform::errors::InvalidArgument( @@ -101,7 +100,7 @@ class AdagradOpKernel : public framework::OpKernel { SparseAdagradFunctor functor; functor(ctx.template device_context(), - *ctx.Input("Grad"), + *ctx.Input("Grad"), *ctx.Input("LearningRate"), epsilon, moment_out_tensor, param_out_tensor); } else { diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index 1ef46ef085c5d73b63ed25ef353cb1477a17776c..c7ffb53a0588267ef205971d5899d5aa36168072 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -314,8 +314,8 @@ class AdamOpCUDAKernel : public framework::OpKernel { beta2_pow_out->mutable_data(ctx.GetPlace())); } } - } else if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); + } else if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); if (grad->rows().size() == 0) { VLOG(3) << "grad row size is 0!!"; return; @@ -330,8 +330,8 @@ class AdamOpCUDAKernel : public framework::OpKernel { } } - framework::SelectedRows tmp_grad_merge; - const framework::SelectedRows* grad_merge_ptr; + pten::SelectedRows tmp_grad_merge; + const pten::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = grad; } else { diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index bb044b4b4986e3ec9cecc38cc52cf53cddcb45f9..bcc314cd57c017b577d8370a6e593366364dbdd9 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -521,8 +521,8 @@ class AdamOpKernel : public framework::OpKernel { beta2_pow_out->mutable_data(ctx.GetPlace())[0] = beta2 * beta2_pow->data()[0]; } - } else if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); + } else if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); if (grad->rows().size() == 0) { VLOG(3) << "grad row size is 0!!"; return; @@ -537,8 +537,8 @@ class AdamOpKernel : public framework::OpKernel { } } - framework::SelectedRows tmp_grad_merge; - const framework::SelectedRows* grad_merge_ptr; + pten::SelectedRows tmp_grad_merge; + const pten::SelectedRows* 
grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = grad; } else { diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index e462c20c7f51db8195c3acba019d0aa225005dce..fd83b76e02a24f86899c851efe3f773873dc50ce 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -195,8 +195,8 @@ class AdamOpXPUKernel : public framework::OpKernel { xpu_wait(dev_ctx.x_context()->xpu_stream); } } - } else if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); + } else if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); auto& dev_ctx = ctx.template device_context(); if (grad->rows().size() == 0) { @@ -213,8 +213,8 @@ class AdamOpXPUKernel : public framework::OpKernel { } } - framework::SelectedRows tmp_grad_merge; - const framework::SelectedRows* grad_merge_ptr; + pten::SelectedRows tmp_grad_merge; + const pten::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = grad; } else { diff --git a/paddle/fluid/operators/optimizers/adamw_op.cu b/paddle/fluid/operators/optimizers/adamw_op.cu index a8b16e73dbfffe69e2b4b10371b30f3c77305696..8bce415cb1ab9835d6c87c9617e9147187b2a2c8 100644 --- a/paddle/fluid/operators/optimizers/adamw_op.cu +++ b/paddle/fluid/operators/optimizers/adamw_op.cu @@ -337,8 +337,8 @@ class AdamWOpCUDAKernel : public framework::OpKernel { beta2_pow_out->mutable_data(ctx.GetPlace())); } } - } else if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); + } else if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); if (grad->rows().size() == 0) { VLOG(3) << "grad row size is 0!!"; return; @@ -353,8 +353,8 @@ class AdamWOpCUDAKernel : public framework::OpKernel { } } - framework::SelectedRows tmp_grad_merge; - const framework::SelectedRows* grad_merge_ptr; + pten::SelectedRows tmp_grad_merge; + const pten::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = grad; } else { diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index 6bf8c8d724fb892ed934d4c1ee9305e641b62851..9c9355921d8273ea1e7f587e43eaa0bdc6a80838 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -180,11 +180,11 @@ class FTRLOpKernel : public framework::OpKernel { } s_acc_out.device(place) = sq_accum + g * g; - } else if (grad_var->IsType()) { - auto grad = ctx.Input("Grad"); + } else if (grad_var->IsType()) { + auto grad = ctx.Input("Grad"); - framework::SelectedRows tmp_merged_grad; - framework::SelectedRows* merged_grad = &tmp_merged_grad; + pten::SelectedRows tmp_merged_grad; + pten::SelectedRows* merged_grad = &tmp_merged_grad; math::scatter::MergeAdd merge_func; merge_func(ctx.template device_context(), *grad, merged_grad); diff --git a/paddle/fluid/operators/optimizers/lamb_op.h b/paddle/fluid/operators/optimizers/lamb_op.h index 9a3eaa66caa8e870f2692c67aea29535dbd7492a..f1158703f028b6c2ebbb9e1596b240e81b5b0b2b 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.h +++ b/paddle/fluid/operators/optimizers/lamb_op.h @@ -552,7 +552,7 @@ class LambOpKernel : public framework::OpKernel { trust_ratio_div_ptr, skip_update_flag); for_range(moment_update_functor); } - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { PADDLE_ENFORCE_EQ(IsMultiPrecision, false, platform::errors::Unimplemented( "SelectedRows gradient is not supported when " @@ -562,7 +562,7 @@ class LambOpKernel : public framework::OpKernel { 
platform::errors::Unimplemented( "SelectedRows gradient is not supported when " "multi_precision=True.")); - auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), + auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), "Input", "Grad", "Lamb"); if (grad.rows().size() == 0) { VLOG(3) << "grad row size is 0!!"; @@ -578,8 +578,8 @@ class LambOpKernel : public framework::OpKernel { } } - framework::SelectedRows tmp_grad_merge; - const framework::SelectedRows* grad_merge_ptr; + pten::SelectedRows tmp_grad_merge; + const pten::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = &grad; } else { diff --git a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc index 450ef376799d3d383f3fa55f65850ca73e6c51a3..ee3111c7dd6a09c22682b738ec6a1ea9525134d9 100644 --- a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc +++ b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc @@ -48,7 +48,7 @@ class SGDOneDNNKernel : public SGDOpKernel { VLOG(4) << "[ONEDNN]: sgd_dense_param_kernel"; const auto *learning_rate = ctx.Input("LearningRate"); auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); + const auto *grad = ctx.Input("Grad"); const auto &grad_value = grad->value(); const auto &grad_rows = grad->rows(); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 2d713308fd938996d45badf1549d2a60d6c8c4ec..79d76d52f48c8c2d5f1c62f4cd08977ce268c573 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -27,7 +27,7 @@ namespace paddle { namespace operators { using framework::Tensor; -using framework::SelectedRows; +using pten::SelectedRows; struct NoNesterov; struct UseNesterov; @@ -545,9 +545,9 @@ class MomentumOpKernel : public framework::OpKernel { } } - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { // sparse update embedding with selectedrows - auto grad = ctx.Input("Grad"); + auto grad = ctx.Input("Grad"); // sparse update maybe empty. 
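// Reviewer aside: the tmp_grad_merge / grad_merge_ptr sequence repeated in
// the Adam, AdamW, and LAMB hunks above (FTRL, RMSProp, and the momentum
// path below merge unconditionally) is one pattern: reuse the gradient
// as-is when its rows are already unique and sorted, otherwise materialize
// a merged copy first. A minimal sketch with hypothetical names; this is
// not code from the patch, and the merge functor is abstracted away:
template <typename SelectedRowsT, typename MergeAddFn>
const SelectedRowsT* ResolveMergedGrad(const SelectedRowsT* grad,
                                       bool is_strict_sorted,
                                       SelectedRowsT* tmp_grad_merge,
                                       MergeAddFn merge_add) {
  if (is_strict_sorted) {
    return grad;  // rows unique and ordered: skip the extra merge kernel
  }
  merge_add(*grad, tmp_grad_merge);  // sum duplicated rows into the temporary
  return tmp_grad_merge;
}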
if (grad->rows().size() == 0) { @@ -555,8 +555,8 @@ class MomentumOpKernel { return; } - framework::SelectedRows tmp_merged_grad; - framework::SelectedRows* merged_grad = &tmp_merged_grad; + pten::SelectedRows tmp_merged_grad; + pten::SelectedRows* merged_grad = &tmp_merged_grad; math::scatter::MergeAdd merge_func; merge_func(ctx.template device_context(), *grad, merged_grad); diff --git a/paddle/fluid/operators/optimizers/momentum_op_npu.cc b/paddle/fluid/operators/optimizers/momentum_op_npu.cc index e3f0e5cc04d9ee970de19a9a1f1724f24fb4eb15..a71847c4690821e33eb5dfa4240a748fe9bd9472 100644 --- a/paddle/fluid/operators/optimizers/momentum_op_npu.cc +++ b/paddle/fluid/operators/optimizers/momentum_op_npu.cc @@ -74,7 +74,7 @@ class NPUMomentumOpKernel : public framework::OpKernel { regularized_grad, mu_tensor}, {*param_out}, {{"use_nesterov", use_nesterov}}); runner.Run(dev_ctx.stream()); - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { PADDLE_ENFORCE_EQ(false, true, platform::errors::PermissionDenied( "Unsupported SparseMomentum")); } else { diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h index 9971cb92306a2710e02998fade05c8a498e88627..a01f84b37c4eb236c7aff591a49e4d76c55f152d 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.h +++ b/paddle/fluid/operators/optimizers/rmsprop_op.h @@ -218,10 +218,10 @@ class RmspropOpKernel : public framework::OpKernel { rho, epsilon, momentum, grad_func)); } } - } else if (grad_var->IsType()) { - auto &grad = grad_var->Get(); - framework::SelectedRows tmp_merged_grad; - framework::SelectedRows *merged_grad = &tmp_merged_grad; + } else if (grad_var->IsType()) { + auto &grad = grad_var->Get(); + pten::SelectedRows tmp_merged_grad; + pten::SelectedRows *merged_grad = &tmp_merged_grad; math::scatter::MergeAdd merge_func; merge_func(dev_ctx, grad, merged_grad); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index 28f73e0618c2ae6cfd5ec67bc2372cc5584f5586..08c40e02b1702b069052eb0e086111c40739c04e 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -80,7 +80,7 @@ class SGDOp : public framework::OperatorWithKernel { // supported cases bool dense_param_sparse_grad = param_var->IsType() && - grad_var->IsType(); + grad_var->IsType(); bool dense_param_and_grad = param_var->IsType() && grad_var->IsType(); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index 5e3ae6c017bcac71ea2668e914378627ea39b1a2..7ecd84f4ff16a36a1e2e27f45a0ca46a05c35cda 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -112,7 +112,7 @@ class SGDOpKernel param->numel(), param_out->mutable_data(ctx.GetPlace()), master_in_data, master_out_data); - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. // This manual optimization brings difficulty to track data dependency. // It's better to find a more elegant solution.
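The sgd_op.cu and sgd_op.h hunks in this stretch all re-type one computation, the dense-parameter / sparse-gradient update, in which only the rows named by the gradient are touched. A reference sketch with plain STL types standing in for Paddle tensors (SparseSgdReference is a hypothetical name, not part of the patch):

#include <cstdint>
#include <vector>

// Dense param, SelectedRows grad: param.row(rows[i]) -= lr * grad.row(i).
// grad_value holds one value row per entry of grad_rows; parameter rows
// absent from grad_rows are left untouched.
void SparseSgdReference(float lr, const std::vector<int64_t>& grad_rows,
                        const std::vector<std::vector<float>>& grad_value,
                        std::vector<std::vector<float>>* param) {
  for (size_t i = 0; i < grad_rows.size(); ++i) {
    std::vector<float>& param_row =
        (*param)[static_cast<size_t>(grad_rows[i])];
    for (size_t j = 0; j < param_row.size(); ++j) {
      param_row[j] -= lr * grad_value[i][j];
    }
  }
}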
@@ -121,7 +121,7 @@ class SGDOpKernel platform::errors::InvalidArgument( "The input tensor Param of SgdOp should be equal with ParamOut " "if variable's type is SelectedRows.")); - auto* grad = ctx.Input("Grad"); + auto* grad = ctx.Input("Grad"); auto in_height = grad->height(); auto out_dims = param_out->dims(); diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index 9d98e745a01aec9ec02e754ec9186ff66f58f53d..7df6bbf410d2d731367c9a6537ab63664f868c82 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -60,13 +60,13 @@ struct sgd_dense_param_kernel< // SelectedRows template struct sgd_dense_param_kernel< - T, framework::VarTypeTrait::kId> { + T, framework::VarTypeTrait::kId> { void operator()(const framework::ExecutionContext &ctx) const { VLOG(4) << "[CPU]: sgd_dense_param_kernel"; const auto *learning_rate = ctx.Input("LearningRate"); const auto *param = ctx.Input("Param"); auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); + const auto *grad = ctx.Input("Grad"); const auto &grad_value = grad->value(); const auto &grad_rows = grad->rows(); @@ -114,12 +114,12 @@ struct sgd_dense_param_kernel< // SelectedRows template <> struct sgd_dense_param_kernel< - platform::bfloat16, framework::VarTypeTrait::kId> { + platform::bfloat16, framework::VarTypeTrait::kId> { void operator()(const framework::ExecutionContext &ctx) const { VLOG(4) << "[CPU]: sgd_dense_param_kernel"; const auto *learning_rate = ctx.Input("LearningRate"); auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); + const auto *grad = ctx.Input("Grad"); const auto &grad_value = grad->value(); const auto &grad_rows = grad->rows(); @@ -163,7 +163,7 @@ class SGDOpKernel if (param_var->IsType()) { invoke_dense_param_kernel(ctx); - } else if (param_var->IsType()) { + } else if (param_var->IsType()) { sparse_param_and_grad_kernel(ctx); } else { PADDLE_ENFORCE_EQ( @@ -200,7 +200,7 @@ class SGDOpKernel grad->numel(), sz)); dense_param_and_grad_kernel(ctx); - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. // This manual optimization brings difficulty to track data dependency. // It's better to find a more elegant solution. @@ -209,7 +209,7 @@ class SGDOpKernel "The input tensor Param of SgdOp " "should be equal with ParamOut if variable's " "type is SelectedRows. ")); - const auto *grad = ctx.Input("Grad"); + const auto *grad = ctx.Input("Grad"); // for distributed training, a sparse var may be empty, // just skip updating. @@ -259,13 +259,13 @@ class SGDOpKernel const auto *param_var = ctx.InputVar("Param"); const auto *grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, platform::errors::InvalidArgument( "When param is SelectedRows, gradient should also " "be SelectedRows")); - const auto ¶m = param_var->Get(); - auto *param_out = ctx.Output("ParamOut"); - const auto &grad = grad_var->Get(); + const auto ¶m = param_var->Get(); + auto *param_out = ctx.Output("ParamOut"); + const auto &grad = grad_var->Get(); // for distributed training, a sparse var may be empty, // just skip updating. 
@@ -309,7 +309,7 @@ class SGDOpKernel virtual void dense_param_sparse_grad_kernel( const framework::ExecutionContext &ctx) const { detail::sgd_dense_param_kernel< - T, framework::VarTypeTrait::kId>()(ctx); + T, framework::VarTypeTrait::kId>()(ctx); } }; diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index 88e94ba039ac277adc8ae4597886da16a0894465..c0bd906685d4d8c5fcb561ffaeab1cdc1b4d1e44 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -76,22 +76,13 @@ struct AbsFunctor { } }; -template +template struct UnsignedPowFunctor { HOSTDEVICE explicit inline UnsignedPowFunctor(float porder) { this->porder = porder; } - HOSTDEVICE inline Ty operator()(const Tx x) const { - return static_cast(inline_pow(inline_abs(x), static_cast(porder))); - } - float porder; -}; - -template -struct PowFunctor { - HOSTDEVICE explicit inline PowFunctor(float porder) { this->porder = porder; } - HOSTDEVICE inline Ty operator()(const Tx x) const { - return static_cast(inline_pow(x, static_cast(porder))); + HOSTDEVICE inline T operator()(const T x) const { + return static_cast(inline_pow(inline_abs(x), static_cast(porder))); } float porder; }; @@ -105,13 +96,11 @@ class PnormCUDAKernel : public framework::OpKernel { const T* x = in_x->data(); T* norm = out_norm->mutable_data(ctx.GetPlace()); auto xdim = in_x->dims(); - auto ndim = out_norm->dims(); float porder = ctx.Attr("porder"); bool asvector = ctx.Attr("asvector"); int axis = ctx.Attr("axis"); std::vector reduce_axis = {axis}; reduce_axis = GetReduceDim(reduce_axis, xdim.size(), asvector); - auto stream = ctx.cuda_device_context().stream(); using MT = typename details::MPTypeTrait::Type; @@ -125,29 +114,17 @@ class PnormCUDAKernel : public framework::OpKernel { TensorReduceFunctorImpl>( *in_x, out_norm, AbsFunctor(), reduce_axis, stream); } else { - framework::Tensor tmp_x; - tmp_x.mutable_data(xdim, ctx.GetPlace()); - std::vector ins = {in_x}; - std::vector outs = {&tmp_x}; - auto func = UnsignedPowFunctor(porder); + TensorReduceFunctorImpl>( + *in_x, out_norm, UnsignedPowFunctor(porder), reduce_axis, stream); + + const framework::Tensor* tmp_norm = out_norm; + std::vector ins = {tmp_norm}; + std::vector outs = {out_norm}; const auto& cuda_ctx = ctx.template device_context(); - - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, MT, T, UnsignedPowFunctor>( - cuda_ctx, ins, &outs, func); - framework::Tensor tmp_y; - tmp_y.mutable_data(ndim, ctx.GetPlace()); - TensorReduceFunctorImpl>( - tmp_x, &tmp_y, kps::IdentityFunctor(), reduce_axis, stream); - const framework::Tensor* tmp_norm = &tmp_y; - ins = {tmp_norm}; - outs = {out_norm}; - auto func_inverse = UnsignedPowFunctor(1. / porder); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, MT, T, UnsignedPowFunctor>( - cuda_ctx, ins, &outs, func_inverse); + ElementwiseType::kUnary, T, T, UnsignedPowFunctor>( + cuda_ctx, ins, &outs, UnsignedPowFunctor(1. 
/ porder)); } } }; @@ -158,29 +135,25 @@ struct AbsMaxAndMinGradFunctor { typename DY, typename Dim> void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, const Dim& dim, int size) { - auto equals = ((*x).abs() == y->broadcast(dim)); - auto ones = dx->constant(static_cast(1.)); - auto negs = dx->constant(static_cast(-1.)); - auto zeros = dx->constant(static_cast(0.)); - auto positives = (*x) > zeros; - dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros) * - positives.select(ones, negs); + dx->device(place) = dy->broadcast(dim) * (*x).sign() * + ((*x).abs() == y->broadcast(dim)).template cast(); } }; template -struct PNormPostGradFunctor { +struct PNormGradFunctor { + HOSTDEVICE explicit inline PNormGradFunctor(float porder) { + this->porder = static_cast(porder - 1.); + } template void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, const Dim& dim, int size) { - auto ones = dx->constant(static_cast(1.)); - auto negs = dx->constant(static_cast(-1.)); - auto zeros = dx->constant(static_cast(0.)); - auto positives = (*x) > zeros; - dx->device(place) = (*dx) * dy->broadcast(dim) * y->broadcast(dim) * - positives.select(ones, negs); + dx->device(place) = (*x).abs().pow(this->porder) * (*x).sign() * + dy->broadcast(dim) * + (*y).pow(-this->porder).broadcast(dim); } + T porder; }; template @@ -207,26 +180,13 @@ class PnormGradCUDAKernel : public framework::OpKernel { math::SetConstant set_zero; set_zero(cuda_ctx, out_dx, static_cast(0)); } else if (porder == INFINITY || porder == -INFINITY) { + AbsMaxAndMinGradFunctor functor; LaunchReduceGradKernel>( - ctx, in_x, in_norm, in_norm_dy, out_dx, dims, reduce_all); + ctx, in_x, in_norm, in_norm_dy, out_dx, functor, dims, reduce_all); } else { - framework::Tensor tmp_norm; - tmp_norm.mutable_data(in_norm->dims(), ctx.GetPlace()); - std::vector ins = {in_norm}; - std::vector outs = {&tmp_norm}; - auto pow_functor = PowFunctor(1. 
- porder); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T, PowFunctor>(cuda_ctx, ins, &outs, - pow_functor); - ins = {in_x}; - outs = {out_dx}; - auto unsigned_pow = UnsignedPowFunctor(porder - 1.); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T, UnsignedPowFunctor>( - cuda_ctx, ins, &outs, unsigned_pow); - const framework::Tensor* tmp_norm_const = &tmp_norm; - LaunchReduceGradKernel>( - ctx, in_x, tmp_norm_const, in_norm_dy, out_dx, dims, reduce_all); + auto functor = PNormGradFunctor(porder); + LaunchReduceGradKernel>( + ctx, in_x, in_norm, in_norm_dy, out_dx, functor, dims, reduce_all); } } }; diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h index 292db60079e4cde97ba992b3c9d7151ecae9434a..d715bf34a49ef10de11affacde4ac892be259da8 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h @@ -13,8 +13,8 @@ #include #include #include -#include "paddle/fluid/distributed/fleet.h" -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" +#include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" @@ -40,8 +40,8 @@ class DistributedLookupTableKernel : public framework::OpKernel { if (var->IsType()) { emb_dim = var->Get().dims()[1]; - } else if (var->IsType()) { - emb_dim = var->Get().value().dims()[1]; + } else if (var->IsType()) { + emb_dim = var->Get().value().dims()[1]; } else { PADDLE_THROW(platform::errors::InvalidArgument( "Expected type of `W` must be Tensor, SelectedRows.But got " diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.h b/paddle/fluid/operators/pscore/distributed_push_sparse_op.h index a232d52dec8d62fb42a6d662e94a9e29c5d935f7..f19ba5f2e41da3de710c726bc7899f12cbbc92dc 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.h +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.h @@ -13,8 +13,8 @@ #include #include #include -#include "paddle/fluid/distributed/fleet.h" -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" +#include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" diff --git a/paddle/fluid/operators/pscore/fake_init_op.cc b/paddle/fluid/operators/pscore/fake_init_op.cc index cb27dc75eb2faf15746e596265d1b1e4b3717e52..b3a745fc99538edf2a0b387a67d28cb7722709f0 100644 --- a/paddle/fluid/operators/pscore/fake_init_op.cc +++ b/paddle/fluid/operators/pscore/fake_init_op.cc @@ -39,8 +39,8 @@ class FakeInitOp : public framework::OperatorBase { if (out_var.IsType()) { tensor = out_var.GetMutable(); tensor->Resize(framework::make_ddim(Attr>("shape"))); - } else if (out_var.IsType()) { - tensor = out_var.GetMutable()->mutable_value(); + } else if (out_var.IsType()) { + tensor = out_var.GetMutable()->mutable_value(); tensor->Resize(framework::make_ddim(Attr>("shape"))); } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h 
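Stepping back to the p_norm_op.cu hunks above: the rewrite drops the intermediate tmp_x / tmp_y / tmp_norm tensors by fusing the elementwise pow stages into the reduce kernels, and PNormGradFunctor (which stores porder already decremented to p - 1) evaluates the whole backward expression in one pass. As a sketch of the identity being exploited, for finite nonzero p and y = ||x||_p:

\[
y = \Big(\sum_j |x_j|^p\Big)^{1/p},
\qquad
\frac{\partial y}{\partial x_i}
  = \operatorname{sign}(x_i)\,\frac{|x_i|^{\,p-1}}{y^{\,p-1}},
\]

which matches the functor's dx = |x|^{p-1} * sign(x) * dy * y^{-(p-1)}; the infinity-norm branch keeps its own AbsMaxAndMinGradFunctor, now reduced to sign(x) masked by the comparison |x| == y.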
index 77c755581f9de2830c4ae2ab9c281321f7fb986f..2d2d8abe7062788b14b543bad22d699a1f41bd2d 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h +++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h @@ -23,9 +23,9 @@ limitations under the License. */ #include #include -#include "paddle/fluid/distributed/service/brpc_utils.h" -#include "paddle/fluid/distributed/service/heter_server.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/service/brpc_utils.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc index c870e758e96afc1c70a26236b0d20ac05d77aaf1..a195b8dee3c2f5580be5f7c094194576b9eccb88 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc @@ -18,8 +18,8 @@ limitations under the License. */ #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/distributed/service/heter_client.h" -#include "paddle/fluid/distributed/service/heter_server.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc b/paddle/fluid/operators/pscore/heter_server_test.cc index 5029aa0ebdcc0c547c394053ce110dbc9f401a3f..7914e9d9a1058ab15a08e3b0dee8725e7a74bb38 100644 --- a/paddle/fluid/operators/pscore/heter_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_server_test.cc @@ -18,8 +18,8 @@ limitations under the License. */ #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/distributed/service/heter_client.h" -#include "paddle/fluid/distributed/service/heter_server.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/framework/op_registry.h" namespace framework = paddle::framework; @@ -52,7 +52,7 @@ framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { auto w_var = scope->Var("w"); - w_var->GetMutable(); + w_var->GetMutable(); auto out_var = scope->Var("out"); out_var->GetMutable(); @@ -123,7 +123,7 @@ void InitTensorsOnClient2(framework::Scope* scope, platform::CPUPlace* place, void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, int64_t rows_numel) { CreateVarsOnScope(scope, place); - auto w = scope->Var("w")->GetMutable(); + auto w = scope->Var("w")->GetMutable(); auto w_value = w->mutable_value(); w_value->Resize({rows_numel, 10}); for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index 46f22bcc8b26bc0b4f782ed9459491d471ad219d..980351e12a030760b6793ab665d80db737bfa9d5 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include // NOLINT #include -#include "paddle/fluid/distributed/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc index 6b1ab77b45d35dfb4439cb4e1927cc928d7ffd4c..07fe44601ca08831a9e4372d04c097a8e56644f2 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/distributed/service/heter_client.h" -#include "paddle/fluid/distributed/service/heter_server.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -55,7 +55,7 @@ framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { void CreateVarsOnScope(framework::Scope* scope) { auto w_var = scope->Var("w"); - w_var->GetMutable(); + w_var->GetMutable(); auto out_var = scope->Var("out"); out_var->GetMutable(); @@ -76,7 +76,7 @@ void CreateVarsOnScope(framework::Scope* scope) { void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, int64_t rows_numel) { CreateVarsOnScope(scope); - auto w = scope->Var("w")->GetMutable(); + auto w = scope->Var("w")->GetMutable(); auto w_value = w->mutable_value(); w_value->Resize({rows_numel, 10}); for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index 3a361360e2ed7e7de3c995b60ecf6e8c0f33e415..21f21cdc95606ec98700736f51a2f50af6364e1a 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -20,8 +20,8 @@ limitations under the License. 
*/ #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/distributed/service/heter_client.h" -#include "paddle/fluid/distributed/service/heter_server.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/memory/memcpy.h" @@ -59,7 +59,7 @@ framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { void CreateVarsOnScope(framework::Scope* scope) { auto w_var = scope->Var("w"); - w_var->GetMutable(); + w_var->GetMutable(); auto out_var = scope->Var("out"); out_var->GetMutable(); @@ -121,7 +121,7 @@ void InitTensorsOnClient(framework::Scope* scope, int64_t rows_numel, void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, int64_t rows_numel) { CreateVarsOnScope(scope); - auto w = scope->Var("w")->GetMutable(); + auto w = scope->Var("w")->GetMutable(); auto w_value = w->mutable_value(); w_value->Resize({rows_numel, 10}); for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); diff --git a/paddle/fluid/operators/pscore/send_barrier_op.cc b/paddle/fluid/operators/pscore/send_barrier_op.cc index 1def919ffdf9fdb8976d6745ac718977eb57df73..fe850bb25d67f33a6dfa076f9a75c0b36cd82e5c 100644 --- a/paddle/fluid/operators/pscore/send_barrier_op.cc +++ b/paddle/fluid/operators/pscore/send_barrier_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/pscore/send_op.cc b/paddle/fluid/operators/pscore/send_op.cc index 482c6ba60d26fdab5776e99d036162a7c67b21f8..bbb3c76beca20b4a20d3ec664ed4fc47ce542414 100644 --- a/paddle/fluid/operators/pscore/send_op.cc +++ b/paddle/fluid/operators/pscore/send_op.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/distributed/fleet.h" -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" +#include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.h b/paddle/fluid/operators/reduce_ops/logsumexp_op.h index 06c9f23dd2c26fae8fefe6ae7da7df7aa5e67563..4490f08b2129ad0a1dfcd42602ce1ad6f694d1f7 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.h +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.h @@ -139,26 +139,27 @@ class LogsumexpGradKernel : public framework::OpKernel { broadcast_dim[0]); } else { int rank = input->dims().size(); + LogsumexpGradFunctor functor; switch (rank) { case 1: ReduceGradFunctor( context.template device_context(), *input, *output, - *output_grad, input_grad, axis); + *output_grad, input_grad, functor, axis); break; case 2: ReduceGradFunctor( context.template device_context(), *input, *output, - *output_grad, input_grad, axis); + *output_grad, input_grad, functor, axis); break; case 3: ReduceGradFunctor( context.template device_context(), *input, *output, - *output_grad, input_grad, axis); + *output_grad, input_grad, functor, axis); break; case 4: ReduceGradFunctor( context.template device_context(), *input, *output, - *output_grad, input_grad, axis); + *output_grad, input_grad, functor, axis); break; } } diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu index 197ced2beaac26299ae1ed705ae49d0055dc3c02..30a699e979efc40190a5c83850340f1f15dd918a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu @@ -22,4 +22,6 @@ REGISTER_OP_CUDA_KERNEL( ops::ReduceCudaKernel, ops::ReduceCudaKernel, - ops::ReduceCudaKernel); + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 2e5bd7a42b1d1a4224f6aa516e7b6adb28b4f17a..87f51e4b8002f277a50ca0af5abf1e0f43214758 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -143,7 +143,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context, const framework::Tensor* x, const framework::Tensor* out, const framework::Tensor* dout, framework::Tensor* dx, - const std::vector& dims) { + Functor functor, const std::vector& dims) { const int64_t unreduced = out->numel(); const int64_t reduced = x->numel() / unreduced; DDim out_dim(out->dims()); @@ -157,7 +157,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context, dx->Resize({unreduced, reduced}); ReduceGradFunctor( context.template device_context(), shuffled_x, *out, *dout, - dx, {1}); + dx, functor, {1}); // transpose dX std::vector origin_axis(x_dim.size()); GetOriginDimFromShuffled(x_dim, dims, &origin_axis); @@ -333,7 +333,7 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context, const framework::Tensor* input0, const framework::Tensor* input1, const framework::Tensor* input2, - paddle::framework::Tensor* output, + paddle::framework::Tensor* output, Functor functor, const std::vector& dims, bool reduce_all = false) { if (reduce_all) { @@ -345,7 +345,6 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context, *context.template device_context().eigen_device(); auto broadcast_dim = 
Eigen::array({{static_cast(input0->numel())}}); - Functor functor; functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, broadcast_dim[0]); } else { @@ -354,36 +353,36 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context, case 1: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; case 2: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; case 3: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; case 4: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; case 5: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; case 6: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; default: - HandleLargeDimGrad(context, input0, input1, - input2, output, dims); + HandleLargeDimGrad( + context, input0, input1, input2, output, functor, dims); break; } } @@ -430,8 +429,10 @@ class ReduceGradKernel : public framework::OpKernel { // NOTE(dengkaipeng): Out is unnecessary in some reduce kernel and // not be set as Input in grad Maker, use Out_grad to replace here if (!input1) input1 = input2; - LaunchReduceGradKernel( - context, input0, input1, input2, output, const_dims, reduce_all); + Functor functor; + LaunchReduceGradKernel(context, input0, input1, + input2, output, functor, + const_dims, reduce_all); } void Compute(const framework::ExecutionContext& context) const override { @@ -556,7 +557,7 @@ class ReduceOp : public framework::OperatorWithKernel { if (ctx.InputVar("X")->IsType()) { if (!reduce_all) { return framework::KernelSignature( - "sum", {"X"}, {"dim", "keep_dim", "out_dtype"}, {"Out"}); + "sum", {"X"}, {"dim", "out_dtype", "keep_dim"}, {"Out"}); } return framework::KernelSignature( "sum_raw", {"X"}, {"dim", "keep_dim", "reduce_all", "out_dtype"}, diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_function.h b/paddle/fluid/operators/reduce_ops/reduce_op_function.h index 3da27bc8ac8d448471b9ff3779ac6aca59fac523..1f3839c8dc7e6d1285462c0e442a5f856dd50066 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_function.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_function.h @@ -74,7 +74,7 @@ void ReduceGradFunctor(const DeviceContext& context, const framework::Tensor& input0, const framework::Tensor& input1, const framework::Tensor& input2, - framework::Tensor* output, + framework::Tensor* output, Functor functor, const std::vector& dims) { auto x = EigenTensor::From(input0); auto x_grad = EigenTensor::From(*output); @@ -100,7 +100,6 @@ void ReduceGradFunctor(const DeviceContext& context, auto& place = *context.eigen_device(); - Functor functor; functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, broad_cats_times); } diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index dc82d7c6c1ee49d6b3f74dbd5d0b1c835819266e..6c2d5ebcc7d880aa33786df153f270db685f3525 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -38,33 +38,6 @@ namespace operators { using Tensor = framework::Tensor; -inline std::vector get_new_shape( - const std::vector 
&list_new_shape_tensor) { - // get tensor from - std::vector vec_new_shape; - for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { - auto tensor = list_new_shape_tensor[i]; - PADDLE_ENFORCE_EQ( - tensor->dims(), framework::make_ddim({1}), - platform::errors::InvalidArgument( - "If the element type of 'shape' in ReshapeOp is Tensor, " - "the element's shape must be [1]. But received the element's shape " - "is [%s]", - tensor->dims())); - if (platform::is_gpu_place(tensor->place()) || - platform::is_xpu_place(tensor->place())) { - framework::Tensor temp; - paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); - - vec_new_shape.push_back(static_cast(*temp.data())); - } else { - vec_new_shape.push_back(static_cast(*tensor->data())); - } - } - - return vec_new_shape; -} - class ReshapeOp : public framework::OperatorWithKernel { public: ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -370,30 +343,6 @@ class ReshapeKernel { void operator()(const framework::ExecutionContext &ctx) const { auto *out = ctx.Output("Out"); auto *in = ctx.Input("X"); - // framework::DDim out_dims = out->dims(); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*in); - - // we can't MakePtenDenseTensor by out, because the out of reshape may have - // multiple states, some can MakePtenDenseTensor but other's cannot: - // 1. out tensor is not initialized - // 2. out tensor is input (complete inplace) - // 3. out tensor is view of input - // We can't MakePtenDenseTensor for case 2, so we solve this case by - // creating a temporary tensor here: - pten::DenseTensorMeta meta{pten::TransToPtenDataType(in->type()), - in->dims(), in->layout()}; - auto pt_out_tmp = std::make_shared( - pten::make_intrusive( - ctx.GetPlace()), - std::move(meta)); - pten::DenseTensor *pt_out = nullptr; - if (in != nullptr && out != nullptr && in->Holder() != nullptr && - out->Holder() != nullptr && - in->Holder()->ptr() == out->Holder()->ptr()) { - pt_out = pt_x.get(); - } else { - pt_out = pt_out_tmp.get(); - } auto list_new_shape_tensor = ctx.MultiInput("ShapeTensor"); @@ -410,54 +359,46 @@ class ReshapeKernel { framework::Tensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); - pt_vec_shape.push_back( - std::move(*(paddle::experimental::MakePtenDenseTensor(temp)))); + pt_vec_shape.push_back(std::move(temp)); } else { - pt_vec_shape.push_back( - std::move(*(paddle::experimental::MakePtenDenseTensor(*tensor)))); + pt_vec_shape.push_back(*tensor); } } pt_scalar_shape = pten::ScalarArray(pt_vec_shape); } else if (shape_tensor) { - std::unique_ptr pt_shape; + pten::DenseTensor pt_shape; if (platform::is_gpu_place(shape_tensor->place()) || platform::is_xpu_place(shape_tensor->place())) { framework::Tensor temp; paddle::framework::TensorCopySync(*shape_tensor, platform::CPUPlace(), &temp); - pt_shape = paddle::experimental::MakePtenDenseTensor(temp); + pt_shape = std::move(temp); } else { - pt_shape = paddle::experimental::MakePtenDenseTensor(*shape_tensor); + pt_shape = *shape_tensor; } - pt_scalar_shape = pten::ScalarArray(*pt_shape.get()); + pt_scalar_shape = pten::ScalarArray(pt_shape); } else { auto &shape_attr = ctx.Attr>("shape"); pt_scalar_shape = pten::ScalarArray(shape_attr); } if (platform::is_cpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeKernel(static_cast(dev_ctx), - *pt_x.get(), pt_scalar_shape, pt_out); + pten::ReshapeKernel(static_cast(dev_ctx), *in, + pt_scalar_shape, out); } #if defined(PADDLE_WITH_CUDA) 
|| defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeKernel(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); + pten::ReshapeKernel(dev_ctx, *in, pt_scalar_shape, out); } #endif #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeKernel(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); + pten::ReshapeKernel(static_cast(dev_ctx), *in, + pt_scalar_shape, out); } #endif - // non-inplace need move all result from pt_out to out, inplace need set - // result dims. - if (in != out) { - paddle::experimental::SharesStorage(pt_out, static_cast(out)); - } else { - out->Resize(pt_out->dims()); - } } }; @@ -468,24 +409,22 @@ class ReshapeGradKernel { auto *d_x = ctx.Output(framework::GradVarName("X")); d_x->mutable_data(ctx.GetPlace(), d_out->type()); - auto pt_d_x = paddle::experimental::MakePtenDenseTensor(*d_x); - auto pt_d_out = paddle::experimental::MakePtenDenseTensor(*d_out); - if (platform::is_cpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); pten::ReshapeGradKernel(static_cast(dev_ctx), - *pt_d_out.get(), pt_d_x.get()); + *d_out, d_x); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeGradKernel(dev_ctx, *pt_d_out.get(), pt_d_x.get()); + pten::ReshapeGradKernel(dev_ctx, *d_out, d_x); } #endif #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeGradKernel(dev_ctx, *pt_d_out.get(), pt_d_x.get()); + pten::ReshapeGradKernel(static_cast(dev_ctx), + *d_out, d_x); } #endif } @@ -498,25 +437,22 @@ class ReshapeDoubleGradKernel { auto *dd_out = ctx.Output("DDOut"); dd_out->mutable_data(ctx.GetPlace(), dd_x->type()); - auto pt_dd_x = paddle::experimental::MakePtenDenseTensor(*dd_x); - auto pt_dd_out = paddle::experimental::MakePtenDenseTensor(*dd_out); - if (platform::is_cpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); pten::ReshapeDoubleGradKernel( - static_cast(dev_ctx), *pt_dd_x.get(), - pt_dd_out.get()); + static_cast(dev_ctx), *dd_x, dd_out); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeDoubleGradKernel(dev_ctx, *pt_dd_x.get(), pt_dd_out.get()); + pten::ReshapeDoubleGradKernel(dev_ctx, *dd_x, dd_out); } #endif #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeDoubleGradKernel(dev_ctx, *pt_dd_x.get(), pt_dd_out.get()); + pten::ReshapeDoubleGradKernel( + static_cast(dev_ctx), *dd_x, dd_out); } #endif } diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index c130dbb35a0daa551f51c9a0315be90a4415a98e..a97876957abd38124c164ba934a0ce1378188659 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -46,7 +46,7 @@ using ProgramDesc = framework::ProgramDesc; using Variable = framework::Variable; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; namespace details { @@ -86,21 +86,21 @@ static void CheckOutputVarStatus(const Variable &src_var, "RunProgram(Grad)Op's internal " "scope is not initialized.", var_name)); - } else if (dst_var.IsType()) { + } else if (dst_var.IsType()) { PADDLE_ENFORCE_EQ( - 
src_var.IsType(), true, + src_var.IsType(), true, platform::errors::InvalidArgument( "The output variable %s get from " "RunProgram(Grad)Op's internal scope holds " "wrong type. Expect type is SelectedRows, but receive type is %s.", var_name, platform::demangle(framework::ToTypeName(src_var.Type())))); - PADDLE_ENFORCE_EQ(src_var.Get().value().IsInitialized(), true, - platform::errors::InvalidArgument( - "The tensor in output variable %s get from " - "RunProgram(Grad)Op's " - "internal scope is not initialized.", - var_name)); + PADDLE_ENFORCE_EQ(src_var.Get().value().IsInitialized(), + true, platform::errors::InvalidArgument( + "The tensor in output variable %s get from " + "RunProgram(Grad)Op's " + "internal scope is not initialized.", + var_name)); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -118,12 +118,12 @@ static void VariableShare(const Variable &src_var, Variable *dst_var) { auto *lod_tensor = dst_var->GetMutable(); lod_tensor->ShareDataWith(src_var.Get()); lod_tensor->set_lod(src_var.Get().lod()); - } else if (src_var.IsType()) { - auto *selected_rows = dst_var->GetMutable(); + } else if (src_var.IsType()) { + auto *selected_rows = dst_var->GetMutable(); selected_rows->mutable_value()->ShareDataWith( - src_var.Get().value()); - selected_rows->set_rows(src_var.Get().rows()); - selected_rows->set_height(src_var.Get().height()); + src_var.Get().value()); + selected_rows->set_rows(src_var.Get().rows()); + selected_rows->set_height(src_var.Get().height()); } } diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h index 5ed71a26c8aa3563656baab3c7751a150d5f105f..2a61d7ce0c25b786bb5713e342d371d48ad2d04d 100644 --- a/paddle/fluid/operators/save_op.h +++ b/paddle/fluid/operators/save_op.h @@ -56,7 +56,7 @@ class SaveOpKernel : public framework::OpKernel { if (input_var->IsType()) { SaveLodTensor(ctx, place, input_var, filename); - } else if (input_var->IsType()) { + } else if (input_var->IsType()) { SaveSelectedRows(ctx, place, input_var, filename); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -105,7 +105,7 @@ class SaveOpKernel : public framework::OpKernel { const platform::Place &place, const framework::Variable *var, const std::string &filename) const { - auto &selectedRows = var->Get(); + auto &selectedRows = var->Get(); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 86f4e1b3ac3ba0b3cfef98e322f89627d4e927da..a195452791048d9875602285551a00cf6e42c7a8 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -15,7 +15,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/scale_op.h" #include #include "paddle/fluid/platform/float16.h" -#include "paddle/pten/ops/compat/scale_args_fn.h" namespace paddle { namespace framework { @@ -71,12 +70,6 @@ class ScaleOp : public framework::OperatorWithKernel { #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } - - framework::KernelSignature GetExpectedPtenKernelArgs( - const framework::ExecutionContext &ctx) const override { - framework::ExecutionArgumentMappingContext arg_mapping_ctx(ctx); - return pten::ScaleOpArgumentMapping(arg_mapping_ctx); - } }; class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index a04837b6949e0f3ac0e0dda913c878166fb63311..2a30d3f0b08842b5b876847e22dd0ccea2956914 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -55,9 +55,9 @@ class ScaleKernel : public framework::OpKernel { } auto* out_var = ctx.OutputVar("Out"); - if (in_var->IsType() && in_var != out_var) { - auto& in_slr = in_var->Get(); - auto* out_slr = out_var->GetMutable(); + if (in_var->IsType() && in_var != out_var) { + auto& in_slr = in_var->Get(); + auto* out_slr = out_var->GetMutable(); out_slr->set_rows(in_slr.rows()); out_slr->set_height(in_slr.height()); } diff --git a/paddle/fluid/operators/scale_op_mlu.cc b/paddle/fluid/operators/scale_op_mlu.cc index 8d9690a866ae26abe0817d98636a45d58735aefd..1e1187845ce477f939e8cf21650076c875861f3d 100644 --- a/paddle/fluid/operators/scale_op_mlu.cc +++ b/paddle/fluid/operators/scale_op_mlu.cc @@ -57,9 +57,9 @@ class ScaleMLUKernel : public framework::OpKernel { MLUCnnl::Fill(ctx, bias, bias_desc.get(), GetBasePtr(&bias_tensor)); auto* out_var = ctx.OutputVar("Out"); - if (in_var->IsType() && in_var != out_var) { - auto& in_slr = in_var->Get(); - auto* out_slr = out_var->GetMutable(); + if (in_var->IsType() && in_var != out_var) { + auto& in_slr = in_var->Get(); + auto* out_slr = out_var->GetMutable(); out_slr->set_rows(in_slr.rows()); out_slr->set_height(in_slr.height()); } diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc index 4960f720ee39aaa130544befc9b0a6449d5381d9..026a5dda89b5f07423090cb83bfb73e706cba7b7 100644 --- a/paddle/fluid/operators/scale_op_xpu.cc +++ b/paddle/fluid/operators/scale_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/scale_op.h" #include -#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/pten/kernels/scale_kernel.h" namespace paddle { namespace operators { @@ -32,30 +32,21 @@ class ScaleXPUKernel : public framework::OpKernel { auto bias = static_cast(ctx.Attr("bias")); auto bias_after_scale = ctx.Attr("bias_after_scale"); auto* out_var = ctx.OutputVar("Out"); - if (in_var->IsType() && in_var != out_var) { - auto& in_slr = in_var->Get(); - auto* out_slr = out_var->GetMutable(); + if (in_var->IsType() && in_var != out_var) { + auto& in_slr = in_var->Get(); + auto* out_slr = out_var->GetMutable(); out_slr->set_rows(in_slr.rows()); out_slr->set_height(in_slr.height()); } auto* out = framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var); out->mutable_data(in->place()); - PADDLE_ENFORCE_EQ( - in->dims(), out->dims(), - platform::errors::InvalidArgument("In and out should have the same dim," - " expected %s, but got %s.", - in->dims().to_str().c_str(), - out->dims().to_str().c_str())); auto& dev_ctx = ctx.template device_context(); - int r = xpu::scale(dev_ctx.x_context(), - reinterpret_cast(in->data()), - reinterpret_cast(out->data()), in->numel(), - bias_after_scale, scale, bias); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU scale kernel return wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + // call pten kernel + pten::ScaleKernel( + static_cast::TYPE&>(dev_ctx), + *in, scale, bias, bias_after_scale, out); } }; diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h index 1f90c041c095331db427ddd5f9a656e948947e46..cac8c10c207a51e7de1bc2ca3346394f39da8ddf 100644 --- a/paddle/fluid/operators/shape_op.h +++ b/paddle/fluid/operators/shape_op.h @@ -21,7 +21,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; template class ShapeKernel : public framework::OpKernel { @@ -29,8 +29,8 @@ class ShapeKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* in_var = ctx.InputVar("Input"); framework::DDim in_dims; - if (in_var->IsType()) { - in_dims = in_var->Get().value().dims(); + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); } else { in_dims = in_var->Get().dims(); } diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc index 94f4737191d11a7ca8a3dd3e7f40399d08813486..89a1e952d1dc558dfad55604713ec601d7ccc125 100644 --- a/paddle/fluid/operators/shape_op_npu.cc +++ b/paddle/fluid/operators/shape_op_npu.cc @@ -22,7 +22,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using SelectedRows = framework::SelectedRows; template class ShapeNPUKernel : public framework::OpKernel { @@ -30,8 +29,8 @@ class ShapeNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* in_var = ctx.InputVar("Input"); framework::DDim in_dims; - if (in_var->IsType()) { - in_dims = in_var->Get().value().dims(); + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); } else { in_dims = in_var->Get().dims(); } diff --git a/paddle/fluid/operators/share_data_op.h b/paddle/fluid/operators/share_data_op.h index d876b4fabd5c09bf32322cf1a63e0c0fe7ed7d25..f668a1cf01dfc5c141f94997aafb18f1a730707d 100644 --- a/paddle/fluid/operators/share_data_op.h +++ 
b/paddle/fluid/operators/share_data_op.h @@ -29,9 +29,8 @@ class ShareDataKernel : public framework::OpKernel { auto *detach_tensor = out_var->GetMutable(); detach_tensor->ShareDataWith(origin_tensor); } else { - const auto &origin_selected_rows = in_var->Get(); - auto *detach_selected_rows = - out_var->GetMutable(); + const auto &origin_selected_rows = in_var->Get(); + auto *detach_selected_rows = out_var->GetMutable(); detach_selected_rows->mutable_value()->ShareDataWith( origin_selected_rows.value()); } diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc index 7e21cba14b7dcaad215aa040958a656e9b3058ec..6395aa1caa01b9578d55e1155b0d6cd0d2295e36 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc @@ -18,6 +18,7 @@ #include #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { @@ -41,24 +42,41 @@ class SigmoidCrossEntropyWithLogitsXPUKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); // attrs - bool normalize = context.Attr("normalize"); - PADDLE_ENFORCE_EQ( - normalize, false, - platform::errors::InvalidArgument("normalize only support true now.")); int ignore_index = context.Attr("ignore_index"); - PADDLE_ENFORCE_EQ(ignore_index, kIgnoreIndex, - platform::errors::InvalidArgument( - "ignore_index only support %d now.", kIgnoreIndex)); + bool normalize = context.Attr("normalize"); + + // allocate temp memory + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int* hit = RAII_GUARD.alloc_l3_or_gm(input->numel()); + PADDLE_ENFORCE_NOT_NULL( + hit, platform::errors::External("XPU alloc_l3_or_gm returns nullptr")); int r = xpu::sigmoid_cross_entropy_with_logits( dev_ctx.x_context(), reinterpret_cast(input->data()), reinterpret_cast(label->data()), - reinterpret_cast(output->data()), 1, input->numel()); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU sigmoid_cross_entropy_with_logits " - "kernel return wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + reinterpret_cast(output->data()), 1, input->numel(), hit, + ignore_index); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sigmoid_cross_entropy_with_logits"); + if (normalize) { + int* non_zero = RAII_GUARD.alloc_l3_or_gm(1); + PADDLE_ENFORCE_NOT_NULL( + non_zero, + platform::errors::External("XPU alloc_l3_or_gm returns nullptr")); + int r = xpu::nonzero_count(dev_ctx.x_context(), + reinterpret_cast(hit), + non_zero, input->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count"); + int non_zero_cpu = 0; + memory::Copy(platform::CPUPlace(), static_cast(&non_zero_cpu), + context.GetPlace(), static_cast(non_zero), + sizeof(int)); + r = xpu::scale(dev_ctx.x_context(), + reinterpret_cast(output->data()), + reinterpret_cast(output->data()), + input->numel(), false, + 1.0f / static_cast(non_zero_cpu), 0.0f); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale"); + } } }; @@ -81,16 +99,42 @@ class SigmoidCrossEntropyWithLogitsGradXPUKernel dx->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); + // attrs + int ignore_index = context.Attr("ignore_index"); + bool normalize = context.Attr("normalize"); + + // allocate temp memory + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int* hit = RAII_GUARD.alloc_l3_or_gm(input->numel()); + 
PADDLE_ENFORCE_NOT_NULL(
+        hit, platform::errors::External("XPU alloc_l3_or_gm returns nullptr"));
+
     int r = xpu::sigmoid_cross_entropy_with_logits_grad(
         dev_ctx.x_context(), reinterpret_cast(input->data()),
         reinterpret_cast(label->data()),
         reinterpret_cast(dy->data()),
-        reinterpret_cast(dx->data()), 1, input->numel());
-    PADDLE_ENFORCE_EQ(
-        r, XPU_SUCCESS,
-        platform::errors::External("XPU sigmoid_cross_entropy_with_logits_grad "
-                                   "kernel return wrong value[%d %s]",
-                                   r, XPUAPIErrorMsg[r]));
+        reinterpret_cast(dx->data()), 1, input->numel(), hit,
+        ignore_index);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "sigmoid_cross_entropy_with_logits_grad");
+    if (normalize) {
+      int* non_zero = RAII_GUARD.alloc_l3_or_gm(1);
+      PADDLE_ENFORCE_NOT_NULL(
+          non_zero,
+          platform::errors::External("XPU alloc_l3_or_gm returns nullptr"));
+      int r = xpu::nonzero_count(dev_ctx.x_context(),
+                                 reinterpret_cast(hit),
+                                 non_zero, input->numel());
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
+      int non_zero_cpu = 0;
+      memory::Copy(platform::CPUPlace(), static_cast(&non_zero_cpu),
+                   context.GetPlace(), static_cast(non_zero),
+                   sizeof(int));
+      r = xpu::scale(dev_ctx.x_context(),
+                     reinterpret_cast(dx->data()),
+                     reinterpret_cast(dx->data()), input->numel(),
+                     false, 1.0f / static_cast(non_zero_cpu), 0.0f);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
+    }
   }
 };
diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc
index 6207c33f9d6299605d24f11c13820eac47ee6c98..f36124078054e87f8218f2bb82ff4e58b22fc0ae 100644
--- a/paddle/fluid/operators/sign_op.cc
+++ b/paddle/fluid/operators/sign_op.cc
@@ -14,7 +14,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/sign_op.h"
 #include
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/core/infermeta_utils.h"
+#include "paddle/pten/infermeta/unary.h"
 namespace paddle {
 namespace operators {
@@ -22,14 +25,6 @@ namespace operators {
 class SignOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "sign");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "sign");
-
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
 };
 template
@@ -64,9 +59,12 @@ class SignGradMaker : public framework::SingleGradOpMaker {
 namespace ops = paddle::operators;
+DELCARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor,
+                            PT_INFER_META(pten::UnchangedInferMetaNew));
 REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker,
                   ops::SignGradMaker,
-                  ops::SignGradMaker);
+                  ops::SignGradMaker,
+                  SignInferShapeFunctor);
 REGISTER_OP_CPU_KERNEL(
     sign, ops::SignKernel,
     ops::SignKernel);
diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc
index 0adc12e684c3a4c816bddb29043e638fbb368ae9..a0d4b4c4eb4604ef699acd20807856ccada8717d 100644
--- a/paddle/fluid/operators/softmax_op_xpu.cc
+++ b/paddle/fluid/operators/softmax_op_xpu.cc
@@ -45,8 +45,8 @@ class SoftmaxXPUKernel : public framework::OpKernel {
     auto& dev_ctx = context.template device_context();
     int r = XPU_SUCCESS;
-    paddle::platform::XPUVersion version = dev_ctx.xpu_version();
-    if (version == paddle::platform::XPUVersion::XPU1) {
+    auto version = dev_ctx.xpu_version();
+    if (version == pten::backends::xpu::XPUVersion::XPU1) {
       xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
       XPUType* clip_x_data_l3 =
           RAII_GUARD.alloc_l3_or_gm(x->numel());
       r = xpu::clip_v2(dev_ctx.x_context(),
diff --git a/paddle/fluid/operators/split_op_mlu.cc b/paddle/fluid/operators/split_op_mlu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c569c9bf091335a01dfb2d70808cb6ce0bb66812
--- /dev/null
+++ b/paddle/fluid/operators/split_op_mlu.cc
@@ -0,0 +1,88 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/split_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template
+class SplitMLUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // init parameter
+    auto* in = ctx.Input("X");
+    auto outs = ctx.MultiOutput("Out");
+    int num = ctx.Attr("num");
+    std::vector sections = ctx.Attr>("sections");
+    int axis = ctx.Attr("axis");
+    auto in_dims = in->dims();
+    auto out_size = outs.size();
+    auto num_tensor = num == 0 ? out_size : num;
+
+    bool need_resize_outs_dims = false;
+    if (ctx.HasInput("AxisTensor")) {
+      auto* axis_tensor = ctx.Input("AxisTensor");
+      axis = GetDataFromTensor(axis_tensor)[0];
+      need_resize_outs_dims = true;
+    }
+    auto sections_tensor_list =
+        ctx.MultiInput("SectionsTensorList");
+    if (sections_tensor_list.size() > 0) {
+      sections = GetDataFromTensorList(sections_tensor_list);
+      need_resize_outs_dims = true;
+    }
+    if (need_resize_outs_dims) {
+      std::vector outs_dims =
+          UpdateOutsDims(true, true, in_dims, num, sections, axis, out_size);
+      for (size_t j = 0; j < outs.size(); ++j) {
+        outs[j]->Resize(outs_dims[j]);
+      }
+    }
+
+    // init out tensors
+    std::vector vct_tensor;
+    std::vector output_descs;
+    std::vector desc_vector;
+    auto place = ctx.GetPlace();
+    for (size_t i = 0; i < outs.size(); i++) {
+      outs[i]->mutable_data(ctx.GetPlace());
+      output_descs.emplace_back(MLUCnnlTensorDesc(
+          *outs[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(outs[i]->type())));
+      desc_vector.push_back(output_descs.back().get());
+      vct_tensor.push_back(GetBasePtr(outs[i]));
+    }
+    // init in tensors
+    MLUCnnlTensorDesc input_desc(*in, CNNL_LAYOUT_ARRAY,
+                                 ToCnnlDataType(in->type()));
+
+    // let the MLU backend perform the actual split
+    MLUCnnl::Split(ctx, num_tensor, axis, input_desc.get(), GetBasePtr(in),
+                   desc_vector.data(), vct_tensor.data());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_MLU_KERNEL(split, ops::SplitMLUKernel,
+                       ops::SplitMLUKernel, ops::SplitMLUKernel,
+                       ops::SplitMLUKernel,
+                       ops::SplitMLUKernel);
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index 0f520adba57a203fae5d3b34fb67067d01691bed..00aab6b75006aec9b2ff397f2589174aeee615f9 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -165,9 +165,9 @@ class SumOp : public framework::OperatorWithKernel {
       return framework::OpKernelType(data_type,
ctx.GetPlace(), layout, library); - } else if (x_vars[0]->IsType()) { + } else if (x_vars[0]->IsType()) { for (auto& var : x_vars) { - auto& value = var->Get().value(); + auto& value = var->Get().value(); if (value.IsInitialized()) { return framework::OpKernelType(value.type(), ctx.device_context(), layout, library); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 4288e9415aa8699d1f98d186d74801bf890d757e..9de9b0b6338dfc78ba06d750ce2c18823d0eda53 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -151,7 +151,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { if (lod_length && in_i.IsInitialized()) { in_data.emplace_back(in_i.data()); } - } else if (in_vars[i]->IsType()) { + } else if (in_vars[i]->IsType()) { selectrow_index.push_back(i); } } @@ -162,7 +162,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { size_t rows = 0; int64_t length = 0; for (auto index : selectrow_index) { - auto &sr = in_vars[index]->Get(); + auto &sr = in_vars[index]->Get(); auto &sr_value = sr.value(); auto &sr_rows = sr.rows(); @@ -235,7 +235,7 @@ class SumKernel if (out_var->IsType()) { SumToLoDTensor(context); - } else if (out_var->IsType()) { + } else if (out_var->IsType()) { SelectedRowsCompute(context); } else if (out_var->IsType()) { LodTensorArrayCompute(context); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 61a9c8b11508f2c9e300c16a661f744bc0248c08..4e108b56a404d590b02c098c845d08b958f15f9a 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -21,7 +21,7 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using LoDTensor = framework::LoDTensor; template @@ -37,32 +37,32 @@ void SelectedRowsCompute(const framework::ExecutionContext &context) { return; } - std::vector inputs; + std::vector inputs; SelectedRows temp_in0; if (in_place) { - auto &in0 = in_vars[0]->Get(); + auto &in0 = in_vars[0]->Get(); temp_in0.set_height(in0.height()); temp_in0.set_rows(in0.rows()); framework::TensorCopy(in0.value(), in0.place(), context.device_context(), temp_in0.mutable_value()); inputs.push_back(&temp_in0); for (size_t i = 1; i < in_vars.size(); ++i) { - auto &in = in_vars[i]->Get(); + auto &in = in_vars[i]->Get(); if (in.rows().size() > 0) { inputs.push_back(&in); } } } else { for (auto &in_var : in_vars) { - auto &in = in_var->Get(); + auto &in = in_var->Get(); if (in.rows().size() > 0) { - inputs.push_back(&in_var->Get()); + inputs.push_back(&in_var->Get()); } } } - auto *out = context.Output("Out"); + auto *out = context.Output("Out"); out->mutable_rows()->clear(); bool has_data = false; @@ -183,8 +183,8 @@ class SumKernel : public framework::OpKernel { } auto in = EigenVector::Flatten(in_t); result.device(place) = result + in; - } else if (in_vars[i]->IsType()) { - auto &in_t = in_vars[i]->Get(); + } else if (in_vars[i]->IsType()) { + auto &in_t = in_vars[i]->Get(); functor(context.template device_context(), in_t, out); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -194,7 +194,7 @@ class SumKernel : public framework::OpKernel { framework::ToTypeName(in_vars[i]->Type()))); } } - } else if (out_var->IsType()) { + } else if (out_var->IsType()) { SelectedRowsCompute(context); } else if (out_var->IsType()) { LodTensorArrayCompute(context); diff --git a/paddle/fluid/operators/sum_op_mlu.cc 
b/paddle/fluid/operators/sum_op_mlu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e2cd649722b2444bb7a032eac18760e582db71d8
--- /dev/null
+++ b/paddle/fluid/operators/sum_op_mlu.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/sum_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template
+class SumMLUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto out_var = ctx.OutputVar("Out");
+    if (out_var->IsType()) {
+      // init
+      auto *out = out_var->GetMutable();
+      auto ins = ctx.MultiInput("X");
+      out->mutable_data(ctx.GetPlace());
+      auto place = ctx.GetPlace();
+      int ins_size = static_cast(ins.size());
+      if (ins_size == 1) {
+        TensorCopy(*ins[0], place, out);
+        return;
+      }
+
+      // accumulate the inputs on the MLU with MLUCnnl::AddN
+      std::vector inputs;
+      std::vector input_descs;
+      std::vector desc_vector;
+      for (int i = 0; i < ins_size; i++) {
+        input_descs.emplace_back(MLUCnnlTensorDesc(
+            *ins[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(ins[i]->type())));
+        desc_vector.push_back(input_descs.back().get());
+        inputs.push_back(GetBasePtr(ins[i]));
+      }
+      // init out tensors
+      MLUCnnlTensorDesc output_desc(*out, CNNL_LAYOUT_ARRAY,
+                                    ToCnnlDataType(out->type()));
+      uint32_t ins_size_t = static_cast(ins_size);
+      MLUCnnl::AddN(ctx, ins_size_t, desc_vector.data(), inputs.data(),
+                    output_desc.get(), GetBasePtr(out));
+
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Expected type of Output(Out) must be LoDTensor, but got "
+          "unsupported type: %s.",
+          framework::ToTypeName(out_var->Type())));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_MLU_KERNEL(
+    sum, ops::SumMLUKernel,
+    ops::SumMLUKernel);
diff --git a/paddle/fluid/operators/top_k_op_mlu.cc b/paddle/fluid/operators/top_k_op_mlu.cc
index affe5a4bc6c2dc603fe5a4cc4ef91c297ec81d59..e5064ed90d5d718c63aaf68c5630f12b7032483a 100644
--- a/paddle/fluid/operators/top_k_op_mlu.cc
+++ b/paddle/fluid/operators/top_k_op_mlu.cc
@@ -33,8 +33,7 @@ class TopkMLUKernel : public framework::OpKernel {
       auto k_t_ptr = static_cast(k_t->data());
       auto size = k_t->numel() * sizeof(int);
       memory::Copy(platform::CPUPlace(), reinterpret_cast(&k),
-                   BOOST_GET_CONST(platform::MLUPlace, k_t->place()), k_t_ptr,
-                   size, nullptr);
+                   k_t->place(), k_t_ptr, size, nullptr);
       framework::DDim output_dims = output->dims();
       output_dims[output_dims.size() - 1] = k;
       output->Resize(output_dims);
diff --git a/paddle/fluid/operators/top_k_v2_op_mlu.cc b/paddle/fluid/operators/top_k_v2_op_mlu.cc
index 08c960186bafeb59ab6657e2445a3d5a9c58b6ab..cc05e11495b7bbe278cd79aa09cb35077e659d05 100644
--- a/paddle/fluid/operators/top_k_v2_op_mlu.cc
+++ b/paddle/fluid/operators/top_k_v2_op_mlu.cc
@@ -43,8 +43,7 @@ class TopkV2MLUKernel : public framework::OpKernel {
       auto k_t_ptr = static_cast(k_t->data());
       auto size = k_t->numel() * sizeof(int);
       memory::Copy(platform::CPUPlace(), reinterpret_cast(&k),
-                   BOOST_GET_CONST(platform::MLUPlace, k_t->place()), k_t_ptr,
-                   size, nullptr);
+                   k_t->place(), k_t_ptr, size, nullptr);
       framework::DDim output_dims = output->dims();
       // according to axis, set the K value in the output dims
       output_dims[axis] = k;
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index cdb4ad7c40826b94e00dbeba947025b7edf6cfeb..8c603a7c5d8c8f30c769ac53a914bff8305b24cb 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -73,8 +73,8 @@ class CPUUniformRandomKernel : public framework::OpKernel {
       }
     }
-    if (out_var->IsType<framework::SelectedRows>()) {
-      auto *selected_rows = out_var->GetMutable<framework::SelectedRows>();
+    if (out_var->IsType<pten::SelectedRows>()) {
+      auto *selected_rows = out_var->GetMutable<pten::SelectedRows>();
       tensor = selected_rows->mutable_value();
       auto shape = ctx.Attr>("shape");
       if (!new_shape.empty()) shape = new_shape;
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index 63eecd15c2d69bab3a4e8230f6fa947e3662f22d..5278bdd2f1c7255e4e407fb98de80e79360e5430 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -111,8 +111,8 @@ class GPUUniformRandomKernel : public framework::OpKernel {
       }
     }
-    if (out_var->IsType<framework::SelectedRows>()) {
-      auto* selected_rows = out_var->GetMutable<framework::SelectedRows>();
+    if (out_var->IsType<pten::SelectedRows>()) {
+      auto* selected_rows = out_var->GetMutable<pten::SelectedRows>();
       tensor = selected_rows->mutable_value();
       auto shape = context.Attr>("shape");
       if (!new_shape.empty()) shape = new_shape;
diff --git a/paddle/fluid/operators/uniform_random_op_npu.cc b/paddle/fluid/operators/uniform_random_op_npu.cc
index 1c2f2b07ce897524467ae1877f4a3252571d0106..6812a2b0b7085c6be68325ec506860d3e1b2c4e6 100644
--- a/paddle/fluid/operators/uniform_random_op_npu.cc
+++ b/paddle/fluid/operators/uniform_random_op_npu.cc
@@ -40,8 +40,8 @@ class NPUUniformRandomKernel : public framework::OpKernel {
       }
     }
-    if (out_var->IsType<framework::SelectedRows>()) {
-      auto *selected_rows = out_var->GetMutable<framework::SelectedRows>();
+    if (out_var->IsType<pten::SelectedRows>()) {
+      auto *selected_rows = out_var->GetMutable<pten::SelectedRows>();
       tensor = selected_rows->mutable_value();
       auto shape = ctx.Attr>("shape");
       if (!new_shape.empty()) shape = new_shape;
diff --git a/paddle/fluid/operators/uniform_random_op_xpu.cc b/paddle/fluid/operators/uniform_random_op_xpu.cc
index fed0accd8a14cd7f2434e117b3145e81cfccafd4..848b72727bd28295e8e1a2b3d9e231b3c34c733d 100644
--- a/paddle/fluid/operators/uniform_random_op_xpu.cc
+++ b/paddle/fluid/operators/uniform_random_op_xpu.cc
@@ -41,8 +41,8 @@ class XPUUniformRandomKernel : public framework::OpKernel {
      }
    }
-    if (out_var->IsType<framework::SelectedRows>()) {
-      auto *selected_rows = out_var->GetMutable<framework::SelectedRows>();
+    if (out_var->IsType<pten::SelectedRows>()) {
+      auto *selected_rows = out_var->GetMutable<pten::SelectedRows>();
       tensor = selected_rows->mutable_value();
       auto shape = ctx.Attr>("shape");
       if (!new_shape.empty()) shape = new_shape;
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 21531a3efd64f33dbd90c8d3114fc54020db427f..eb7057bcd50addd8053738b81b79cf6d0a915941 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -1,9 +1,7 @@
 proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
-proto_library(error_codes_proto SRCS error_codes.proto)
 if(WITH_GPU)
   proto_library(external_error_proto SRCS external_error.proto)
 endif(WITH_GPU)
-
 if (WITH_PYTHON)
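# Sketch of the dependency rewiring implied by the deletion above (an
# assumption drawn from the enforce_deps change just below, which swaps in
# pten_enforce): with error_codes.proto gone, the errors target is no longer
# generated from a proto, so a consumer that previously wrote
#   cc_library(my_checker SRCS my_checker.cc DEPS error_codes_proto errors)
# would now only need
#   cc_library(my_checker SRCS my_checker.cc DEPS errors enforce)
# where my_checker is a hypothetical example target, not one from this patch.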
py_proto_compile(profiler_py_proto SRCS profiler.proto) add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) @@ -28,10 +26,9 @@ endif() cc_library(flags SRCS flags.cc DEPS gflags boost) cc_library(denormal SRCS denormal.cc DEPS) -cc_library(errors SRCS errors.cc DEPS error_codes_proto) cc_test(errors_test SRCS errors_test.cc DEPS errors enforce) -set(enforce_deps flags errors boost flags) +set(enforce_deps flags errors boost flags pten_enforce) if(WITH_GPU) set(enforce_deps ${enforce_deps} external_error_proto) endif() @@ -75,7 +72,7 @@ IF(WITH_GPU OR WITH_ROCM) ENDIF() IF(WITH_IPU) - set(IPU_CTX_DEPS ipu_backend) + set(IPU_CTX_DEPS ipu_info) ELSE() set(IPU_CTX_DEPS) ENDIF(WITH_IPU) @@ -123,7 +120,10 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} place pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} - ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} cpu_context) + ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context) +if(WITH_XPU) + target_link_libraries(device_context xpu_context) +endif() cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) if(WITH_ASCEND_CL) diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h index 43408ca207d1d2c10ba29b32b487e8a7ea99917f..4f8bbb2d2689eb6ffee1119c6eb14ef27de7a2c8 100644 --- a/paddle/fluid/platform/device/device_wrapper.h +++ b/paddle/fluid/platform/device/device_wrapper.h @@ -34,3 +34,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/mlu/enforce.h" #include "paddle/fluid/platform/device/mlu/mlu_info.h" #endif + +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index 883767348f06a99c32664ca2575880737b7418b5..d07ef73a49e7991d43d056da7d41eb83792a402b 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -165,8 +165,6 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D( return config; } -// TODO(wangchaochaohu): 3D will add later - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt index 5f711937a8098b1d8d83ac0d9f284883191fc796..d54c6a33ecbf53071956aaf4b9d342efa5746f65 100644 --- a/paddle/fluid/platform/device/ipu/CMakeLists.txt +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -1,19 +1,22 @@ IF(WITH_IPU) FILE(GLOB POPART_CANONICALIZATION_SRC ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/device/ipu/popart_canonicalization/*.cc) list(APPEND PADDLE_IPU_SRC ${POPART_CANONICALIZATION_SRC}) - set(PADDLE_IPU_LIB "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_ipu.so" CACHE STRING "") - set(PADDLE_IPU_LIB_DIR "${CMAKE_CURRENT_BINARY_DIR}" CACHE STRING "") set(IPU_BACKEND_SRC - "ipu_device.cc" "ipu_strategy.cc" "ipu_executor.cc" "ipu_compiler.cc" "ipu_backend.cc" "ipu_utils.cc" ) + set(IPU_INFO_SRC + "ipu_info.cc" + "ipu_device.cc" + ) - cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart graph framework_proto enforce graph_helper timer) - cc_library(ipu_info SRCS ipu_info.cc DEPS ipu_backend) - cc_library(paddle_ipu SHARED SRCS ${PADDLE_IPU_SRC} DEPS popart) + cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart graph graph_helper) + cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart enforce) + cc_library(paddle_ipu SHARED SRCS ${PADDLE_IPU_SRC} DEPS popart graph_helper) add_dependencies(paddle_ipu ipu_backend) + set(PADDLE_IPU_LIB "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_ipu.so" CACHE STRING "") + set(PADDLE_IPU_LIB_DIR "${CMAKE_CURRENT_BINARY_DIR}" CACHE STRING "") ENDIF() diff --git a/paddle/fluid/platform/device/ipu/ipu_device.cc b/paddle/fluid/platform/device/ipu/ipu_device.cc index cd2a628c9abe2bf8e391fcfc7b9d37b293d19936..2459f5140eb5b25af82381366f25c714beb69aaf 100644 --- a/paddle/fluid/platform/device/ipu/ipu_device.cc +++ b/paddle/fluid/platform/device/ipu/ipu_device.cc @@ -13,12 +13,26 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/device/ipu/ipu_device.h" -#include "paddle/fluid/platform/device/ipu/ipu_utils.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { namespace ipu { +// TODO(alleng) merge with ipu_utils +static bool GetBoolEnv(std::string str) { + char* str_val = getenv(str.c_str()); + if (str_val == NULL) { + return false; + } else { + bool val = false; + if (strcmp(str_val, "1") == 0 || strcmp(str_val, "true") == 0 || + strcmp(str_val, "True") == 0 || strcmp(str_val, "TRUE") == 0) + val = true; + return val; + } +} + int GetNumDevices() { bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); if (ipu_model) { diff --git a/paddle/fluid/platform/device/ipu/ipu_device.h b/paddle/fluid/platform/device/ipu/ipu_device.h index 3da13a522e19a3f6526751e48c70bdd8562d1b6c..d39feffc92655b52dae1792fab0a5ef95bb6075f 100644 --- a/paddle/fluid/platform/device/ipu/ipu_device.h +++ b/paddle/fluid/platform/device/ipu/ipu_device.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/ipu/ipu_info.cc b/paddle/fluid/platform/device/ipu/ipu_info.cc index 4506bfbf972248fd0539927c483b3e23114a6750..9e6951c37139db2bbca6a1eab7f521e850dba6db 100644 --- a/paddle/fluid/platform/device/ipu/ipu_info.cc +++ b/paddle/fluid/platform/device/ipu/ipu_info.cc @@ -16,12 +16,10 @@ namespace paddle { namespace platform { //! Get a list of device ids from environment variable or use all. -std::vector GetSelectedIPUDevices() { - return platform::ipu::GetDeviceIds(); -} +std::vector GetSelectedIPUDevices() { return ipu::GetDeviceIds(); } //! Get the total number of IPU devices in system. -int GetIPUDeviceCount() { return platform::ipu::GetNumDevices(); } +int GetIPUDeviceCount() { return ipu::GetNumDevices(); } } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc index 67012e8d4b92d8d6336f1b192a7b19828511c08e..d4a14a6d8409f9b50247f747016f5284f11037da 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc @@ -32,7 +32,7 @@ Node *mean_handler(Graph *graph, Node *node) { Node *pow_handler(Graph *graph, Node *node) { auto *op = node->Op(); - if (op->HasInput("FactorTensor") && !op->Input("FactorTensor").empty()) { + if (!op->Input("FactorTensor").empty()) { return CreateBaseOp( graph, node, "popart_pow", {GetInputVarNode("X", node), GetInputVarNode("FactorTensor", node)}, @@ -161,7 +161,7 @@ Node *scale_handler(Graph *graph, Node *node) { static_cast(framework::proto::VarType::FP32)); Node *result = nullptr; - if (op->HasInput("ScaleTensor") && !op->Input("ScaleTensor").empty()) { + if (!op->Input("ScaleTensor").empty()) { auto scale = GetInputVarNode("ScaleTensor", node); if (is_float_equal(bias_, 0.0)) { result = CreateBaseOp(graph, node, "popart_mul", diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc index b7412000107d3157c6b5c38d7c456af3bd36aabd..b731ba532d60c743278b73754deb884c800fe4d1 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc @@ -34,7 +34,7 @@ Node *conv2d_handler(Graph *graph, Node *node) { 
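    // Note on the guard simplification above and in the handlers that follow:
    // `op->HasInput("Bias") && !op->Input("Bias").empty()` collapses to
    // `!op->Input("Bias").empty()`. This assumes every op reaching these
    // canonicalization handlers declares the optional input slot, so Input()
    // returns an empty name list rather than failing when nothing is wired to
    // it. A sketch of that assumed accessor contract (illustrative only):
    //   auto bias_vars = op->Input("Bias");  // slot exists, possibly empty
    //   if (!bias_vars.empty()) { /* a Bias tensor was provided */ }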
auto pads = std::vector{pads_.begin(), pads_.end()}; auto stride_ = BOOST_GET_CONST(std::vector, op->GetAttr("strides")); auto stride = std::vector{stride_.begin(), stride_.end()}; - if (op->HasInput("Bias") && !op->Input("Bias").empty()) { + if (!op->Input("Bias").empty()) { return CreateConv( graph, node, { diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc index 662660c23b4a6a357d27565a8c6b37b25db9c9be..539053f2fb67bae4652e61a52bc3254f233d3417 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc @@ -65,7 +65,7 @@ Node *topk_handler(Graph *graph, Node *node) { Node *var_x = GetInputVarNode("X", node); Node *var_k = nullptr; - if (op->HasInput("K") && !op->Input("K").empty()) { + if (!op->Input("K").empty()) { var_k = GetInputVarNode("K", node); } else { auto k = BOOST_GET_CONST(int, op->GetAttr("k")); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc index 296668890ebe5a0f1550e41aff4424b0f87b4f95..db429d2f6228455bd4ca1a47d117ddf2ad286e65 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc @@ -23,7 +23,7 @@ namespace { Node *fill_constant_handler(Graph *graph, Node *node) { auto *op = node->Op(); - if (op->HasInput("ShapeTensor") && !op->Input("ShapeTensor").empty()) { + if (!op->Input("ShapeTensor").empty()) { PADDLE_THROW( platform::errors::Unimplemented("op fill_constant with ShapeTensor")); } @@ -328,7 +328,7 @@ Node *shape_handler(Graph *graph, Node *node) { Node *slice_handler(Graph *graph, Node *node) { auto *op = node->Op(); Node *starts = nullptr; - if (op->HasInput("StartsTensor") && !op->Input("StartsTensor").empty()) { + if (!op->Input("StartsTensor").empty()) { starts = GetInputVarNode("StartsTensor", node); } else { auto starts_ = BOOST_GET_CONST(std::vector, op->GetAttr("starts")); @@ -338,7 +338,7 @@ Node *slice_handler(Graph *graph, Node *node) { starts = starts->outputs[0]; } Node *ends = nullptr; - if (op->HasInput("EndsTensor") && !op->Input("EndsTensor").empty()) { + if (!op->Input("EndsTensor").empty()) { ends = GetInputVarNode("EndsTensor", node); } else { auto ends_ = BOOST_GET_CONST(std::vector, op->GetAttr("ends")); @@ -384,14 +384,13 @@ Node *slice_handler(Graph *graph, Node *node) { Node *expand_handler(Graph *graph, Node *node) { auto *op = node->Op(); - if (op->HasInput("expand_times_tensor") && - !op->Input("expand_times_tensor").empty()) { + if (!op->Input("expand_times_tensor").empty()) { PADDLE_THROW( platform::errors::Unimplemented("Expand op with expand_times_tensor")); } Node *expand_times = nullptr; - if (op->HasInput("ExpandTimes") && !op->Input("ExpandTimes").empty()) { + if (!op->Input("ExpandTimes").empty()) { // cast to int64 expand_times = CreateCast(graph, node, {GetInputVarNode("ExpandTimes", node)}, {}, diff --git a/paddle/fluid/platform/device/mlu/CMakeLists.txt b/paddle/fluid/platform/device/mlu/CMakeLists.txt index 9ef4439f39b6a553e83747452b32d6dd6a2e999b..a4584f54637a615d79995e1e27303128b4202b5e 100644 --- a/paddle/fluid/platform/device/mlu/CMakeLists.txt +++ b/paddle/fluid/platform/device/mlu/CMakeLists.txt @@ -5,6 +5,6 @@ IF(WITH_MLU) cc_library(mlu_stream SRCS mlu_stream.cc DEPS boost mlu_info 
stream_callback_manager) - cc_library(mlu_device_context SRCS device_context.cc DEPS mlu_stream ) + cc_library(mlu_device_context SRCS device_context.cc DEPS mlu_stream eigen3) cc_test(mlu_device_context_test SRCS device_context_test.cc DEPS mlu_device_context) ENDIF() diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index f89c8c193ae7cf37bc7d2c3b8dc4171badb4a4b4..d292ce130eb34a3c3dfd7e5496f2fbe5112064af 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -4,7 +4,7 @@ endif() set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl) -cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place) +cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place pten_xpu_info) cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context) add_subdirectory(tests) diff --git a/paddle/fluid/platform/device/xpu/enforce_xpu.h b/paddle/fluid/platform/device/xpu/enforce_xpu.h index 4c85168f68dd3a5eed07bb64912dbab5f018f2ab..ae5ec8e851d688e191fd3ed086a48cde54087d1f 100644 --- a/paddle/fluid/platform/device/xpu/enforce_xpu.h +++ b/paddle/fluid/platform/device/xpu/enforce_xpu.h @@ -15,177 +15,36 @@ limitations under the License. */ #pragma once #include "paddle/fluid/platform/device/xpu/xpu_header.h" -#include "paddle/fluid/platform/enforce.h" -#include "xpu/bkcl.h" + +#include "paddle/pten/backends/xpu/enforce_xpu.h" namespace paddle { namespace platform { // Note: XPU runtime api return int, not XPUError_t inline const char* xpuGetErrorString(int stat) { - switch (stat) { - case XPU_SUCCESS: - return "Success"; - case XPUERR_INVALID_DEVICE: - return "Invalid XPU device"; - case XPUERR_UNINIT: - return "XPU runtime not properly inited"; - case XPUERR_NOMEM: - return "Device memory not enough"; - case XPUERR_NOCPUMEM: - return "CPU memory not enough"; - case XPUERR_INVALID_PARAM: - return "Invalid parameter"; - case XPUERR_NOXPUFUNC: - return "Cannot get XPU Func"; - case XPUERR_LDSO: - return "Error loading dynamic library"; - case XPUERR_LDSYM: - return "Error loading func from dynamic library"; - case XPUERR_SIMULATOR: - return "Error from XPU Simulator"; - case XPUERR_NOSUPPORT: - return "Operation not supported"; - case XPUERR_ABNORMAL: - return "Device abnormal due to previous error"; - case XPUERR_KEXCEPTION: - return "Exception in kernel execution"; - case XPUERR_TIMEOUT: - return "Kernel execution timed out"; - case XPUERR_BUSY: - return "Resource busy"; - case XPUERR_USEAFCLOSE: - return "Use a stream after closed"; - case XPUERR_UCECC: - return "Uncorrectable ECC"; - case XPUERR_OVERHEAT: - return "Overheat"; - case XPUERR_UNEXPECT: - return "Execution error, reach unexpected control flow"; - case XPUERR_DEVRESET: - return "Device is being reset, try again later"; - case XPUERR_HWEXCEPTION: - return "Hardware module exception"; - case XPUERR_HBM_INIT: - return "Error init HBM"; - case XPUERR_DEVINIT: - return "Error init device"; - case XPUERR_PEERRESET: - return "Device is being reset, try again later"; - case XPUERR_MAXDEV: - return "Device count exceed limit"; - case XPUERR_NOIOC: - return "Unknown IOCTL command"; - case XPUERR_DMATIMEOUT: - return "DMA timed out, a reboot maybe needed"; - case XPUERR_DMAABORT: - return "DMA aborted due to error, possibly wrong address or hardware " - "state"; - case XPUERR_MCUUNINIT: - return "Firmware not initialized"; - case XPUERR_OLDFW: - return 
"Firmware version too old (<15), please update."; - case XPUERR_PCIE: - return "Error in PCIE"; - case XPUERR_FAULT: - return "Error copy between kernel and user space"; - case XPUERR_INTERRUPTED: - return "Execution interrupted by user"; - default: - return "unkonwn error"; - } + return pten::backends::xpu::xpuGetErrorString(stat); } inline const char* bkclGetErrorString(BKCLResult_t stat) { - switch (stat) { - case BKCL_SUCCESS: - return "BKCL_SUCCESS"; - case BKCL_INVALID_ARGUMENT: - return "BKCL_INVALID_ARGUMENT"; - case BKCL_RUNTIME_ERROR: - return "BKCL_RUNTIME_ERROR"; - case BKCL_SYSTEM_ERROR: - return "BKCL_SYSTEM_ERROR"; - case BKCL_INTERNAL_ERROR: - return "BKCL_INTERNAL_ERROR"; - default: - return "Unknown BKCL status"; - } + return pten::backends::xpu::bkclGetErrorString(stat); } inline const char* xdnnGetErrorString(int stat) { - switch (stat) { - case xpu::Error_t::SUCCESS: - return "XDNN_SUCCESS"; - case xpu::Error_t::INVALID_PARAM: - return "XDNN_INVALID_PARAM"; - case xpu::Error_t::RUNTIME_ERROR: - return "XDNN_RUNTIME_ERROR"; - case xpu::Error_t::NO_ENOUGH_WORKSPACE: - return "XDNN_NO_ENOUGH_WORKSPACE"; - case xpu::Error_t::NOT_IMPLEMENT: - return "XDNN_NOT_IMPLEMENT"; - default: - return "Unknown XDNN status"; - } + return pten::backends::xpu::xdnnGetErrorString(stat); } inline std::string build_xpu_error_msg(int stat) { - std::string msg("XPU Error <" + std::to_string(stat) + ">, "); - return msg + xpuGetErrorString(stat) + " "; + return pten::backends::xpu::build_xpu_error_msg(stat); } inline std::string build_xpu_error_msg(BKCLResult_t stat) { - std::string msg("BKCL Error, "); - return msg + bkclGetErrorString(stat) + " "; + return pten::backends::xpu::build_xpu_error_msg(stat); } inline std::string build_xpu_xdnn_error_msg(int stat, std::string msg) { - return msg + " XDNN Error, " + xdnnGetErrorString(stat) + " "; + return pten::backends::xpu::build_xpu_xdnn_error_msg(stat, msg); } -namespace details { - -template -struct ExternalApiType {}; - -#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ - template <> \ - struct ExternalApiType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ - } - -DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS); -DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); - -#undef DEFINE_EXTERNAL_API_TYPE - -} // namespace details - -#define PADDLE_ENFORCE_XPU_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __XPU_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::paddle::platform::details::ExternalApiType< \ - __XPU_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = paddle::platform::errors::External( \ - ::paddle::platform::build_xpu_error_msg(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - -#define PADDLE_ENFORCE_XDNN_SUCCESS(COND, MSG) \ - do { \ - auto __cond__ = (COND); \ - if (UNLIKELY(__cond__ != xpu::Error_t::SUCCESS)) { \ - auto __summary__ = paddle::platform::errors::External( \ - ::paddle::platform::build_xpu_xdnn_error_msg(__cond__, MSG)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index f83e3f6d0db3b3fbbb07a70ce6e9e40d4b675cf3..8764458433072061fd35f264549721a36c60e0d3 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -292,6 +292,10 @@ 
XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT64, XPUPlace())})}, {"scatter", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid_cross_entropy_with_logits_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid_cross_entropy_with_logits", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace())})}, {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/paddle/fluid/platform/device/xpu/xpu_header.h b/paddle/fluid/platform/device/xpu/xpu_header.h index 1177fd63742b3b4f104c6943a3e59022677f26d9..6b5c32fd511b3685291a1e7a027834be922ed872 100644 --- a/paddle/fluid/platform/device/xpu/xpu_header.h +++ b/paddle/fluid/platform/device/xpu/xpu_header.h @@ -15,42 +15,5 @@ limitations under the License. */ #pragma once #ifdef PADDLE_WITH_XPU -#include -#include -#include - -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" - -#include "xpu/runtime.h" -#include "xpu/runtime_ex.h" -#include "xpu/xdnn.h" - -namespace xpu = baidu::xpu::api; - -static std::map XPUAPIErrorMsg = { - {xpu::Error_t::SUCCESS, "xpu api success"}, - {xpu::Error_t::INVALID_PARAM, "xpu api invalid param"}, - {xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"}, - {xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}}; - -template -class XPUTypeTrait { - public: - using Type = T; -}; - -template <> -class XPUTypeTrait { - public: - using Type = float16; -}; - -template <> -class XPUTypeTrait { - public: - using Type = bfloat16; -}; - +#include "paddle/pten/backends/xpu/xpu_header.h" #endif diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index a8c6ee8f3b0353487f9e09c59e0df8baa01a868d..cf08f9ada6b300dd93bcb7b3dd3fb8c0ecb65f44 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -14,22 +14,14 @@ limitations under the License. */ #include #include #include "gflags/gflags.h" + #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -PADDLE_DEFINE_EXPORTED_string( - selected_xpus, "", - "A list of device ids separated by comma, like: 0,1,2,3. " - "This option is useful when doing multi process training and " - "each process have only one device (XPU). If you want to use " - "all visible devices, set this to empty string. NOTE: the " - "reason of doing this is that we want to use P2P communication" - "between XPU devices, use XPU_VISIBLE_DEVICES can only use" - "share-memory only."); + +#include "paddle/pten/backends/xpu/xpu_info.h" namespace paddle { namespace platform { @@ -37,101 +29,40 @@ namespace platform { /**************************** Version Management **************************/ //! 
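The `XPUTypeTrait` template deleted from `xpu_header.h` above relocates to `paddle/pten/backends/xpu/xpu_header.h`. A compile-time miniature of the idea it implements, mapping framework element types to the device API's types with an identity default (the fp16 structs below are stand-ins, not Paddle's real `float16`):

```cpp
#include <cstdint>
#include <type_traits>

struct float16 { uint16_t x; };      // stand-in for the framework type
struct device_fp16 { uint16_t x; };  // stand-in for the device API type

// Identity by default; specialized where the device expects another type.
template <typename T>
struct XPUTypeTrait { using Type = T; };

template <>
struct XPUTypeTrait<float16> { using Type = device_fp16; };

static_assert(std::is_same<XPUTypeTrait<float>::Type, float>::value,
              "plain types pass through");
static_assert(std::is_same<XPUTypeTrait<float16>::Type, device_fp16>::value,
              "fp16 is remapped");

int main() { return 0; }
```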
Get the version of XPU Driver -int GetDriverVersion() { - uint32_t driver_version_major = 0; - uint32_t driver_version_minor = 0; - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_get_driver_version(&driver_version_major, &driver_version_minor)); - int driver_version = driver_version_major * 10 + driver_version_minor; - return driver_version; -} +int GetDriverVersion() { return pten::backends::xpu::GetDriverVersion(); } //! Get the version of XPU Runtime -int GetRuntimeVersion() { - uint32_t rumtime_version_major = 0; - uint32_t rumtime_version_minor = 0; - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_get_runtime_version(&rumtime_version_major, &rumtime_version_minor)); - int runtime_version = rumtime_version_major * 10 + rumtime_version_minor; - return runtime_version; -} +int GetRuntimeVersion() { return pten::backends::xpu::GetRuntimeVersion(); } /**************************** Device Management **************************/ -static int GetDeviceCountImpl() { - const auto* xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES"); - if (xpu_visible_devices != nullptr) { - std::string xpu_visible_devices_str(xpu_visible_devices); - if (std::all_of(xpu_visible_devices_str.begin(), - xpu_visible_devices_str.end(), - [](char ch) { return ch == ' '; })) { - VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected."; - return 0; - } - } - - int count = 0; - PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_count(&count)); - return count; -} - -int GetXPUDeviceCount() { - static auto dev_cnt = GetDeviceCountImpl(); - return dev_cnt; -} +int GetXPUDeviceCount() { return pten::backends::xpu::GetXPUDeviceCount(); } int GetXPUCurrentDeviceId() { - int dev_id; - PADDLE_ENFORCE_XPU_SUCCESS(xpu_current_device(&dev_id)); - if (dev_id >= 64) { - // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id - dev_id -= 64; - } - return dev_id; + return pten::backends::xpu::GetXPUCurrentDeviceId(); } -void SetXPUDeviceId(int id) { - PADDLE_ENFORCE_LT( - id, GetXPUDeviceCount(), - platform::errors::InvalidArgument("id must less than XPU count")); - PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id)); -} +void SetXPUDeviceId(int id) { pten::backends::xpu::SetXPUDeviceId(id); } //! Get a list of device ids from environment variable or use all. std::vector GetXPUSelectedDevices() { // use user specified XPUs in single-node multi-process mode. 
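The deleted `GetDeviceCountImpl`/`GetXPUDeviceCount` pair above shows the idiom worth keeping in mind when reading the pten replacement: the environment probe runs once, memoized through a function-local static. A sketch with the runtime query stubbed out:

```cpp
#include <algorithm>
#include <cstdlib>
#include <string>

// Probe once: an XPU_VISIBLE_DEVICES value that is all spaces means
// "no devices"; otherwise ask the runtime (stubbed here).
static int GetDeviceCountImpl() {
  if (const char* visible = std::getenv("XPU_VISIBLE_DEVICES")) {
    std::string s(visible);
    if (std::all_of(s.begin(), s.end(), [](char c) { return c == ' '; })) {
      return 0;
    }
  }
  return 1;  // stand-in for xpu_device_count(&count)
}

int GetXPUDeviceCount() {
  static int dev_cnt = GetDeviceCountImpl();  // computed exactly once
  return dev_cnt;
}

int main() { return GetXPUDeviceCount() >= 0 ? 0 : 1; }
```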
- std::vector devices; - if (!FLAGS_selected_xpus.empty()) { - auto devices_str = paddle::string::Split(FLAGS_selected_xpus, ','); - for (auto id : devices_str) { - devices.push_back(atoi(id.c_str())); - } - } else { - int count = GetXPUDeviceCount(); - for (int i = 0; i < count; ++i) { - devices.push_back(i); - } - } - return devices; + return pten::backends::xpu::GetXPUSelectedDevices(); } /**************************** Memory Management **************************/ void MemcpySyncH2D(void* dst, const void* src, size_t count, const platform::XPUPlace& dst_place) { - platform::XPUDeviceGuard guard(dst_place.device); - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + pten::backends::xpu::MemcpySyncH2D(dst, src, count, dst_place); } void MemcpySyncD2H(void* dst, const void* src, size_t count, const platform::XPUPlace& src_place) { - platform::XPUDeviceGuard guard(src_place.device); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.GetByPlace(src_place); dev_ctx->Wait(); - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST)); + pten::backends::xpu::MemcpySyncD2H(dst, src, count, src_place, *dev_ctx); } // if src.device == dst.device and you need sync , after call this function, @@ -139,33 +70,16 @@ void MemcpySyncD2H(void* dst, const void* src, size_t count, void MemcpySyncD2D(void* dst, const platform::XPUPlace& dst_place, const void* src, const platform::XPUPlace& src_place, size_t count) { - int dev_id = GetXPUCurrentDeviceId(); - if (dst_place.device == dev_id && src_place.device == dev_id) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.GetByPlace(src_place); - PADDLE_ENFORCE_XDNN_SUCCESS( - xpu::copy(dev_ctx->x_context(), static_cast(src), - static_cast(dst), count), - "copy "); - } else { - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_memcpy_peer(dst_place.device, dst, src_place.device, src, count)); - } + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.GetByPlace(src_place); + pten::backends::xpu::MemcpySyncD2D(dst, dst_place, src, src_place, count, + *dev_ctx); } /**************************** Others **************************/ -XPUVersion get_xpu_version(int dev_id) { - uint64_t v = 0; - PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id)); - - if (v == K100 || v == K200) { - VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n"; - return XPU1; - } else { - VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n"; - return XPU2; - } +pten::backends::xpu::XPUVersion get_xpu_version(int dev_id) { + return pten::backends::xpu::get_xpu_version(dev_id); } } // namespace platform diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h index 220bebb9e6b055319a0f642e8f711ccf8302ea43..03082e8dc50eca7e85d22a327600068099ee4567 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.h +++ b/paddle/fluid/platform/device/xpu/xpu_info.h @@ -13,6 +13,7 @@ limitations under the License. 
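The removed `GetXPUSelectedDevices` body documents the selection contract the pten backend now owns: a comma-separated id list from the flag wins, and an empty flag means all devices. A standalone sketch of that parse (using a stringstream split instead of `paddle::string::Split`):

```cpp
#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>

std::vector<int> ParseSelectedDevices(const std::string& flag, int count) {
  std::vector<int> devices;
  if (!flag.empty()) {
    std::stringstream ss(flag);
    std::string id;
    while (std::getline(ss, id, ',')) {
      devices.push_back(std::atoi(id.c_str()));  // "0,1,3" -> {0, 1, 3}
    }
  } else {
    for (int i = 0; i < count; ++i) devices.push_back(i);  // use all
  }
  return devices;
}

int main() { return ParseSelectedDevices("0,1,3", 8).size() == 3 ? 0 : 1; }
```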
*/ #ifdef PADDLE_WITH_XPU #include #include "paddle/fluid/platform/place.h" +#include "paddle/pten/backends/xpu/xpu_info.h" namespace paddle { namespace platform { @@ -50,31 +51,9 @@ void MemcpySyncD2D(void *dst, const platform::XPUPlace &dst_place, const void *src, const platform::XPUPlace &src_place, size_t count); -class XPUDeviceGuard { - public: - explicit inline XPUDeviceGuard(int dev_id) { - int prev_id = platform::GetXPUCurrentDeviceId(); - if (prev_id != dev_id) { - prev_id_ = prev_id; - platform::SetXPUDeviceId(dev_id); - } - } +using XPUDeviceGuard = pten::backends::xpu::XPUDeviceGuard; - inline ~XPUDeviceGuard() { - if (prev_id_ != -1) { - platform::SetXPUDeviceId(prev_id_); - } - } - - XPUDeviceGuard(const XPUDeviceGuard &o) = delete; - XPUDeviceGuard &operator=(const XPUDeviceGuard &o) = delete; - - private: - int prev_id_{-1}; -}; - -enum XPUVersion { XPU1, XPU2 }; -XPUVersion get_xpu_version(int dev_id); +pten::backends::xpu::XPUVersion get_xpu_version(int dev_id); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc index 36be4a55d0a6f1f1e85073c35b8d2d4e9092e491..e9b494024bd699d5176226b6535758f1ff2e0c39 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc @@ -24,7 +24,7 @@ namespace platform { bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) { auto& ops = get_kl1_ops(); auto v = get_xpu_version(type.place_.device); - if (v == XPU2) { + if (v == pten::backends::xpu::XPUVersion::XPU2) { ops = get_kl2_ops(); } @@ -74,10 +74,11 @@ bool is_in_xpu_black_list(const std::string& op_name) { return false; } -std::vector get_xpu_op_support_type(const std::string& op_name, - XPUVersion version) { +std::vector get_xpu_op_support_type( + const std::string& op_name, pten::backends::xpu::XPUVersion version) { std::vector res; - auto& ops = version == XPU1 ? get_kl1_ops() : get_kl2_ops(); + auto& ops = version == pten::backends::xpu::XPUVersion::XPU1 ? get_kl1_ops() + : get_kl2_ops(); if (ops.find(op_name) != ops.end()) { XPUKernelSet& type_set = ops[op_name]; for (auto& item : type_set) { @@ -87,9 +88,10 @@ std::vector get_xpu_op_support_type(const std::string& op_name, return res; } -XPUOpListMap get_xpu_op_list(XPUVersion version) { +XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version) { XPUOpListMap res; - auto& ops = version == XPU1 ? get_kl1_ops() : get_kl2_ops(); + auto& ops = version == pten::backends::xpu::XPUVersion::XPU1 ? 
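`XPUDeviceGuard` becomes a using-alias above, but the class it now points to has the same RAII contract as the deleted one: switch devices on construction only when needed, restore on destruction, non-copyable. A self-contained miniature with the set/get calls stubbed:

```cpp
#include <cassert>

static int g_current_device = 0;  // stand-in device state
int GetCurrentDeviceId() { return g_current_device; }
void SetDeviceId(int id) { g_current_device = id; }

class DeviceGuard {
 public:
  explicit DeviceGuard(int dev_id) {
    int prev = GetCurrentDeviceId();
    if (prev != dev_id) {  // only record/switch when it differs
      prev_id_ = prev;
      SetDeviceId(dev_id);
    }
  }
  ~DeviceGuard() {
    if (prev_id_ != -1) SetDeviceId(prev_id_);  // restore on scope exit
  }
  DeviceGuard(const DeviceGuard&) = delete;
  DeviceGuard& operator=(const DeviceGuard&) = delete;

 private:
  int prev_id_{-1};
};

int main() {
  {
    DeviceGuard guard(3);
    assert(GetCurrentDeviceId() == 3);
  }
  assert(GetCurrentDeviceId() == 0);
  return 0;
}
```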
get_kl1_ops() + : get_kl2_ops(); for (auto& op : ops) { std::vector op_vartypes; for (auto& item : op.second) { diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.h b/paddle/fluid/platform/device/xpu/xpu_op_list.h index 3672d68492a6f5485a6c5a48751905e3f6cbbf30..4c3eb097a147ee11fb84c817614ef7c1002bddd5 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.h @@ -27,9 +27,9 @@ using XPUOpListMap = bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type); bool is_in_xpu_black_list(const std::string& op_name); -std::vector get_xpu_op_support_type(const std::string& op_name, - XPUVersion version); -XPUOpListMap get_xpu_op_list(XPUVersion version); +std::vector get_xpu_op_support_type( + const std::string& op_name, pten::backends::xpu::XPUVersion version); +XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 6ffeaf101feca795f8a330b72206dffa2d68904c..142e30d161ccadf3c3cb55eee430597e60d50624 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -21,9 +21,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device/mlu/device_context.h" #include "paddle/fluid/platform/device/mlu/device_context_allocator.h" #endif -#ifdef PADDLE_WITH_IPU -#include "paddle/fluid/platform/ipu/ipu_backend.h" -#endif #include "glog/logging.h" #include "paddle/fluid/framework/expect.h" #include "paddle/fluid/platform/profiler.h" @@ -230,14 +227,10 @@ CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {} CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext() {} #ifdef PADDLE_WITH_IPU -IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) { - int id = place.GetDeviceId(); - std::shared_ptr ipu_backend = - platform::ipu::IpuBackend::GetInstance(); - device_ = ipu_backend->GetDevice(id); -} +IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {} Place IPUDeviceContext::GetPlace() const { return place_; } + void IPUDeviceContext::Wait() const { /*! \brief Wait for all operations completion in the stream. 
*/ } @@ -246,52 +239,14 @@ IPUDeviceContext::~IPUDeviceContext() {} #endif #ifdef PADDLE_WITH_XPU -XPUDeviceContext::XPUDeviceContext() { - context_ = xpu::create_context(); - xpu_version_ = get_xpu_version(place_.device); -} +XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {} XPUDeviceContext::~XPUDeviceContext() {} -XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) { - platform::XPUDeviceGuard guard(place.device); - +XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) { LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " - << static_cast(place_.device); - - context_ = xpu::create_context(); - const int MAX_XPU_NUM = 16; - static void* l3ptrs[MAX_XPU_NUM] = {nullptr}; - - int l3_size = 13.5 * 1024 * 1024; - if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) { - l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE")); - } - - auto selected_xpus = GetXPUSelectedDevices(); - for (unsigned int i = 0; i < selected_xpus.size(); i++) { - if (place.device == selected_xpus[i]) { - if (l3ptrs[place.device] == nullptr) { - xpu_malloc(static_cast(&l3ptrs[place.device]), l3_size, - XPU_MEM_L3); - } - if (l3ptrs[place.device] != nullptr) { - context_->_l3_mgr.set(l3ptrs[place.device], l3_size); - VLOG(3) << "xpu place " << place.device << " set l3 size " << l3_size; - } - break; - } - } + << static_cast(place.device); } - -void XPUDeviceContext::Wait() const { - platform::SetXPUDeviceId(place_.device); - xpu_wait(context_->xpu_stream); -} - -Place XPUDeviceContext::GetPlace() const { return place_; } - -xpu::Context* XPUDeviceContext::x_context() const { return context_; } #endif #ifdef PADDLE_WITH_ASCEND_CL diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 78c09dca5b4886cfa03f18065df393c3861eed8f..17b22907b15328ef8fe610ce126639b0a5f927e7 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -65,9 +65,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device/npu/enforce_npu.h" #include "paddle/fluid/platform/device/npu/npu_stream.h" #endif -#ifdef PADDLE_WITH_IPU -#include "paddle/fluid/platform/device/ipu/device.h" -#endif #include "unsupported/Eigen/CXX11/Tensor" namespace Eigen { @@ -78,6 +75,7 @@ struct GpuDevice; #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/pten/backends/xpu/xpu_context.h" #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -150,11 +148,9 @@ class IPUDeviceContext : public DeviceContext { Place GetPlace() const override; /*! \brief Wait for all operations completion in the stream. */ void Wait() const override; - int DeviceId() const { return device_.getId(); } private: IPUPlace place_; - platform::ipu::Device device_; }; template <> struct DefaultDeviceContextType { @@ -171,39 +167,12 @@ struct DefaultDeviceContextType; #ifdef PADDLE_WITH_XPU namespace xpu = baidu::xpu::api; -class XPUDeviceContext : public DeviceContext { +class XPUDeviceContext : public pten::XPUContext { public: XPUDeviceContext(); explicit XPUDeviceContext(XPUPlace place); virtual ~XPUDeviceContext(); Eigen::DefaultDevice* eigen_device() const { return nullptr; } - XPUVersion xpu_version() const { return xpu_version_; } - Place GetPlace() const override; - xpu::Context* x_context() const; - - /*! \brief Wait for all operations completion in the stream. */ - void Wait() const override; - -#ifdef PADDLE_WITH_XPU_BKCL - /*! \brief Return bkcl context. 
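The L3-cache setup deleted from `XPUDeviceContext`'s constructor (now presumably `pten::XPUContext`'s job) sized a per-device scratch buffer from an env override. The sizing rule, extracted for reference; the actual `xpu_malloc`/`_l3_mgr` wiring is omitted:

```cpp
#include <cstdlib>

// Default 13.5 MiB, overridable in bytes via XPU_PADDLE_L3_SIZE.
int GetL3Size() {
  int l3_size = static_cast<int>(13.5 * 1024 * 1024);
  if (const char* env = std::getenv("XPU_PADDLE_L3_SIZE")) {
    l3_size = std::atoi(env);
  }
  return l3_size;
}

int main() { return GetL3Size() > 0 ? 0 : 1; }
```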
*/ - BKCLContext_t bkcl_context() const { return bkcl_context_; } - - /*! \brief Set bkcl context. */ - void set_bkcl_context(BKCLContext_t context) { bkcl_context_ = context; } -#endif - - private: - XPUPlace place_; - XPUVersion xpu_version_; - xpu::Context* context_; -#ifdef PADDLE_WITH_XPU_BKCL - BKCLContext_t bkcl_context_; -#endif - - // Need to be the same with other DeviceContext, - // Eventhough eigen_device_ is not used in XPU - std::unique_ptr eigen_device_; - DISABLE_COPY_AND_ASSIGN(XPUDeviceContext); }; template <> diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 32f233e44e952f6c78b7bfbfd3b0c600ac50d5e4..c751ee1e69b2bdcb85de0f9657f679356796ef33 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -95,17 +95,16 @@ limitations under the License. */ // Note: these headers for simplify demangle type string #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/pten/core/enforce.h" // Note: this header for simplify HIP and CUDA type string #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif #include "paddle/fluid/platform/flags.h" -namespace paddle { -namespace platform { +namespace pten { class ErrorSummary; -} // namespace platform -} // namespace paddle +} // namespace pten #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) DECLARE_int64(gpu_allocator_retry_time); @@ -114,6 +113,7 @@ DECLARE_int32(call_stack_level); namespace paddle { namespace platform { +using namespace ::pten::enforce; // NOLINT /** HELPER MACROS AND FUNCTIONS **/ @@ -121,478 +121,6 @@ namespace platform { #define PADDLE_MAY_THROW noexcept(false) #endif -// Because most enforce conditions would evaluate to true, we can use -// __builtin_expect to instruct the C++ compiler to generate code that -// always forces branch prediction of true. -// This generates faster binary code. __builtin_expect is since C++11. -// For more details, please check https://stackoverflow.com/a/43870188/724872. -#if !defined(_WIN32) -#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) -#else -// there is no equivalent intrinsics in msvc. -#define UNLIKELY(condition) (condition) -#endif - -#if !defined(_WIN32) -#define LIKELY(condition) __builtin_expect(static_cast(condition), 1) -#else -// there is no equivalent intrinsics in msvc. -#define LIKELY(condition) (condition) -#endif - -#if defined _WIN32 && defined PADDLE_ON_INFERENCE && defined PADDLE_NO_PYTHON -#define HANDLE_THE_ERROR try { -#define END_HANDLE_THE_ERROR \ - } \ - catch (const std::exception& e) { \ - std::cout << e.what() << std::endl; \ - throw; \ - } -#else -#define HANDLE_THE_ERROR -#define END_HANDLE_THE_ERROR -#endif - -#ifdef __GNUC__ -inline std::string demangle(std::string name) { - int status = -4; // some arbitrary value to eliminate the compiler warning - std::unique_ptr res{ - abi::__cxa_demangle(name.c_str(), NULL, NULL, &status), std::free}; - return (status == 0) ? 
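The `LIKELY`/`UNLIKELY` macros deleted from `enforce.h` above now come in through `paddle/pten/core/enforce.h`; the removed definitions are worth restating since the rest of the header depends on them:

```cpp
#include <cstdio>

// Branch-prediction hints: most enforce conditions hold, so the compiler
// lays out code for the "no error" path. MSVC has no equivalent
// intrinsic, so there the macros degrade to the raw condition.
#if !defined(_WIN32)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
#else
#define UNLIKELY(condition) (condition)
#define LIKELY(condition) (condition)
#endif

int main() {
  if (UNLIKELY(1 + 1 != 2)) {
    std::puts("impossible");
    return 1;
  }
  return 0;
}
```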
res.get() : name; -} -#else -inline std::string demangle(std::string name) { return name; } -#endif - -namespace details { -template -inline constexpr bool IsArithmetic() { - return std::is_arithmetic::value; -} - -template -struct TypeConverterImpl { - using Type1 = typename std::common_type::type; - using Type2 = Type1; -}; - -template -struct TypeConverterImpl { - using Type1 = T1; - using Type2 = T2; -}; - -template -struct TypeConverter { - static constexpr bool kIsArithmetic = - IsArithmetic() && IsArithmetic(); - using Type1 = typename TypeConverterImpl::Type1; - using Type2 = typename TypeConverterImpl::Type2; -}; - -template -using CommonType1 = typename std::add_lvalue_reference< - typename std::add_const::Type1>::type>::type; - -template -using CommonType2 = typename std::add_lvalue_reference< - typename std::add_const::Type2>::type>::type; - -// Here, we use SFINAE to check whether T can be converted to std::string -template -struct CanToString { - private: - using YesType = uint8_t; - using NoType = uint16_t; - - template - static YesType Check(decltype(std::cout << std::declval())) { - return 0; - } - - template - static NoType Check(...) { - return 0; - } - - public: - static constexpr bool kValue = - std::is_same(std::cout))>::value; -}; - -template -struct BinaryCompareMessageConverter { - template - static std::string Convert(const char* expression, const T& value) { - return expression + std::string(":") + string::to_string(value); - } -}; - -template <> -struct BinaryCompareMessageConverter { - template - static const char* Convert(const char* expression, const T& value) { - return expression; - } -}; -} // namespace details - -template -inline std::string ReplaceComplexTypeStr(std::string str, - const std::string& type_name) { - auto demangle_type_str = demangle(typeid(T).name()); - size_t start_pos = 0; - while ((start_pos = str.find(demangle_type_str, start_pos)) != - std::string::npos) { - str.replace(start_pos, demangle_type_str.length(), type_name); - start_pos += type_name.length(); - } - return str; -} - -#define __REPLACE_COMPLEX_TYPE_STR__(__TYPENAME, __STR) \ - do { \ - __STR = paddle::platform::ReplaceComplexTypeStr<__TYPENAME>(__STR, \ - #__TYPENAME); \ - } while (0) - -inline std::string SimplifyDemangleStr(std::string str) { - // the older is important, you have to put complex types in front - __REPLACE_COMPLEX_TYPE_STR__(paddle::framework::AttributeMap, str); - __REPLACE_COMPLEX_TYPE_STR__(paddle::framework::Attribute, str); - __REPLACE_COMPLEX_TYPE_STR__(paddle::imperative::NameVariableWrapperMap, str); - __REPLACE_COMPLEX_TYPE_STR__(paddle::imperative::NameVarBaseMap, str); - __REPLACE_COMPLEX_TYPE_STR__(std::string, str); - return str; -} - -inline std::string GetCurrentTraceBackString(bool for_signal = false) { - std::ostringstream sout; - - if (!for_signal) { - sout << "\n\n--------------------------------------\n"; - sout << "C++ Traceback (most recent call last):"; - sout << "\n--------------------------------------\n"; - } -#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) - static constexpr int TRACE_STACK_LIMIT = 100; - - void* call_stack[TRACE_STACK_LIMIT]; - auto size = backtrace(call_stack, TRACE_STACK_LIMIT); - auto symbols = backtrace_symbols(call_stack, size); - Dl_info info; - int idx = 0; - // `for_signal` used to remove the stack trace introduced by - // obtaining the error stack trace when the signal error occurred, - // that is not related to the signal error self, remove it to - // avoid misleading users and developers - int 
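The deleted `CanToString` trait above decides at compile time whether `operator<<` accepts a type, so failed comparisons can print values when possible and fall back to the expression text otherwise. The same probe in the terser void_t-style detection idiom (equivalent in effect, not the original's two-overload form):

```cpp
#include <iostream>
#include <type_traits>
#include <utility>

template <typename T, typename = void>
struct CanToString : std::false_type {};

// Matches only when `std::cout << T` is a valid expression.
template <typename T>
struct CanToString<T, decltype(void(std::cout << std::declval<T>()))>
    : std::true_type {};

struct Opaque {};  // deliberately has no operator<<

static_assert(CanToString<int>::value, "int is streamable");
static_assert(!CanToString<Opaque>::value, "Opaque is not");

int main() { return 0; }
```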
end_idx = for_signal ? 2 : 0; - for (int i = size - 1; i >= end_idx; --i) { - if (dladdr(call_stack[i], &info) && info.dli_sname) { - auto demangled = demangle(info.dli_sname); - std::string path(info.dli_fname); - // C++ traceback info are from core.so - if (path.substr(path.length() - 3).compare(".so") == 0) { - sout << string::Sprintf("%-3d %s\n", idx++, - SimplifyDemangleStr(demangled)); - } - } - } - free(symbols); -#else - sout << "Not support stack backtrace yet.\n"; -#endif - return sout.str(); -} - -template -inline std::string GetErrorSumaryString(StrType&& what, const char* file, - int line) { - std::ostringstream sout; - if (FLAGS_call_stack_level > 1) { - sout << "\n----------------------\nError Message " - "Summary:\n----------------------\n"; - } - sout << string::Sprintf("%s (at %s:%d)", std::forward(what), file, - line) - << std::endl; - return sout.str(); -} - -template -inline std::string GetTraceBackString(StrType&& what, const char* file, - int line) { - if (FLAGS_call_stack_level > 1) { - // FLAGS_call_stack_level>1 means showing c++ call stack - return GetCurrentTraceBackString() + GetErrorSumaryString(what, file, line); - } else { - return GetErrorSumaryString(what, file, line); - } -} - -inline std::string SimplifyErrorTypeFormat(const std::string& str) { - std::ostringstream sout; - size_t type_end_pos = str.find(":", 0); - if (type_end_pos == std::string::npos) { - sout << str; - } else { - // Remove "Error:", add "()"" - sout << "(" << str.substr(0, type_end_pos - 5) << ")" - << str.substr(type_end_pos + 1); - } - return sout.str(); -} - -inline bool is_error(bool stat) { return !stat; } - -// Note: This Macro can only be used within enforce.h -#define __THROW_ERROR_INTERNAL__(__ERROR_SUMMARY) \ - do { \ - HANDLE_THE_ERROR \ - throw ::paddle::platform::EnforceNotMet(__ERROR_SUMMARY, __FILE__, \ - __LINE__); \ - END_HANDLE_THE_ERROR \ - } while (0) - -/** ENFORCE EXCEPTION AND MACROS **/ - -struct EnforceNotMet : public std::exception { - public: - EnforceNotMet(std::exception_ptr e, const char* file, int line) { - try { - std::rethrow_exception(e); - } catch (platform::EnforceNotMet& e) { - code_ = e.code(); - err_str_ = GetTraceBackString(e.what(), file, line); - simple_err_str_ = SimplifyErrorTypeFormat(err_str_); - } catch (std::exception& e) { - err_str_ = GetTraceBackString(e.what(), file, line); - simple_err_str_ = SimplifyErrorTypeFormat(err_str_); - } - } - - EnforceNotMet(const std::string& str, const char* file, int line) - : err_str_(GetTraceBackString(str, file, line)) { - simple_err_str_ = SimplifyErrorTypeFormat(err_str_); - } - - EnforceNotMet(const ErrorSummary& error, const char* file, int line) - : code_(error.code()), - err_str_(GetTraceBackString(error.to_string(), file, line)) { - simple_err_str_ = SimplifyErrorTypeFormat(err_str_); - } - - const char* what() const noexcept override { - if (FLAGS_call_stack_level > 1) { - return err_str_.c_str(); - } else { - return simple_err_str_.c_str(); - } - } - - error::Code code() const { return code_; } - - const std::string& error_str() const { return err_str_; } - - const std::string& simple_error_str() const { return simple_err_str_; } - - void set_error_str(std::string str) { - if (FLAGS_call_stack_level > 1) { - err_str_ = str; - } else { - simple_err_str_ = str; - } - } - - private: - // Used to determine the final type of exception thrown - error::Code code_ = error::LEGACY; - // Complete error message - // e.g. 
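`SimplifyErrorTypeFormat`, also removed above, produces the compact message used when `FLAGS_call_stack_level <= 1` by rewriting the error-type prefix. Extracted in miniature so the transformation is visible:

```cpp
#include <iostream>
#include <sstream>
#include <string>

// "InvalidArgumentError: msg" -> "(InvalidArgument) msg"; strings without
// a type prefix pass through untouched.
std::string SimplifyErrorTypeFormat(const std::string& str) {
  std::ostringstream sout;
  size_t type_end_pos = str.find(':', 0);
  if (type_end_pos == std::string::npos) {
    sout << str;
  } else {
    // Drop the trailing "Error" (5 chars) and the colon, add parentheses.
    sout << "(" << str.substr(0, type_end_pos - 5) << ")"
         << str.substr(type_end_pos + 1);
  }
  return sout.str();
}

int main() {
  std::cout << SimplifyErrorTypeFormat("InvalidArgumentError: bad dims");
  return 0;  // prints "(InvalidArgument) bad dims"
}
```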
InvalidArgumentError: *** - std::string err_str_; - // Simple errror message used when no C++ stack and python compile stack - // e.g. (InvalidArgument) *** - std::string simple_err_str_; -}; - -#define PADDLE_THROW(...) \ - do { \ - HANDLE_THE_ERROR \ - throw ::paddle::platform::EnforceNotMet( \ - ::paddle::platform::ErrorSummary(__VA_ARGS__), __FILE__, __LINE__); \ - END_HANDLE_THE_ERROR \ - } while (0) - -#if defined(__CUDA_ARCH__) -// For cuda, the assertions can affect performance and it is therefore -// recommended to disable them in production code -// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion -#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \ - do { \ - if (!(_IS_NOT_ERROR)) { \ - printf("Error: %s:%d Assertion `%s` failed. " __FORMAT "\n", __FILE__, \ - __LINE__, #_IS_NOT_ERROR, ##__VA_ARGS__); \ - asm("trap;"); \ - } \ - } while (0) -#elif defined(__HIPCC__) -#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \ - do { \ - if (!(_IS_NOT_ERROR)) { \ - printf("Error: %s:%d Assertion `%s` failed. " __FORMAT "\n", __FILE__, \ - __LINE__, #_IS_NOT_ERROR, ##__VA_ARGS__); \ - abort(); \ - } \ - } while (0) -#else -#define PADDLE_ENFORCE(COND, ...) \ - do { \ - auto __cond__ = (COND); \ - if (UNLIKELY(::paddle::platform::is_error(__cond__))) { \ - __THROW_ERROR_INTERNAL__(::paddle::platform::ErrorSummary(__VA_ARGS__)); \ - } \ - } while (0) -#endif - -/* - * Some enforce helpers here, usage: - * int a = 1; - * int b = 2; - * PADDLE_ENFORCE_EQ(a, b); - * - * will raise an expression described as follows: - * "Expected input a == b, but received a(1) != b(2)." - * with detailed stack information. - * - * extra messages is also supported, for example: - * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) - */ - -#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ - do { \ - if (UNLIKELY(nullptr == (__VAL))) { \ - auto __summary__ = ::paddle::platform::ErrorSummary(__VA_ARGS__); \ - auto __message__ = ::paddle::string::Sprintf( \ - "%s\n [Hint: " #__VAL " should not be null.]", \ - __summary__.error_message()); \ - __THROW_ERROR_INTERNAL__( \ - ::paddle::platform::ErrorSummary(__summary__.code(), __message__)); \ - } \ - } while (0) - -#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...) 
\ - do { \ - auto __val1 = (__VAL1); \ - auto __val2 = (__VAL2); \ - using __TYPE1__ = decltype(__val1); \ - using __TYPE2__ = decltype(__val2); \ - using __COMMON_TYPE1__ = \ - ::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>; \ - using __COMMON_TYPE2__ = \ - ::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>; \ - bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \ - static_cast<__COMMON_TYPE2__>(__val2)); \ - if (UNLIKELY(!__is_not_error)) { \ - auto __summary__ = ::paddle::platform::ErrorSummary(__VA_ARGS__); \ - constexpr bool __kCanToString__ = \ - ::paddle::platform::details::CanToString<__TYPE1__>::kValue && \ - ::paddle::platform::details::CanToString<__TYPE2__>::kValue; \ - auto __message__ = ::paddle::string::Sprintf( \ - "%s\n [Hint: Expected %s " #__CMP \ - " %s, but received %s " #__INV_CMP " %s.]", \ - __summary__.error_message(), #__VAL1, #__VAL2, \ - ::paddle::platform::details::BinaryCompareMessageConverter< \ - __kCanToString__>::Convert(#__VAL1, __val1), \ - ::paddle::platform::details::BinaryCompareMessageConverter< \ - __kCanToString__>::Convert(#__VAL2, __val2)); \ - __THROW_ERROR_INTERNAL__( \ - ::paddle::platform::ErrorSummary(__summary__.code(), __message__)); \ - } \ - } while (0) - -#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) -#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__) -#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__) -#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__) -#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) -#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) - -/** EXTENDED TOOL FUNCTIONS WITH CHECKING **/ - -/* - * Summary: This macro is used to get Variable or internal type - * data (such as LoDTensor or SelectedRows) of the Input and - * Output in op, generally used when call scope.FindVar(Input/ - * Output("Name")) or ctx.Input(). - * Firstly this macro check whether the obtained pointer is null, - * and then return data if it is not null. - * - * Note: This macro is only suitable for specific scenarios and - * does not intended to be widely used. If it cannot meet the - * requirements, please use other PADDLE_ENFORCE** check macro. - * - * Parameters: - *     __PTR: pointer - * __ROLE: (string), Input or Output - * __NAME: (string), Input or Output name - * __OP_TYPE: (string), the op type - *   - * Return: The data pointed to by the pointer. - * - * Examples: - * GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", "Mul"); - */ -#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \ - (([&]() -> std::add_lvalue_reference::type { \ - auto* __ptr = (__PTR); \ - if (UNLIKELY(nullptr == __ptr)) { \ - auto __summary__ = paddle::platform::errors::NotFound( \ - "Unable to get %s data of %s %s in operator %s. " \ - "Possible reasons are:\n" \ - " 1. The %s is not the %s of operator %s;\n" \ - " 2. The %s has no corresponding variable passed in;\n" \ - " 3. 
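The `__PADDLE_BINARY_COMPARE` machinery above, now inherited from pten, is easier to audit against a stripped-down model: evaluate both operands once, compare, and on failure throw with both the expression text and the runtime values. This sketch omits the common-type promotion and streamability fallback the real macro performs:

```cpp
#include <sstream>
#include <stdexcept>

// Evaluate each operand exactly once, compare, and report both the
// expression text and the runtime values on failure.
#define MINI_ENFORCE_EQ(a, b)                                  \
  do {                                                         \
    auto va = (a);                                             \
    auto vb = (b);                                             \
    if (!(va == vb)) {                                         \
      std::ostringstream oss;                                  \
      oss << "Expected " #a " == " #b ", but received " << va  \
          << " != " << vb << ".";                              \
      throw std::runtime_error(oss.str());                     \
    }                                                          \
  } while (0)

int main() {
  try {
    MINI_ENFORCE_EQ(1 + 1, 3);
  } catch (const std::runtime_error&) {
    return 0;  // what(): "Expected 1 + 1 == 3, but received 2 != 3."
  }
  return 1;
}
```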
The %s corresponding variable is not initialized.", \ - paddle::platform::demangle( \ - typeid(std::add_lvalue_reference::type) \ - .name()), \ - __ROLE, __NAME, __OP_TYPE, __NAME, __ROLE, __OP_TYPE, __NAME, \ - __NAME); \ - auto __message__ = ::paddle::string::Sprintf( \ - "%s\n [Hint: pointer " #__PTR " should not be null.]", \ - __summary__.error_message()); \ - __THROW_ERROR_INTERNAL__( \ - ::paddle::platform::ErrorSummary(__summary__.code(), __message__)); \ - } \ - return *__ptr; \ - })()) - -/* - * Summary: This macro is used to check whether op has specified - * Input or Output Variables. Because op's Input and Output - * checking are written similarly, so abstract this macro. - * - * Parameters: - *     __EXPR: (bool), the bool expression - * __ROLE: (string), Input or Output - * __NAME: (string), Input or Output name - * __OP_TYPE: (string), the op type - * - * Examples: - * OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul"); - */ -#define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE) \ - do { \ - PADDLE_ENFORCE_EQ(__EXPR, true, paddle::platform::errors::NotFound( \ - "No %s(%s) found for %s operator.", \ - __ROLE, __NAME, __OP_TYPE)); \ - } while (0) - /* * Summary: This BOOST_GET(_**) series macros are used to call boost::get * safely. boost::get is not a completely safe api, although it will not @@ -616,6 +144,8 @@ struct EnforceNotMet : public std::exception { */ namespace details { +using namespace pten::enforce::details; // NOLINT + #define DEFINE_SAFE_BOOST_GET(__InputType, __OutputType, __OutputTypePtr, \ __FuncName) \ template \ @@ -627,13 +157,12 @@ namespace details { return boost::get(input); \ } catch (boost::bad_get&) { \ HANDLE_THE_ERROR \ - throw ::paddle::platform::EnforceNotMet( \ - ::paddle::platform::errors::InvalidArgument( \ + throw ::pten::enforce::EnforceNotMet( \ + pten::errors::InvalidArgument( \ "boost::get failed, cannot get value " \ "(%s) by type %s, its type is %s.", \ - expression, \ - paddle::platform::demangle(typeid(OutputType).name()), \ - paddle::platform::demangle(input.type().name())), \ + expression, pten::enforce::demangle(typeid(OutputType).name()), \ + pten::enforce::demangle(input.type().name())), \ file, line); \ END_HANDLE_THE_ERROR \ } \ @@ -647,44 +176,43 @@ DEFINE_SAFE_BOOST_GET(InputType&&, OutputType, OutputType*, } // namespace details -#define BOOST_GET(__TYPE, __VALUE) \ - ::paddle::platform::details::SafeBoostGet<__TYPE>(__VALUE, #__VALUE, \ - __FILE__, __LINE__) -#define BOOST_GET_CONST(__TYPE, __VALUE) \ - ::paddle::platform::details::SafeBoostGetConst<__TYPE>(__VALUE, #__VALUE, \ +#define BOOST_GET(__TYPE, __VALUE) \ + paddle::platform::details::SafeBoostGet<__TYPE>(__VALUE, #__VALUE, __FILE__, \ + __LINE__) +#define BOOST_GET_CONST(__TYPE, __VALUE) \ + paddle::platform::details::SafeBoostGetConst<__TYPE>(__VALUE, #__VALUE, \ + __FILE__, __LINE__) +#define BOOST_GET_MUTABLE(__TYPE, __VALUE) \ + paddle::platform::details::SafeBoostGetMutable<__TYPE>(__VALUE, #__VALUE, \ __FILE__, __LINE__) -#define BOOST_GET_MUTABLE(__TYPE, __VALUE) \ - ::paddle::platform::details::SafeBoostGetMutable<__TYPE>(__VALUE, #__VALUE, \ - __FILE__, __LINE__) /** OTHER EXCEPTION AND ENFORCE **/ struct EOFException : public std::exception { std::string err_str_; EOFException(const char* err_msg, const char* file, int line) { - err_str_ = string::Sprintf("%s at [%s:%d]", err_msg, file, line); + err_str_ = paddle::string::Sprintf("%s at [%s:%d]", err_msg, file, line); } const char* what() const noexcept override { return err_str_.c_str(); } 
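`DEFINE_SAFE_BOOST_GET` above is re-pointed at `pten::enforce::EnforceNotMet` but keeps its job: convert `boost::bad_get` into a located, typed error. The essence, minus the `__FILE__`/`__LINE__` capture and name demangling:

```cpp
#include <boost/variant.hpp>
#include <stdexcept>
#include <string>

template <typename T, typename V>
const T& SafeGetConst(const V& v, const char* expr) {
  try {
    return boost::get<T>(v);
  } catch (boost::bad_get&) {
    // The real macro raises EnforceNotMet with demangled type names here.
    throw std::runtime_error(std::string("boost::get failed for ") + expr);
  }
}

#define MINI_GET_CONST(T, v) SafeGetConst<T>(v, #v)

int main() {
  boost::variant<int, std::string> v = 42;
  return MINI_GET_CONST(int, v) == 42 ? 0 : 1;
}
```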
}; -#define PADDLE_THROW_EOF() \ - do { \ - HANDLE_THE_ERROR \ - throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ - __LINE__); \ - END_HANDLE_THE_ERROR \ - } while (0) - -#define PADDLE_THROW_BAD_ALLOC(...) \ +#define PADDLE_THROW_EOF() \ do { \ HANDLE_THE_ERROR \ - throw ::paddle::memory::allocation::BadAlloc( \ - ::paddle::platform::ErrorSummary(__VA_ARGS__).to_string(), __FILE__, \ - __LINE__); \ + throw paddle::platform::EOFException("There is no next data.", __FILE__, \ + __LINE__); \ END_HANDLE_THE_ERROR \ } while (0) +#define PADDLE_THROW_BAD_ALLOC(...) \ + do { \ + HANDLE_THE_ERROR \ + throw ::paddle::memory::allocation::BadAlloc( \ + pten::ErrorSummary(__VA_ARGS__).to_string(), __FILE__, __LINE__); \ + END_HANDLE_THE_ERROR \ + } while (0) + /**************************************************************************/ /**************************** NVIDIA ERROR ********************************/ #ifdef PADDLE_WITH_CUDA @@ -970,7 +498,7 @@ inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = ::paddle::platform::errors::External( \ + auto __summary__ = pten::errors::External( \ ::paddle::platform::build_nvidia_error_msg(__cond__)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ @@ -1016,7 +544,7 @@ inline void retry_sleep(unsigned milliseconds) { ++retry_count; \ } \ if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = ::paddle::platform::errors::External( \ + auto __summary__ = pten::errors::External( \ ::paddle::platform::build_nvidia_error_msg(__cond__)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ @@ -1176,7 +704,7 @@ DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = ::paddle::platform::errors::External( \ + auto __summary__ = pten::errors::External( \ ::paddle::platform::build_rocm_error_msg(__cond__)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ @@ -1204,7 +732,7 @@ inline void retry_sleep(unsigned millisecond) { ++retry_count; \ } \ if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = ::paddle::platform::errors::External( \ + auto __summary__ = pten::errors::External( \ ::paddle::platform::build_rocm_error_msg(__cond__)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ diff --git a/paddle/fluid/platform/error_codes.proto b/paddle/fluid/platform/error_codes.proto deleted file mode 100644 index 90ab93dd11d0ad1706f6199308ad2a6cb3ffa650..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/error_codes.proto +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -syntax = "proto2"; -option optimize_for = LITE_RUNTIME; -package paddle.platform.error; - -enum Code { - // Legacy error. 
- // Error type string: "Error" - LEGACY = 0; - - // Client specified an invalid argument. - // Error type string: "InvalidArgumentError" - INVALID_ARGUMENT = 1; - - // Some requested entity (e.g., file or directory) was not found. - // Error type string: "NotFoundError" - NOT_FOUND = 2; - - // Operation tried to iterate past the valid input range. E.g., seeking or - // reading past end of file. - // Error type string: "OutOfRangeError" - OUT_OF_RANGE = 3; - - // Some entity that we attempted to create (e.g., file or directory) - // already exists. - // Error type string: "AlreadyExistsError" - ALREADY_EXISTS = 4; - - // Some resource has been exhausted, perhaps a per-user quota, or - // perhaps the entire file system is out of space. - // Error type string: "ResourceExhaustedError" - RESOURCE_EXHAUSTED = 5; - - // Operation was rejected because the system is not in a state - // required for the operation's execution. - // Error type string: "PreconditionNotMetError" - PRECONDITION_NOT_MET = 6; - - // The caller does not have permission to execute the specified - // operation. - // Error type string: "PermissionDeniedError" - PERMISSION_DENIED = 7; - - // Deadline expired before operation could complete. - // Error type string: "ExecutionTimeout" - EXECUTION_TIMEOUT = 8; - - // Operation is not implemented or not supported/enabled in this service. - // Error type string: "UnimpelmentedError" - UNIMPLEMENTED = 9; - - // The service is currently unavailable. This is a most likely a - // transient condition and may be corrected by retrying with - // a backoff. - // Error type string: "UnavailableError" - UNAVAILABLE = 10; - - // Fatal errors. Means some invariant expected by the underlying - // system has been broken. If you see one of these errors, - // something is very broken. - // Error type string: "FatalError" - FATAL = 11; - - // Third-party library error. - // Error type string: "ExternalError" - EXTERNAL = 12; -} diff --git a/paddle/fluid/platform/errors.h b/paddle/fluid/platform/errors.h index 6bcd5cf39f2e0b051595de2ae8c5e41e03abb62a..30e532f0491cbc9b821014bf3d0dbfe2263cce50 100644 --- a/paddle/fluid/platform/errors.h +++ b/paddle/fluid/platform/errors.h @@ -13,76 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - -#include -#include -#include -#include -#include - -#include "paddle/fluid/platform/error_codes.pb.h" -#include "paddle/fluid/string/printf.h" - +#include "paddle/pten/core/errors.h" namespace paddle { namespace platform { - -typedef ::paddle::platform::error::Code Code; - -class ErrorSummary { - public: - // Note(chenweihang): Final deprecated constructor - // This constructor is used to be compatible with - // current existing untyped PADDLE_ENFORCE_* - // PADDLE_ENFORCE - // Note(chenweihang): Windows openblas need this - // constructor for compiling PADDLE_ENFORCE in *.cu, - // this is a bug cause we can't remove this - // constructor now. - template - explicit ErrorSummary(Args... 
args) { - code_ = paddle::platform::error::LEGACY; - msg_ = paddle::string::Sprintf(args...); - } - - // Note(chenweihang): Only recommended constructor - // No longer supports PADDLE_ENFORCE without type or without error message - explicit ErrorSummary(Code code, std::string msg) : code_(code), msg_(msg) {} - - Code code() const { return code_; } - - const std::string& error_message() const { return msg_; } - - std::string to_string() const; - - private: - Code code_; - std::string msg_; -}; - -namespace errors { - -#define REGISTER_ERROR(FUNC, CONST, ...) \ - template \ - ::paddle::platform::ErrorSummary FUNC(Args... args) { \ - return ::paddle::platform::ErrorSummary( \ - ::paddle::platform::error::CONST, ::paddle::string::Sprintf(args...)); \ - } - -REGISTER_ERROR(InvalidArgument, INVALID_ARGUMENT) -REGISTER_ERROR(NotFound, NOT_FOUND) -REGISTER_ERROR(OutOfRange, OUT_OF_RANGE) -REGISTER_ERROR(AlreadyExists, ALREADY_EXISTS) -REGISTER_ERROR(ResourceExhausted, RESOURCE_EXHAUSTED) -REGISTER_ERROR(PreconditionNotMet, PRECONDITION_NOT_MET) -REGISTER_ERROR(PermissionDenied, PERMISSION_DENIED) -REGISTER_ERROR(ExecutionTimeout, EXECUTION_TIMEOUT) -REGISTER_ERROR(Unimplemented, UNIMPLEMENTED) -REGISTER_ERROR(Unavailable, UNAVAILABLE) -REGISTER_ERROR(Fatal, FATAL) -REGISTER_ERROR(External, EXTERNAL) - -#undef REGISTER_ERROR - -} // namespace errors -} // namespace platform -} // namespace paddle +namespace errors = ::pten::errors; +using error = ::pten::ErrorCode; +} +} diff --git a/paddle/fluid/platform/profiler/event_node.h b/paddle/fluid/platform/profiler/event_node.h new file mode 100755 index 0000000000000000000000000000000000000000..05190bc4666941f8403cbb55589a53bb26aeb690 --- /dev/null +++ b/paddle/fluid/platform/profiler/event_node.h @@ -0,0 +1,207 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
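The deleted `REGISTER_ERROR` block above (reborn in `paddle/pten/core/errors.h`) stamps out one typed factory per error code. A miniature with the `Sprintf` formatting simplified to a plain string:

```cpp
#include <string>

enum class Code { INVALID_ARGUMENT, NOT_FOUND };

struct ErrorSummary {
  Code code;
  std::string msg;
};

// One macro invocation per error kind yields a typed constructor.
#define REGISTER_ERROR(FUNC, CONST)                  \
  inline ErrorSummary FUNC(const std::string& msg) { \
    return ErrorSummary{Code::CONST, msg};           \
  }

REGISTER_ERROR(InvalidArgument, INVALID_ARGUMENT)
REGISTER_ERROR(NotFound, NOT_FOUND)

int main() {
  return InvalidArgument("bad dims").code == Code::INVALID_ARGUMENT ? 0 : 1;
}
```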
*/ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/profiler/output_logger.h" +#include "paddle/fluid/platform/profiler/trace_event.h" + +namespace paddle { +namespace platform { + +class DeviceTraceEventNode { + public: + // constructor + explicit DeviceTraceEventNode(const DeviceTraceEvent& device_event) + : device_event_(device_event) {} + // destructor + ~DeviceTraceEventNode() {} + // getter + std::string name() const { return device_event_.name; } + TracerEventType type() const { return device_event_.type; } + uint64_t start_ns() const { return device_event_.start_ns; } + uint64_t end_ns() const { return device_event_.end_ns; } + uint64_t device_id() const { return device_event_.device_id; } + uint64_t context_id() const { return device_event_.context_id; } + uint64_t stream_id() const { return device_event_.stream_id; } + uint64_t duration() const { + return device_event_.end_ns - device_event_.start_ns; + } + uint32_t correlation_id() const { return device_event_.correlation_id; } + KernelEventInfo kernel_info() const { + PADDLE_ENFORCE_EQ( + device_event_.type, TracerEventType::Kernel, + platform::errors::Unavailable( + "Can not kernel_info, " + "TracerEventType in node must be TracerEventType::Kernel.")); + return device_event_.kernel_info; + } + MemcpyEventInfo memcpy_info() const { + PADDLE_ENFORCE_EQ( + device_event_.type, TracerEventType::Memcpy, + platform::errors::Unavailable( + "Can not get memcpy_info, " + "TracerEventType in node must be TracerEventType::Memcpy.")); + return device_event_.memcpy_info; + } + MemsetEventInfo memset_info() const { + PADDLE_ENFORCE_EQ( + device_event_.type, TracerEventType::Memset, + platform::errors::Unavailable( + "Can not get memset_info, " + "TracerEventType in node must be TracerEventType::Memset.")); + return device_event_.memset_info; + } + + // member function + void LogMe(BaseLogger* logger) { logger->LogDeviceTraceEventNode(*this); } + + private: + // data + DeviceTraceEvent device_event_; +}; + +class CudaRuntimeTraceEventNode { + public: + // constructor + explicit CudaRuntimeTraceEventNode(const RuntimeTraceEvent& runtime_event) + : runtime_event_(runtime_event) {} + // destructor + ~CudaRuntimeTraceEventNode(); + // getter + std::string name() const { return runtime_event_.name; } + TracerEventType type() const { return runtime_event_.type; } + uint64_t start_ns() const { return runtime_event_.start_ns; } + uint64_t end_ns() const { return runtime_event_.end_ns; } + uint64_t process_id() const { return runtime_event_.process_id; } + uint64_t thread_id() const { return runtime_event_.thread_id; } + uint64_t duration() const { + return runtime_event_.end_ns - runtime_event_.start_ns; + } + uint32_t correlation_id() const { return runtime_event_.correlation_id; } + uint32_t callback_id() const { return runtime_event_.callback_id; } + // member function + void AddDeviceTraceEventNode(DeviceTraceEventNode* node) { + device_node_ptrs_.push_back(node); + } + void LogMe(BaseLogger* logger) { logger->LogRuntimeTraceEventNode(*this); } + std::vector& GetDeviceTraceEventNodes() { + return device_node_ptrs_; + } + + private: + // data + RuntimeTraceEvent runtime_event_; + // device events called by this + std::vector device_node_ptrs_; +}; + +class HostTraceEventNode { + public: + // constructor + explicit HostTraceEventNode(const HostTraceEvent& host_event) + : host_event_(host_event) {} + + // destructor + ~HostTraceEventNode(); + + // 
getter + std::string name() const { return host_event_.name; } + TracerEventType type() const { return host_event_.type; } + uint64_t start_ns() const { return host_event_.start_ns; } + uint64_t end_ns() const { return host_event_.end_ns; } + uint64_t process_id() const { return host_event_.process_id; } + uint64_t thread_id() const { return host_event_.thread_id; } + uint64_t duration() const { + return host_event_.end_ns - host_event_.start_ns; + } + + // member function + void AddChild(HostTraceEventNode* node) { children_.push_back(node); } + void AddCudaRuntimeNode(CudaRuntimeTraceEventNode* node) { + runtime_node_ptrs_.push_back(node); + } + std::vector& GetChildren() { return children_; } + std::vector& GetRuntimeTraceEventNodes() { + return runtime_node_ptrs_; + } + void LogMe(BaseLogger* logger) { logger->LogHostTraceEventNode(*this); } + + private: + // data + HostTraceEvent host_event_; + // cuda runtime events called by this + std::vector runtime_node_ptrs_; + // host events called by this + std::vector children_; +}; + +class NodeTrees { + public: + // constructor + NodeTrees(const std::list& host_events, + const std::list& runtime_events, + const std::list& device_events) { + std::vector host_event_nodes; + std::vector runtime_event_nodes; + std::vector device_event_nodes; + // encapsulate event into nodes + for (auto it = host_events.begin(); it != host_events.end(); ++it) { + host_event_nodes.push_back(new HostTraceEventNode(*it)); + } + for (auto it = runtime_events.begin(); it != runtime_events.end(); ++it) { + runtime_event_nodes.push_back(new CudaRuntimeTraceEventNode(*it)); + } + for (auto it = device_events.begin(); it != device_events.end(); ++it) { + device_event_nodes.push_back(new DeviceTraceEventNode(*it)); + } + // build tree + BuildTrees(host_event_nodes, runtime_event_nodes, device_event_nodes); + } + + explicit NodeTrees( + const std::map& thread_event_trees_map) + : thread_event_trees_map_(thread_event_trees_map) {} + + // destructor + ~NodeTrees(); + + void LogMe(BaseLogger* logger); + void HandleTrees(std::function, + std::function, + std::function); + std::map GetNodeTrees() { + return thread_event_trees_map_; + } + std::map> Traverse(bool bfs) const; + + private: + std::map thread_event_trees_map_; + void BuildTrees(const std::vector&, + std::vector&, + const std::vector&); + HostTraceEventNode* BuildTreeRelationship( + std::vector host_event_nodes, + std::vector runtime_event_nodes); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h new file mode 100755 index 0000000000000000000000000000000000000000..2241cf9e49e7e8d50cd0bfda575675559577323f --- /dev/null +++ b/paddle/fluid/platform/profiler/event_python.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
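`NodeTrees::BuildTrees` itself is not in this patch (only the header is), but the structure implies host events nest by time containment, with runtime and device nodes hung off their callers via `AddCudaRuntimeNode`/`AddDeviceTraceEventNode`. A guess at the core predicate such a builder would use, offered only as a reading aid:

```cpp
#include <cstdint>

struct Interval {
  uint64_t start_ns;
  uint64_t end_ns;
};

// A child event belongs under a parent if its span lies inside the
// parent's span (assumption: that is how BuildTrees nests host events).
bool Contains(const Interval& parent, const Interval& child) {
  return parent.start_ns <= child.start_ns && child.end_ns <= parent.end_ns;
}

int main() {
  Interval op{100, 900}, kernel_launch{200, 300};
  return Contains(op, kernel_launch) ? 0 : 1;
}
```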
+
+#pragma once
+
+#include <map>
+
+#include "paddle/fluid/platform/profiler/event_node.h"
+
+namespace paddle {
+namespace platform {
+
+struct DevicePythonNode {
+  DevicePythonNode() = default;
+  ~DevicePythonNode() {}
+  // record name
+  std::string name;
+  // record type, one of TracerEventType
+  TracerEventType type;
+  // start timestamp of the record
+  uint64_t start_ns;
+  // end timestamp of the record
+  uint64_t end_ns;
+  // device id
+  uint64_t device_id;
+  // context id
+  uint64_t context_id;
+  // stream id
+  uint64_t stream_id;
+};
+
+struct HostPythonNode {
+  HostPythonNode() = default;
+  ~HostPythonNode();
+  // record name
+  std::string name;
+  // record type, one of TracerEventType
+  TracerEventType type;
+  // start timestamp of the record
+  uint64_t start_ns;
+  // end timestamp of the record
+  uint64_t end_ns;
+  // process id of the record
+  uint64_t process_id;
+  // thread id of the record
+  uint64_t thread_id;
+  // children node
+  std::vector<HostPythonNode*> children_node_ptrs;
+  // runtime node
+  std::vector<HostPythonNode*> runtime_node_ptrs;
+  // device node
+  std::vector<DevicePythonNode*> device_node_ptrs;
+};
+
+class ProfilerResult {
+ public:
+  ProfilerResult() : tree_(nullptr) {}
+  explicit ProfilerResult(NodeTrees* tree);
+  ~ProfilerResult();
+  std::map<uint64_t, HostPythonNode*> GetData() {
+    return thread_event_trees_map;
+  }
+  void Save(const std::string& file_name);
+
+ private:
+  std::map<uint64_t, HostPythonNode*> thread_event_trees_map;
+  NodeTrees* tree_;
+  HostPythonNode* CopyTree(HostTraceEventNode* node);
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler/output_logger.h b/paddle/fluid/platform/profiler/output_logger.h
new file mode 100755
index 0000000000000000000000000000000000000000..6901ed0c44479459cbe920cd906c2ef16e20844e
--- /dev/null
+++ b/paddle/fluid/platform/profiler/output_logger.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+
+namespace paddle {
+namespace platform {
+
+class DeviceTraceEventNode;       // forward declaration
+class HostTraceEventNode;         // forward declaration
+class CudaRuntimeTraceEventNode;  // forward declaration
+class NodeTrees;                  // forward declaration
+
+class BaseLogger {
+ public:
+  BaseLogger() {}
+  virtual ~BaseLogger() {}
+  virtual void LogDeviceTraceEventNode(const DeviceTraceEventNode&) {}
+  virtual void LogHostTraceEventNode(const HostTraceEventNode&) {}
+  virtual void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) {}
+  virtual void LogNodeTrees(const NodeTrees&) {}
+  virtual void LogMetaInfo() {}
+};
+
+}  // namespace platform
+}  // namespace paddle
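BaseLogger above is a visitor: each node's LogMe dispatches back into the logger, so a new output format only has to override the hooks it needs. A hedged sketch of a custom sink (EventCountLogger is illustrative, not part of this patch):

    #include <cstdint>

    #include "paddle/fluid/platform/profiler/output_logger.h"

    namespace paddle {
    namespace platform {

    // Illustrative only: counts nodes instead of serializing them.
    class EventCountLogger : public BaseLogger {
     public:
      void LogHostTraceEventNode(const HostTraceEventNode&) override {
        ++host_events_;
      }
      void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override {
        ++runtime_events_;
      }
      void LogDeviceTraceEventNode(const DeviceTraceEventNode&) override {
        ++device_events_;
      }
      uint64_t total() const {
        return host_events_ + runtime_events_ + device_events_;
      }

     private:
      uint64_t host_events_{0};
      uint64_t runtime_events_{0};
      uint64_t device_events_{0};
    };

    }  // namespace platform
    }  // namespace paddle

Handing such a logger to NodeTrees::LogMe, or to an individual node's LogMe, exercises exactly the double-dispatch path the classes above define.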
diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h
new file mode 100644
index 0000000000000000000000000000000000000000..e676942c4581688f6854918a3e5a1465fab8d00b
--- /dev/null
+++ b/paddle/fluid/platform/profiler/trace_event.h
@@ -0,0 +1,228 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+
+namespace paddle {
+namespace platform {
+
+enum class TracerEventType {
+  // Used to mark operator record
+  Operator = 0,
+  // Used to mark dataloader record
+  Dataloader = 1,
+  // Used to mark profile step record
+  ProfileStep = 2,
+  // Used to mark cuda runtime record returned by cupti
+  CudaRuntime = 3,
+  // Used to mark kernel computation record returned by cupti
+  Kernel = 4,
+  // Used to mark memcpy record returned by cupti
+  Memcpy = 5,
+  // Used to mark memset record returned by cupti
+  Memset = 6,
+  // Used to mark record defined by user
+  UserDefined = 7,
+  // A flag to denote the number of current types
+  NumTypes
+};
+
+struct KernelEventInfo {
+  // The X-dimension block size for the kernel.
+  uint32_t block_x;
+  // The Y-dimension block size for the kernel.
+  uint32_t block_y;
+  // The Z-dimension block size for the kernel.
+  uint32_t block_z;
+  // X-dimension of a grid.
+  uint32_t grid_x;
+  // Y-dimension of a grid.
+  uint32_t grid_y;
+  // Z-dimension of a grid.
+  uint32_t grid_z;
+  // The dynamic shared memory reserved for the kernel, in bytes.
+  uint32_t dynamic_shared_memory;
+  // The static shared memory allocated for the kernel, in bytes.
+  uint32_t static_shared_memory;
+  // The number of registers required for each thread executing the kernel.
+  uint32_t registers_per_thread;
+  // The amount of local memory reserved for each thread, in bytes.
+  uint32_t local_memory_per_thread;
+  // The total amount of local memory reserved for the kernel, in bytes.
+  uint32_t local_memory_total;
+  // The timestamp when the kernel is queued up in the command buffer, in ns.
+  // This timestamp is not collected by default. Use API
+  // cuptiActivityEnableLatencyTimestamps() to enable collection.
+  uint64_t queued;
+  // The timestamp when the command buffer containing the kernel launch is
+  // submitted to the GPU, in ns.
+  // This timestamp is not collected by default. Use API
+  // cuptiActivityEnableLatencyTimestamps() to enable collection.
+  uint64_t submitted;
+  // The completed timestamp for the kernel execution, in ns.
+  uint64_t completed;
+};
+
+struct MemcpyEventInfo {
+  // The number of bytes transferred by the memory copy.
+  uint64_t num_bytes;
+  // The kind of the memory copy.
+  // Each kind represents the source and destination targets of a memory copy.
+  // Targets are host, device, and array. Refer to CUpti_ActivityMemcpyKind
+  std::string copy_kind;
+  // The source memory kind read by the memory copy.
+  // Each kind represents the type of the memory accessed by a memory
+  // operation/copy. Refer to CUpti_ActivityMemoryKind
+  std::string src_kind;
+  // The destination memory kind read by the memory copy.
+  std::string dst_kind;
+};
+
+struct MemsetEventInfo {
+  // The number of bytes being set by the memory set.
+  uint64_t num_bytes;
+  // The memory kind of the memory set. Refer to CUpti_ActivityMemoryKind
+  std::string memory_kind;
+  // The value being assigned to memory by the memory set.
+  uint32_t value;
+};
+
+struct HostTraceEvent {
+  HostTraceEvent() = default;
+  HostTraceEvent(const std::string& name, TracerEventType type,
+                 uint64_t start_ns, uint64_t end_ns, uint64_t process_id,
+                 uint64_t thread_id)
+      : name(name),
+        type(type),
+        start_ns(start_ns),
+        end_ns(end_ns),
+        process_id(process_id),
+        thread_id(thread_id) {}
+  // record name
+  std::string name;
+  // record type, one of TracerEventType
+  TracerEventType type;
+  // start timestamp of the record
+  uint64_t start_ns;
+  // end timestamp of the record
+  uint64_t end_ns;
+  // process id of the record
+  uint64_t process_id;
+  // thread id of the record
+  uint64_t thread_id;
+};
+
+struct RuntimeTraceEvent {
+  RuntimeTraceEvent() = default;
+  RuntimeTraceEvent(const std::string& name, uint64_t start_ns,
+                    uint64_t end_ns, uint64_t process_id, uint64_t thread_id,
+                    uint32_t correlation_id, uint32_t callback_id)
+      : name(name),
+        start_ns(start_ns),
+        end_ns(end_ns),
+        process_id(process_id),
+        thread_id(thread_id),
+        correlation_id(correlation_id),
+        callback_id(callback_id) {}
+
+  // record name
+  std::string name;
+  // record type, one of TracerEventType
+  TracerEventType type{TracerEventType::CudaRuntime};
+  // start timestamp of the record
+  uint64_t start_ns;
+  // end timestamp of the record
+  uint64_t end_ns;
+  // process id of the record
+  uint64_t process_id;
+  // thread id of the record
+  uint64_t thread_id;
+  // correlation id, used for correlating async activities happened on device
+  uint32_t correlation_id;
+  // callback id, used to identify which cuda runtime api is called
+  uint32_t callback_id;
+};
+
+struct DeviceTraceEvent {
+  DeviceTraceEvent() = default;
+  DeviceTraceEvent(const std::string& name, TracerEventType type,
+                   uint64_t start_ns, uint64_t end_ns, uint64_t device_id,
+                   uint64_t context_id, uint64_t stream_id,
+                   uint32_t correlation_id, const KernelEventInfo& kernel_info)
+      : name(name),
+        type(type),
+        start_ns(start_ns),
+        end_ns(end_ns),
+        device_id(device_id),
+        context_id(context_id),
+        stream_id(stream_id),
+        correlation_id(correlation_id),
+        kernel_info(kernel_info) {}
+  DeviceTraceEvent(const std::string& name, TracerEventType type,
+                   uint64_t start_ns, uint64_t end_ns, uint64_t device_id,
+                   uint64_t context_id, uint64_t stream_id,
+                   uint32_t correlation_id, const MemcpyEventInfo& memcpy_info)
+      : name(name),
+        type(type),
+        start_ns(start_ns),
+        end_ns(end_ns),
+        device_id(device_id),
+        context_id(context_id),
+        stream_id(stream_id),
+        correlation_id(correlation_id),
+        memcpy_info(memcpy_info) {}
+  DeviceTraceEvent(const std::string& name, TracerEventType type,
+                   uint64_t start_ns, uint64_t end_ns, uint64_t device_id,
+                   uint64_t context_id, uint64_t stream_id,
+                   uint32_t correlation_id, const MemsetEventInfo& memset_info)
+      : name(name),
+        type(type),
+        start_ns(start_ns),
+        end_ns(end_ns),
+        device_id(device_id),
+        context_id(context_id),
+        stream_id(stream_id),
+        correlation_id(correlation_id),
+        memset_info(memset_info) {}
+  // record name
+  std::string name;
+  // record type, one of TracerEventType
+  TracerEventType type;
+  // start timestamp of the record
+  uint64_t start_ns;
+  // end timestamp of the record
+  uint64_t end_ns;
+  // device id
+  uint64_t device_id;
+  // context id
+  uint64_t context_id;
+  // stream id
+  uint64_t stream_id;
+  // correlation id, used for correlating async activities happened on device
+  uint32_t correlation_id;
+  // union, specific device record type has different detail information
+  union {
+    // used for TracerEventType::Kernel
+    KernelEventInfo kernel_info;
+    // used for TracerEventType::Memcpy
+    MemcpyEventInfo memcpy_info;
+    // used for TracerEventType::Memset
+    MemsetEventInfo memset_info;
+  };
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 4feba4ab19b785491bc611b00b1749f253433b29..922b818b2363bb1b29585f31f53c71ebece8e887 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -2,7 +2,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp
   feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool
   analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper
   infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator
-  cost_model cuda_graph_with_memory_pool fleet_executor global_utils)
+  cost_model cuda_graph_with_memory_pool fleet_executor global_utils pten_utils)
 
 if (WITH_PSCORE)
   set(PYBIND_DEPS ${PYBIND_DEPS} ps_service)
@@ -293,6 +293,10 @@ if(WITH_PYTHON)
       target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB})
     endif()
 
+    if(WITH_IPU)
+      target_link_libraries(paddle_pybind paddle_ipu)
+    endif()
+
     get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
     target_link_libraries(paddle_pybind ${os_dependency_modules})
     add_dependencies(paddle_pybind op_function_generator_cmd)
diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc
index 450939dd0ff8bf8d48262bc4b59e53e0123a3dae..72ee451fe7c31deeec714ba899b6cf2535edc88a 100644
--- a/paddle/fluid/pybind/bind_fleet_executor.cc
+++ b/paddle/fluid/pybind/bind_fleet_executor.cc
@@ -162,7 +162,12 @@ void BindFleetExecutor(py::module* m) {
   py::class_<DistModel>(*m, "DistModel")
       .def(py::init<const DistModelConfig&>())
       .def("init", &DistModel::Init)
-      .def("run", &DistModel::Run, py::call_guard<py::gil_scoped_release>());
+      .def("run",
+           [](DistModel& self, const std::vector<DistModelTensor>& inputs) {
+             std::vector<DistModelTensor> outputs;
+             self.Run(inputs, &outputs);
+             return outputs;
+           });
 
   py::class_<DistModelDataBuf>(*m, "DistModelDataBuf")
       .def(py::init<size_t>())
diff --git a/paddle/fluid/pybind/communicator_py.cc b/paddle/fluid/pybind/communicator_py.cc
index 07ba7061678d97f11fa6541e7cb7c304d64eb945..723d7f3197230aa5218b19bebdf97bb9a7167e75 100644
--- a/paddle/fluid/pybind/communicator_py.cc
+++ b/paddle/fluid/pybind/communicator_py.cc
@@ -23,8 +23,8 @@ limitations under the License. */
 #include "pybind11/pybind11.h"
 
 #include "paddle/fluid/operators/distributed/communicator.h"
-#include "paddle/fluid/operators/distributed/communicator_common.h"
 #include "paddle/fluid/operators/distributed/large_scale_kv.h"
+#include "paddle/fluid/operators/distributed/ps/service/communicator/communicator_common.h"
 
 namespace py = pybind11;
diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc
index aeb4f533f49395ed43fface1a5c11cee508837d4..73c8f362d145db078ac4c84c91372dcdd61c47af 100644
--- a/paddle/fluid/pybind/fleet_py.cc
+++ b/paddle/fluid/pybind/fleet_py.cc
@@ -29,15 +29,15 @@ limitations under the License.
*/ #include #include "paddle/fluid/distributed/common/sparse_sharding_merge.h" -#include "paddle/fluid/distributed/communicator_common.h" -#include "paddle/fluid/distributed/fleet.h" #include "paddle/fluid/distributed/index_dataset/index_sampler.h" #include "paddle/fluid/distributed/index_dataset/index_wrapper.h" -#include "paddle/fluid/distributed/service/communicator.h" -#include "paddle/fluid/distributed/service/env.h" -#include "paddle/fluid/distributed/service/graph_brpc_client.h" -#include "paddle/fluid/distributed/service/graph_py_service.h" -#include "paddle/fluid/distributed/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h" +#include "paddle/fluid/distributed/ps/wrapper/fleet.h" namespace py = pybind11; using paddle::distributed::CommContext; diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index a3f0a0c87fd803880b6ea19df3d79761bce59daf..780ef741c6aca5ca53224fcdadaf4a4b2e6a6205 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -697,10 +697,10 @@ static void VarBaseCopy(std::shared_ptr &src, // NOLINT platform::DeviceContextPool::Instance().Get(src_device)->Wait(); } } - } else if (src->Var().IsType()) { - auto &src_selected_rows = src->Var().Get(); + } else if (src->Var().IsType()) { + auto &src_selected_rows = src->Var().Get(); auto *dst_selected_rows = - dst.MutableVar()->GetMutable(); + dst.MutableVar()->GetMutable(); dst_selected_rows->set_height(src_selected_rows.height()); dst_selected_rows->set_rows(src_selected_rows.rows()); framework::TensorCopy(src_selected_rows.value(), dst_device, @@ -1392,7 +1392,7 @@ void BindImperative(py::module *m_ptr) { PADDLE_ENFORCE_EQ( self.Var().IsType() || - self.Var().IsType(), + self.Var().IsType(), true, platform::errors::InvalidArgument( "Type of Tensor[%s] must be LoDTensor or SelectedRows!", @@ -1423,15 +1423,14 @@ void BindImperative(py::module *m_ptr) { detach_tensor->ShareInplaceVersionCounterWith(origin_tensor); } else { const auto &origin_selected_rows = - self.Var().Get(); + self.Var().Get(); PADDLE_ENFORCE_EQ( origin_selected_rows.value().IsInitialized(), true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self.Name())); auto *detach_selected_rows = - detach_var->MutableVar() - ->GetMutable(); + detach_var->MutableVar()->GetMutable(); detach_selected_rows->set_height(origin_selected_rows.height()); detach_selected_rows->set_rows(origin_selected_rows.rows()); detach_selected_rows->mutable_value()->ShareDataWith( @@ -1597,7 +1596,7 @@ void BindImperative(py::module *m_ptr) { ? 
grad_var->MutableVar() ->GetMutable() : grad_var->MutableVar() - ->GetMutable() + ->GetMutable() ->mutable_value(); if (tensor->IsInitialized()) { @@ -1613,7 +1612,7 @@ void BindImperative(py::module *m_ptr) { }) .def("_is_sparse", [](imperative::VarBase &self) { - return self.Var().IsType(); + return self.Var().IsType(); }) .def("_allreduce", [](imperative::VarBase &self, @@ -1623,7 +1622,7 @@ void BindImperative(py::module *m_ptr) { #if NCCL_VERSION_CODE >= 2212 imperative::AllReduce(self.Var(), self.MutableVar(), strategy); #else - if (!self.Var().IsType()) { + if (!self.Var().IsType()) { imperative::AllReduce(self.Var(), self.MutableVar(), strategy); } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -2126,11 +2125,10 @@ void BindImperative(py::module *m_ptr) { .Get() .dims()); } else if (self.Var() - .IsType< - framework::SelectedRows>()) { + .IsType()) { return framework::vectorize( self.Var() - .Get() + .Get() .value() .dims()); } else if (self.Var() diff --git a/paddle/fluid/pybind/io.cc b/paddle/fluid/pybind/io.cc index 88a43f9428b227d217074765de6d94733d71213f..0bd1e94a09cdbe979228ba0cc30149416458dbf9 100644 --- a/paddle/fluid/pybind/io.cc +++ b/paddle/fluid/pybind/io.cc @@ -49,35 +49,33 @@ void BindIO(pybind11::module *m) { return tellg; }); - m->def("save_selected_rows", - [](const paddle::framework::SelectedRows &selected_rows, - const std::string &str_file_name) { - std::ofstream fout(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ( - static_cast(fout), true, - platform::errors::Unavailable( - "Cannot open %s to save SelectedRows.", str_file_name)); - - paddle::framework::SerializeToStream(fout, selected_rows); - int64_t tellp = fout.tellp(); - fout.close(); - return tellp; - }); + m->def("save_selected_rows", [](const pten::SelectedRows &selected_rows, + const std::string &str_file_name) { + std::ofstream fout(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fout), true, + platform::errors::Unavailable("Cannot open %s to save SelectedRows.", + str_file_name)); - m->def("load_selected_rows", - [](paddle::framework::SelectedRows &selected_rows, - const std::string &str_file_name) { - std::ifstream fin(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ( - static_cast(fin), true, - platform::errors::Unavailable( - "Cannot open %s to load SelectedRows.", str_file_name)); + paddle::framework::SerializeToStream(fout, selected_rows); + int64_t tellp = fout.tellp(); + fout.close(); + return tellp; + }); - paddle::framework::DeserializeFromStream(fin, &selected_rows); - int64_t tellg = fin.tellg(); - fin.close(); - return tellg; - }); + m->def("load_selected_rows", [](pten::SelectedRows &selected_rows, + const std::string &str_file_name) { + std::ifstream fin(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fin), true, + platform::errors::Unavailable("Cannot open %s to load SelectedRows.", + str_file_name)); + + paddle::framework::DeserializeFromStream(fin, &selected_rows); + int64_t tellg = fin.tellg(); + fin.close(); + return tellg; + }); m->def("save_lod_tensor_to_memory", [](const paddle::framework::LoDTensor &tensor) -> py::bytes { @@ -93,14 +91,14 @@ void BindIO(pybind11::module *m) { }); m->def("save_selected_rows_to_memory", - [](const paddle::framework::SelectedRows &selected_rows) -> py::bytes { + [](const pten::SelectedRows &selected_rows) -> py::bytes { std::ostringstream ss; paddle::framework::SerializeToStream(ss, selected_rows); return ss.str(); }); m->def("load_selected_rows_from_memory", - 
[](paddle::framework::SelectedRows &selected_rows, + [](pten::SelectedRows &selected_rows, const std::string &selected_rows_bytes) { std::istringstream fin(selected_rows_bytes, std::ios::in | std::ios::binary); diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 5587952facc530c6847a8949ae17c08c7cb09a9c..957c0b0ee6d1d09fa6b4ed78595295e5b43544f5 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -123,6 +123,7 @@ static PyObject * %s(PyObject *self, PyObject *args, PyObject *kwargs) PyThreadState *tstate = nullptr; try { + platform::RecordEvent op_type_record_event("%s pybind_imperative_func"); %s framework::AttributeMap attrs; ConstructAttrMapFromPyArgs("%s", args, %d, PyTuple_GET_SIZE(args) , attrs); @@ -371,8 +372,8 @@ std::string GenerateOpFunctionsBody( // generate op funtcion body auto op_function_str = paddle::string::Sprintf( - OP_FUNCTION_TEMPLATE, func_name, ins_cast_str, op_type, input_args_num, - inplace_strategy_str, outs_initializer, ins_initializer, + OP_FUNCTION_TEMPLATE, func_name, op_type, ins_cast_str, op_type, + input_args_num, inplace_strategy_str, outs_initializer, ins_initializer, ins_initializer_with_null + outs_initializer_with_null + view_strategy_str, op_type, inplace_mapping_str, return_str); @@ -461,6 +462,7 @@ int main(int argc, char* argv[]) { #endif std::vector headers{"\"paddle/fluid/imperative/tracer.h\"", + "\"paddle/fluid/platform/profiler.h\"", "\"pybind11/detail/common.h\"", ""}; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 454e3b524f5f14f3aa5b780eec2eac2305a1e1ed..d3d7e5794e7b192d6aacee4adccfab554555187b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -50,6 +50,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/prune.h" +#include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/save_load_util.h" #include "paddle/fluid/framework/scope_pool.h" @@ -133,9 +134,10 @@ limitations under the License. 
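The io.cc bindings above now stream pten::SelectedRows through the same SerializeToStream/DeserializeFromStream helpers as before; only the bound type changed. A hedged sketch of the equivalent round trip in plain C++ (the header path is an assumption; the two-argument stream helpers are exactly the calls the bindings make):

    #include <sstream>

    #include "paddle/fluid/framework/selected_rows_utils.h"  // assumed path

    // Write a SelectedRows to an in-memory stream and read it back, mirroring
    // save_selected_rows_to_memory / load_selected_rows_from_memory above.
    void RoundTripSelectedRows(const pten::SelectedRows& src,
                               pten::SelectedRows* dst) {
      std::ostringstream os;
      paddle::framework::SerializeToStream(os, src);
      std::istringstream is(os.str());
      paddle::framework::DeserializeFromStream(is, dst);
    }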
*/ #endif #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" + #ifdef PADDLE_WITH_IPU -#include "paddle/fluid/platform/ipu/ipu_backend.h" -#include "paddle/fluid/platform/ipu_info.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_info.h" #endif #ifdef PADDLE_WITH_MLU @@ -1216,23 +1218,27 @@ PYBIND11_MODULE(core_noavx, m) { })); #endif - py::class_(m, "SelectedRows") + py::class_(m, "SelectedRows") .def("__init__", - [](SelectedRows &instance) { new (&instance) SelectedRows(); }) + [](pten::SelectedRows &instance) { + new (&instance) pten::SelectedRows(); + }) .def("__init__", - [](SelectedRows &instance, const std::vector rows, + [](pten::SelectedRows &instance, const std::vector rows, const int64_t &height) { - new (&instance) SelectedRows(rows, height); + new (&instance) pten::SelectedRows(rows, height); }) .def("get_tensor", - [](SelectedRows &self) { return self.mutable_value(); }, + [](pten::SelectedRows &self) { return self.mutable_value(); }, py::return_value_policy::reference) .def("numel", - [](SelectedRows &self) -> int64_t { return self.value().numel(); }) - .def("set_height", &SelectedRows::set_height) - .def("height", &SelectedRows::height) + [](pten::SelectedRows &self) -> int64_t { + return self.value().numel(); + }) + .def("set_height", &pten::SelectedRows::set_height) + .def("height", &pten::SelectedRows::height) .def("set_rows", - [](SelectedRows &self, std::vector rows) { + [](pten::SelectedRows &self, std::vector rows) { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) self.set_rows(rows); #else @@ -1240,8 +1246,9 @@ PYBIND11_MODULE(core_noavx, m) { self.set_rows(new_rows); #endif }) - .def("sync_index", [](SelectedRows &instance) { instance.SyncIndex(); }) - .def("rows", [](SelectedRows &self) { + .def("sync_index", + [](pten::SelectedRows &instance) { instance.SyncIndex(); }) + .def("rows", [](pten::SelectedRows &self) { auto rows = self.rows(); std::vector new_rows; new_rows.reserve(rows.size()); @@ -1290,8 +1297,8 @@ All parameter, weight, gradient are variables in Paddle. [](Variable &self) { return self.GetMutable(); }, py::return_value_policy::reference) .def("get_selected_rows", - [](Variable &self) -> SelectedRows * { - return self.GetMutable(); + [](Variable &self) -> pten::SelectedRows * { + return self.GetMutable(); }, py::return_value_policy::reference) .def("get_lod_tensor_array", @@ -1756,27 +1763,30 @@ All parameter, weight, gradient are variables in Paddle. 
.def("__repr__", string::to_string) .def("__str__", string::to_string); #ifdef PADDLE_WITH_XPU - py::enum_(m, "XPUVersion", py::arithmetic()) - .value("XPU1", platform::XPUVersion::XPU1) - .value("XPU2", platform::XPUVersion::XPU2) + py::enum_(m, "XPUVersion", py::arithmetic()) + .value("XPU1", pten::backends::xpu::XPUVersion::XPU1) + .value("XPU2", pten::backends::xpu::XPUVersion::XPU2) .export_values(); m.def("get_xpu_device_count", platform::GetXPUDeviceCount); m.def("get_xpu_device_version", [](int device_id) { return platform::get_xpu_version(device_id); }); - m.def("get_xpu_device_op_support_types", - [](const std::string &op_name, platform::XPUVersion version) { - return platform::get_xpu_op_support_type(op_name, version); - }); - m.def("get_xpu_device_op_list", [](platform::XPUVersion version) { + m.def( + "get_xpu_device_op_support_types", + [](const std::string &op_name, pten::backends::xpu::XPUVersion version) { + return platform::get_xpu_op_support_type(op_name, version); + }); + m.def("get_xpu_device_op_list", [](pten::backends::xpu::XPUVersion version) { return platform::get_xpu_op_list(version); }); m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool { // XPUs with Compute Capability > xpu2 support float16 and bfloat16 - return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1; + return platform::get_xpu_version(place.device) > + pten::backends::xpu::XPUVersion::XPU1; }); m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool { // XPUs with Compute Capability > xpu2 support float16 and bfloat16 - return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1; + return platform::get_xpu_version(place.device) > + pten::backends::xpu::XPUVersion::XPU1; }); #endif diff --git a/paddle/fluid/string/piece.h b/paddle/fluid/string/piece.h index 8dda484eaac4d62b758e57ac5e81bfe68a5c60d4..09dee0a31cbc71ba946a05d55714ba1c302bf788 100644 --- a/paddle/fluid/string/piece.h +++ b/paddle/fluid/string/piece.h @@ -17,89 +17,4 @@ #include #include -namespace paddle { -namespace string { - -// Piece points into a std::string object but doesn't own the -// string. It is for efficient access to strings. Like Go's string -// type. Not that Piece doesn't mutate the underlying string, -// so it is thread-safe given that the underlying string doesn't -// change. Because Piece contains a little data members, and -// its syntax is simple as it doesn't own/manage the string, it is -// cheap to construct Pieces and pass them around. -class Piece { - public: - static const size_t npos = static_cast(-1); - - // We provide non-explicit singleton constructors so users can - // pass in a "const char*" or a "string" wherever a "Piece" - // is expected. These constructors ensure that if data_ is NULL, - // size_ is 0. - Piece(); - Piece(const char* d, size_t n); - Piece(const char* d); // NOLINT: accept C string into Piece. - Piece(const std::string& s); // NOLINT: accept C++ string into Piece. - - const char* data() const { return data_; } - size_t len() const { return size_; } - - char operator[](size_t n) const; - - // Piece doesn't own the string, so both iterator and const - // iterator are const char* indeed. - typedef const char* const_iterator; - typedef const char* iterator; - iterator begin() const { return data_; } - iterator end() const { return data_ + size_; } - - // Return a string that contains the copy of the referenced data. 
- std::string ToString() const { return std::string(data_, size_); } - - private: - const char* data_; - size_t size_; - - // Intentionally copyable -}; - -int Compare(Piece a, Piece b); - -bool operator==(Piece x, Piece y); -bool operator!=(Piece x, Piece y); -bool operator<(Piece x, Piece y); -bool operator>(Piece x, Piece y); -bool operator<=(Piece x, Piece y); -bool operator>=(Piece x, Piece y); - -bool HasPrefix(Piece s, Piece prefix); -bool HasSuffix(Piece s, Piece suffix); - -Piece SkipPrefix(Piece s, size_t n); -Piece SkipSuffix(Piece s, size_t n); - -// Skip the prefix (or suffix) if it matches with the string. -Piece TrimPrefix(Piece s, Piece prefix); -Piece TrimSuffix(Piece s, Piece suffix); - -// Returns if s contains sub. Any s except for empty s contains an -// empty sub. -bool Contains(Piece s, Piece sub); - -// Return the first occurrence of sub in s, or npos. If both s and -// sub is empty, it returns npos; otherwise, if only sub is empty, it -// returns 0. -size_t Index(Piece s, Piece sub); - -// Return the first occurrence of c in s[pos:end], or npos. -size_t Find(Piece s, char c, size_t pos); - -// Search range is [0..pos] inclusive. If pos == npos, search everything. -size_t RFind(Piece s, char c, size_t pos); - -Piece SubStr(Piece s, size_t pos, size_t n); - -// allow Piece to be logged -std::ostream& operator<<(std::ostream& o, Piece piece); - -} // namespace string -} // namespace paddle +#include "paddle/utils/string/piece.h" diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h index 696e2bb04f010dcbbd8eb930cb64d3e5c6a595ce..45fe89e8b5b14ef7afe7ccb4806b025f0a5eac39 100644 --- a/paddle/fluid/string/pretty_log.h +++ b/paddle/fluid/string/pretty_log.h @@ -19,70 +19,4 @@ #include #include "gflags/gflags.h" -#include "paddle/fluid/string/printf.h" - -DECLARE_bool(color); - -namespace paddle { - -namespace string { - -inline std::string black() { return FLAGS_color ? "\e[30m" : ""; } -inline std::string red() { return FLAGS_color ? "\e[31m" : ""; } -inline std::string b_red() { return FLAGS_color ? "\e[41m" : ""; } -inline std::string green() { return FLAGS_color ? "\e[32m" : ""; } -inline std::string yellow() { return FLAGS_color ? "\e[33m" : ""; } -inline std::string blue() { return FLAGS_color ? "\e[34m" : ""; } -inline std::string purple() { return FLAGS_color ? "\e[35m" : ""; } -inline std::string cyan() { return FLAGS_color ? "\e[36m" : ""; } -inline std::string light_gray() { return FLAGS_color ? "\e[37m" : ""; } -inline std::string white() { return FLAGS_color ? "\e[37m" : ""; } -inline std::string light_red() { return FLAGS_color ? "\e[91m" : ""; } -inline std::string dim() { return FLAGS_color ? "\e[2m" : ""; } -inline std::string bold() { return FLAGS_color ? "\e[1m" : ""; } -inline std::string underline() { return FLAGS_color ? "\e[4m" : ""; } -inline std::string blink() { return FLAGS_color ? "\e[5m" : ""; } -inline std::string reset() { return FLAGS_color ? "\e[0m" : ""; } - -using TextBlock = std::pair; - -struct Style { - static std::string info() { return black(); } - static std::string warn() { return b_red(); } - static std::string suc() { return green(); } - static std::string H1() { return bold() + purple(); } - static std::string H2() { return green(); } - static std::string H3() { return green(); } - static std::string detail() { return light_gray(); } -}; - -template -static void PrettyLogEndl(const std::string &style, const char *fmt, - const Args &... args) { - std::cerr << style << Sprintf(fmt, args...) 
<< reset() << std::endl; -} -template -static void PrettyLog(const std::string &style, const char *fmt, - const Args &... args) { - std::cerr << style << Sprintf(fmt, args...) << reset(); -} - -template -static void PrettyLogInfo(const char *fmt, const Args &... args) { - PrettyLogEndl(Style::info(), fmt, args...); -} -template -static void PrettyLogDetail(const char *fmt, const Args &... args) { - PrettyLogEndl(Style::detail(), fmt, args...); -} -template -static void PrettyLogH1(const char *fmt, const Args &... args) { - PrettyLogEndl(Style::H1(), fmt, args...); -} -template -static void PrettyLogH2(const char *fmt, const Args &... args) { - PrettyLogEndl(Style::H2(), fmt, args...); -} - -} // namespace string -} // namespace paddle +#include "paddle/utils/string/pretty_log.h" diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index 66b768665b6d0b97b4ca1470020132bfc9576bbb..40cc5450f415911b9f15ef39d24d8b04914a6baf 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -12,113 +12,5 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Compared with std::stringstream, there are primary purpose of -// string::Printf: -// -// 1. Type-safe printing, with why and how explained in -// http://www.drdobbs.com/stringprintf-a-typesafe-printf-family-fo/184401999. -// Implementation includes -// -// https://github.com/c42f/tinyformat -// boost::format -// std::stringstream -// -// std::stringstream is not convenient enough in many cases. For example: -// -// std::cout << std::setprecision(2) << std::fixed << 1.23456 << "\n"; -// -// boost::format is the most convenient one. We can have -// -// std::cout << format("%2% %1%") % 36 % 77; -// -// or -// -// format fmter("%2% %1%"); -// fmter % 36; fmter % 77; -// std::cout << fmter.c_str(); -// -// But the overloading of % might be overkilling and it would be -// more efficient if it can write to std::cout directly. -// -// tinyformat has an interface compatible with the C-printf style, -// and it can writes to a stream or returns a std::string: -// -// std::cout << tfm::printf( -// "%s, %s %d, %.2d:%.2d\n", -// weekday, month, day, hour, min); -// -// or -// -// tfm::format(std::cout, -// "%s, %s %d, %.2d:%.2d\n", -// weekday, month, day, hour, min); -// -// 2. High-performance -- most printed strings are not too long and -// doens't need dynamic memory allocation. Many StringPrintf -// implementations doesn't enforce type-safe, but are -// high-performance, including -// -// https://developers.google.com/optimization/reference/base/stringprintf/ -// https://github.com/adobe/chromium/blob/master/base/stringprintf.h -// https://github.com/google/protobuf/blob/master/src/google/protobuf/stubs/stringprintf.h -// -// According to -// https://github.com/c42f/tinyformat#compile-time-and-code-bloat, -// boost::format runs too slow and results in large executable binary -// files. So here we port tinyformat. - #pragma once - -#include -#include -#include -#include - -#include "tinyformat/tinyformat.h" // https://github.com/c42f/tinyformat - -namespace paddle { -namespace string { - -template -void Fprintf(std::ostream& out, const char* fmt, const Args&... args) { - tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...)); -} - -inline std::string Sprintf() { return ""; } - -template -std::string Sprintf(const Args&... 
args) { - std::ostringstream oss; - Fprintf(oss, "%s", args...); - return oss.str(); -} - -template -std::string Sprintf(const char* fmt, const Args&... args) { - std::ostringstream oss; - Fprintf(oss, fmt, args...); - return oss.str(); -} - -template -void Printf(const char* fmt, const Args&... args) { - Fprintf(std::cout, fmt, args...); -} - -inline std::string HumanReadableSize(double f_size) { - size_t i = 0; - double orig = f_size; - const std::vector units( - {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"}); - while (f_size >= 1024) { - f_size /= 1024; - i++; - } - if (i >= units.size()) { - return Sprintf("%fB", orig); - } - return Sprintf("%f%s", f_size, units[i]); -} - -} // namespace string -} // namespace paddle +#include "paddle/utils/string/printf.h" diff --git a/paddle/fluid/string/split.h b/paddle/fluid/string/split.h index ccb96b8a9cb68f03acbca592a2149ba5001f34d2..d2a6f67ca75c15e746586ab0db97528c3fc88117 100644 --- a/paddle/fluid/string/split.h +++ b/paddle/fluid/string/split.h @@ -17,21 +17,4 @@ limitations under the License. */ #include #include -namespace paddle { -namespace string { - -static inline std::vector Split(std::string const& original, - char separator) { - std::vector results; - std::string token; - std::istringstream is(original); - while (std::getline(is, token, separator)) { - if (!token.empty()) { - results.push_back(token); - } - } - return results; -} - -} // namespace string -} // namespace paddle +#include "paddle/utils/string/split.h" diff --git a/paddle/fluid/string/string_helper.h b/paddle/fluid/string/string_helper.h index c52b7a99a777a7eac714e6533101368e35844c21..08a715bfbc76431362fadf5376cc4647f1487ebc 100644 --- a/paddle/fluid/string/string_helper.h +++ b/paddle/fluid/string/string_helper.h @@ -14,219 +14,4 @@ #pragma once -#include -#include -#include -#include -#include -#include - -#include "glog/logging.h" - -namespace paddle { -namespace string { - -inline size_t count_spaces(const char* s) { - size_t count = 0; - - while (*s != 0 && isspace(*s++)) { - count++; - } - - return count; -} - -inline size_t count_nonspaces(const char* s) { - size_t count = 0; - - while (*s != 0 && !isspace(*s++)) { - count++; - } - - return count; -} - -template -void format_string_append(std::string& str, const char* fmt, // NOLINT - ARGS&&... args) { - int len = snprintf(NULL, 0, fmt, args...); - CHECK_GE(len, 0); - size_t oldlen = str.length(); - str.resize(oldlen + len + 1); - - CHECK(snprintf(&str[oldlen], (size_t)len + 1, fmt, args...) == // NOLINT - len); - str.resize(oldlen + len); -} - -template -void format_string_append(std::string& str, const std::string& fmt, // NOLINT - ARGS&&... args) { - format_string_append(str, fmt.c_str(), args...); -} - -template -std::string format_string(const char* fmt, ARGS&&... args) { - std::string str; - format_string_append(str, fmt, args...); - return str; -} - -template -std::string format_string(const std::string& fmt, ARGS&&... args) { - return format_string(fmt.c_str(), args...); -} - -// remove leading and tailing spaces -std::string trim_spaces(const std::string& str); - -// erase all spaces in str -std::string erase_spaces(const std::string& str); - -inline int str_to_float(const char* str, float* v) { - const char* head = str; - char* cursor = NULL; - int index = 0; - while (*(head += count_spaces(head)) != 0) { - v[index++] = std::strtof(head, &cursor); - if (head == cursor) { - break; - } - head = cursor; - } - return index; -} - -// checks whether the test string is a suffix of the input string. 
-bool ends_with(std::string const& input, std::string const& test); - -// split string by delim -template -std::vector split_string(const std::string& str, const std::string& delim) { - size_t pre_pos = 0; - size_t pos = 0; - std::string tmp_str; - std::vector res_list; - res_list.clear(); - if (str.empty()) { - return res_list; - } - while ((pos = str.find(delim, pre_pos)) != std::string::npos) { - tmp_str.assign(str, pre_pos, pos - pre_pos); - res_list.push_back(tmp_str); - pre_pos = pos + 1; - } - tmp_str.assign(str, pre_pos, str.length() - pre_pos); - if (!tmp_str.empty()) { - res_list.push_back(tmp_str); - } - return res_list; -} - -// split string by spaces. Leading and tailing spaces are ignored. Consecutive -// spaces are treated as one delim. -template -std::vector split_string(const std::string& str) { - std::vector list; - const char* p; - int pre_pos = 0; - int pos = 0; - std::string tmp_str; - if (str.empty()) { - return list; - } - for (p = str.c_str(); *p != 0;) { - if (!isspace(*p)) { - pos = pre_pos; - p++; - - while (*p != 0 && !isspace(*p)) { - pos++; - p++; - } - tmp_str.assign(str, pre_pos, pos - pre_pos + 1); - list.push_back(tmp_str); - pre_pos = pos + 1; - } else { - pre_pos++; - p++; - } - } - return list; -} - -template -std::string join_strings(const Container& strs, char delim) { - std::string str; - - size_t i = 0; - for (auto& elem : strs) { - if (i > 0) { - str += delim; - } - - std::stringstream ss; - ss << elem; - str += ss.str(); - ++i; - } - - return str; -} - -template -std::string join_strings(const Container& strs, const std::string& delim) { - std::string str; - - size_t i = 0; - for (auto& elem : strs) { - if (i > 0) { - str += delim; - } - - std::stringstream ss; - ss << elem; - str += ss.str(); - ++i; - } - - return str; -} - -template -std::string join_strings(const Container& strs, DelimT&& delim, - ConvertFunc&& func) { - std::stringstream ss; - size_t i = 0; - for (const auto& elem : strs) { - if (i > 0) { - ss << delim; - } - ss << func(elem); - ++i; - } - - return ss.str(); -} - -// A helper class for reading lines from file. A line buffer is maintained. It -// doesn't need to know the maximum possible length of a line. - -class LineFileReader { - public: - LineFileReader() {} - LineFileReader(LineFileReader&&) = delete; - LineFileReader(const LineFileReader&) = delete; - ~LineFileReader() { ::free(_buffer); } - char* getline(FILE* f) { return this->getdelim(f, '\n'); } - char* getdelim(FILE* f, char delim); - char* get() { return _buffer; } - size_t length() { return _length; } - - private: - char* _buffer = NULL; - size_t _buf_size = 0; - size_t _length = 0; -}; -} // end namespace string -} // end namespace paddle +#include "paddle/utils/string/string_helper.h" diff --git a/paddle/fluid/string/to_string.h b/paddle/fluid/string/to_string.h index 7b3332861e0fa3edbbb8915e3e3f068fed3b412f..72d9c0379fd3aa3f9d1ee156cd19c13eb7001efa 100644 --- a/paddle/fluid/string/to_string.h +++ b/paddle/fluid/string/to_string.h @@ -13,48 +13,4 @@ See the License for the specific language governing permissions and limitations under the License. 
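Each paddle/fluid/string header above is reduced to a forwarding shim: its body is deleted and replaced by a single include of the relocated paddle/utils/string header, so existing call sites keep compiling. A minimal sketch of an unaffected consumer (the call uses Printf as declared in the removed code above):

    // A consumer written against the old location...
    #include "paddle/fluid/string/printf.h"

    int main() {
      // ...still compiles unchanged: the old header now only forwards to
      // paddle/utils/string/printf.h.
      paddle::string::Printf("%s %d\n", "forwarded", 42);
      return 0;
    }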
*/ #pragma once -#include -#include -#include -#include -#include - -namespace paddle { -namespace string { -inline std::ostream& operator<<(std::ostream& s, const std::type_index& t) { - s << t.name(); - return s; -} - -template ::value, int>::type = 0> -inline std::string to_string(T v) { - std::ostringstream sout; - sout << v; - return sout.str(); -} - -template ::value, int>::type = 0> -inline std::string to_string(T v) { - return std::to_string(static_cast(v)); -} - -template <> -inline std::string to_string(std::type_index t) { - return t.name(); -} - -// Faster std::string/const char* type -template <> -inline std::string to_string(std::string v) { - return v; -} - -template <> -inline std::string to_string(const char* v) { - return std::string(v); -} - -} // namespace string -} // namespace paddle +#include "paddle/utils/string/to_string.h" diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt index 671ed28313af9efe18e8aa8e7b525ae289446e5f..78e86c12cb4bbb10de52cc2aa46a7d0ff6ce7cd3 100644 --- a/paddle/pten/CMakeLists.txt +++ b/paddle/pten/CMakeLists.txt @@ -21,7 +21,7 @@ add_subdirectory(ops) add_subdirectory(tests) # make an unity target for compile deps -set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context arg_map_context infermeta lod_utils) +set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos) get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS) # keep this message for debug, remove it later if needless message(STATUS "All standard pten kernels: ${pten_kernels}") diff --git a/paddle/pten/api/include/kernel_signature.h b/paddle/pten/api/include/kernel_signature.h index b8e7b0d75bc6cb5d8458c4e0663bc4ff1cd1a732..863adbea36aa4ddc04bea5c76959a85e3d8acfb7 100644 --- a/paddle/pten/api/include/kernel_signature.h +++ b/paddle/pten/api/include/kernel_signature.h @@ -102,8 +102,8 @@ using scale_kernel = void (*)(const DeviceContext&, using sum_kernel = void (*)(const DeviceContext&, const DenseTensor&, const std::vector&, - bool, DataType, + bool, DenseTensor*); using subtract_kernel = void (*)(const DeviceContext&, diff --git a/paddle/pten/api/include/tensor.h b/paddle/pten/api/include/tensor.h index d2afd703eaf2a1827143fd6b6f47c6f42941c250..e93b9be7046a3cf9592898408575c2bdb4f378c2 100644 --- a/paddle/pten/api/include/tensor.h +++ b/paddle/pten/api/include/tensor.h @@ -505,6 +505,12 @@ class PADDLE_API Tensor final { * in the development of new dygraph. It may be removed in the future. */ std::string name_{""}; + + /** + * Place type: Return the expected memory location if the Tensor is + * uninitialized. 
+ */ + PlaceType place_{PlaceType::kUNK}; }; } // namespace experimental diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt index 1e645a68edfdfa8b09216860cb905a171a0258aa..d3088c4483427f93a47b54532cb186a71f9546f8 100644 --- a/paddle/pten/api/lib/CMakeLists.txt +++ b/paddle/pten/api/lib/CMakeLists.txt @@ -3,11 +3,11 @@ add_subdirectory(utils) cc_library(ext_compat_utils SRCS ext_compat_utils.cc DEPS place) if (WITH_GPU) - nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils enforce) + nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce) elseif (WITH_ROCM) - hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils enforce) + hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce) else() - cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils enforce) + cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce) endif() cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor pten_context kernel_factory) @@ -38,7 +38,7 @@ endif() add_custom_command( OUTPUT ${api_header_file} ${api_source_file} COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml - COMMAND ${PYTHON_EXECUTABLE} ${api_gen_file} + COMMAND ${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${api_yaml_file} --api_header_path ${api_header_file_tmp} --api_source_path ${api_source_file_tmp} @@ -51,7 +51,7 @@ add_custom_command( # generate backward api add_custom_command( OUTPUT ${bw_api_header_file} ${bw_api_source_file} ${bw_api_header_file_tmp} ${bw_api_source_file_tmp} - COMMAND ${PYTHON_EXECUTABLE} ${bw_api_gen_file} + COMMAND ${PYTHON_EXECUTABLE} ${bw_api_gen_file} --backward_yaml_path ${bw_api_yaml_file} --backward_header_path ${bw_api_header_file_tmp} --backward_source_path ${bw_api_source_file_tmp} @@ -63,4 +63,4 @@ add_custom_command( cc_library(utils_api SRCS utils.cc DEPS pten_tensor pten kernel_dispatch) cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch) -cc_library(pten_bw_function_api SRCS ${bw_api_source_file} DEPS pten_tensor pten kernel_dispatch backward_infermeta) +cc_library(pten_bw_function_api SRCS ${bw_api_source_file} DEPS pten_tensor pten kernel_dispatch backward_infermeta pten_function_api) diff --git a/paddle/pten/api/lib/op_meta_info.cc b/paddle/pten/api/lib/op_meta_info.cc index aa2e33afb94b84ca7fb34ebb6342d792b3afec44..82d465b4c21fcac5cb593c4d246421dd3378a275 100644 --- a/paddle/pten/api/lib/op_meta_info.cc +++ b/paddle/pten/api/lib/op_meta_info.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/custom_operator.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" namespace paddle { @@ -74,7 +74,7 @@ OpMetaInfoBuilder::OpMetaInfoBuilder(std::string&& name, size_t index) { PADDLE_ENFORCE_EQ( info_vector.size(), index_, - platform::errors::PreconditionNotMet( + pten::errors::PreconditionNotMet( "The operator %s's meta info register failed. 
" "Please make sure you call marcos as order `PD_BUILD_OP`, " "`PD_BUILD_GRAD_OP`, `PD_BUILD_DOUBLE_GRAD_OP`.", @@ -88,7 +88,7 @@ OpMetaInfoBuilder::OpMetaInfoBuilder(std::string&& name, size_t index) { case 2: name_ = name_ + "_grad_grad"; default: - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(pten::errors::InvalidArgument( "Not support index `%d` when construct OpMetaInfoBuilder, " "now only support `0, 1, 2`.", index_)); @@ -130,7 +130,7 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferDtypeFn(InferDtypeFunc func) { PADDLE_ENFORCE_EQ( index_, 0UL, - platform::errors::Unimplemented( + pten::errors::Unimplemented( "Currently, the InferDtypeFn setting of Grad Op is not supported, " "And backward Tensor `X@GRAD` will use the dtype of forward Tensor " "`X` by default.")); diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc index 3389dacec36a5c5515fd95c66f7a39ea27d5fc40..02fd918d799be6c226da73813efecd930b9bb56b 100644 --- a/paddle/pten/api/lib/tensor.cc +++ b/paddle/pten/api/lib/tensor.cc @@ -48,12 +48,12 @@ limitations under the License. */ * or the corresponding components will be re-implemented. */ #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream/cuda_stream.h" #include "paddle/pten/common/complex.h" #include "paddle/pten/common/float16.h" #include "paddle/pten/core/ddim.h" +#include "paddle/pten/core/enforce.h" namespace paddle { namespace experimental { @@ -68,7 +68,7 @@ Tensor cast(const Tensor &x, DataType out_dtype); Tensor::Tensor(std::shared_ptr tensor_impl) : impl_(std::move(tensor_impl)) { PADDLE_ENFORCE_NOT_NULL(impl_, - platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "TensorImpl with nullptr is not supported")); } @@ -78,7 +78,8 @@ Tensor::Tensor(const PlaceType &place) ConvertExtPlaceToInnerPlace(place))), std::move(pten::DenseTensorMeta(pten::DataType::UNDEFINED, framework::make_ddim({}), - pten::DataLayout::NCHW))))) {} + pten::DataLayout::NCHW))))), + place_{place} {} Tensor::Tensor(const PlaceType &place, const std::vector &shape) : impl_(std::move(std::make_shared( @@ -86,7 +87,8 @@ Tensor::Tensor(const PlaceType &place, const std::vector &shape) ConvertExtPlaceToInnerPlace(place))), std::move(pten::DenseTensorMeta(pten::DataType::UNDEFINED, framework::make_ddim(shape), - pten::DataLayout::NCHW))))) {} + pten::DataLayout::NCHW))))), + place_{place} {} /* Part 2: Dimension, DataType and DataLayout methods */ @@ -113,7 +115,7 @@ void Tensor::reshape(const std::vector &shape) { std::dynamic_pointer_cast(impl_)->set_meta( pten::DenseTensorMeta(dtype(), framework::make_ddim(shape))); } else { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(pten::errors::Unimplemented( "Only support reshape operation on DenseTensor now.")); } } @@ -131,17 +133,23 @@ bool Tensor::is_dense_tensor() const { /* Part 3: Device and Backend methods */ PlaceType Tensor::place() const { - return ConvertInnerPlaceToExtPlace(impl_->place()); + if (!impl_->initialized()) { + return place_; + } else { + return ConvertInnerPlaceToExtPlace(impl_->place()); + } } -paddle::platform::Place Tensor::inner_place() const { return impl_->place(); } +paddle::platform::Place Tensor::inner_place() const { + return ConvertExtPlaceToInnerPlace(place()); +} bool Tensor::is_cpu() const { - return paddle::platform::is_cpu_place(impl_->place()); + return paddle::platform::is_cpu_place(inner_place()); } bool Tensor::is_cuda() const { - 
return paddle::platform::is_gpu_place(impl_->place()); + return paddle::platform::is_gpu_place(inner_place()); } /* Part 4: Data Access methods */ @@ -177,8 +185,8 @@ T *Tensor::mutable_data(const PlaceType &place) { PADDLE_ENFORCE_EQ( platform::is_same_place(inner_place, impl_->place()), true, - platform::errors::Unimplemented("Modification of tensor place through " - "mutable_data is not supported now")); + pten::errors::Unimplemented("Modification of tensor place through " + "mutable_data is not supported now")); } if (is_dense_tensor()) { return std::dynamic_pointer_cast(impl_)->mutable_data( @@ -236,7 +244,7 @@ Tensor::data() const; template T *Tensor::data() { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(pten::errors::Unimplemented( "It is not currently supported to directly obtain the modifiable data " "address through the tensor::data() method, please use the " "tensor::mutable_data() method.")); @@ -267,7 +275,7 @@ Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const { begin_idx, end_idx)))); } else { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(pten::errors::Unimplemented( "Only support slice operation on DenseTensor now.")); } } diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index e9f5ec2d05727adde9cee1c7ad32595f914bbdde..971476a55db935af616257168b2925d1a23cb603 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -261,10 +261,10 @@ std::unique_ptr MakePtenTensorBaseFromVar( } else { return MakePtenDenseTensor(tensor); } - } else if (variable.IsType()) { + } else if (variable.IsType()) { // TODO(chenweihang): now we don't deal with row and height // by xiaowei's advice - const auto& tensor = variable.Get(); + const auto& tensor = variable.Get(); if (!platform::is_same_place(tensor.value().place(), expected_place)) { framework::Tensor tmp_tensor; paddle::framework::TensorCopySync( @@ -289,8 +289,8 @@ std::unique_ptr MakePtenTensorBaseFromVar( if (variable->template IsType()) { auto* tensor = variable->template GetMutable(); return MakePtenDenseTensor(*tensor, arg_def); - } else if (variable->template IsType()) { - auto* tensor = variable->template GetMutable(); + } else if (variable->template IsType()) { + auto* tensor = variable->template GetMutable(); // TODO(chenweihang): adapt SelectedRows by xiaowei's design, // here the row and height will lost in output! 
return MakePtenDenseTensor(tensor->value(), arg_def); @@ -389,8 +389,8 @@ void MakeVariableFromPtenTensor(pten::DenseTensor* src, tensor->set_type(dtype); } - } else if (variable->IsType()) { - auto* tensor = variable->GetMutable(); + } else if (variable->IsType()) { + auto* tensor = variable->GetMutable(); auto dtype = pten::TransToProtoVarType(src->dtype()); if (!tensor->value().IsInitialized()) { diff --git a/paddle/pten/backends/CMakeLists.txt b/paddle/pten/backends/CMakeLists.txt index 3587910ff506e572ebeead963015a8c9591388b7..e9f222d642ea0438bbee1532bf746bd0324d3f4b 100644 --- a/paddle/pten/backends/CMakeLists.txt +++ b/paddle/pten/backends/CMakeLists.txt @@ -2,4 +2,12 @@ add_subdirectory(dynload) add_subdirectory(cpu) -cc_library(pten_context SRCS all_context.cc DEPS device_context) +if(WITH_XPU) + add_subdirectory(xpu) +endif() + +cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context) + +if(WITH_XPU) + add_dependencies(pten_context xpu_context) +endif() diff --git a/paddle/pten/backends/cpu/CMakeLists.txt b/paddle/pten/backends/cpu/CMakeLists.txt index 62eff2dedc99c8dcc54c0f1372e3b65e36c3e9f9..965b33f3800edf9597b07ad2446637d2c505fe0f 100644 --- a/paddle/pten/backends/cpu/CMakeLists.txt +++ b/paddle/pten/backends/cpu/CMakeLists.txt @@ -1,6 +1,6 @@ if(WITH_MKLDNN) # TODO(wilber): support mkldnn context. - cc_library(cpu_context SRCS cpu_context.cc DEPS pten_device_context mkldnn) + cc_library(cpu_context SRCS cpu_context.cc DEPS pten_device_context mkldnn eigen3) else() - cc_library(cpu_context SRCS cpu_context.cc DEPS pten_device_context) + cc_library(cpu_context SRCS cpu_context.cc DEPS pten_device_context eigen3) endif() diff --git a/paddle/pten/backends/cpu/cpu_context.cc b/paddle/pten/backends/cpu/cpu_context.cc index e749dfb9bd70e3594766f5399848c4114ee83ca2..efce128596b8123029787b6e4ba187c464d26cb9 100644 --- a/paddle/pten/backends/cpu/cpu_context.cc +++ b/paddle/pten/backends/cpu/cpu_context.cc @@ -18,16 +18,11 @@ // NOTE: The paddle framework should add WITH_EIGEN option to support compile // without eigen. -#include "paddle/pten/core/device_context.h" #include "unsupported/Eigen/CXX11/Tensor" namespace pten { struct CPUContext::CPUImpl { - Eigen::DefaultDevice* device_{nullptr}; - CPUContextResource res_; - CPUPlace place_; - CPUImpl() { device_ = new Eigen::DefaultDevice(); } // Users need to manage external resources. 
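The hunks around this point change CPUContext::CPUImpl so the Eigen device is deleted only when it was created internally, that is, when res_.device is null. A hedged sketch of the externally managed path (the CPUContextResource field name is inferred from the destructor check below; the header path is an assumption):

    #include "paddle/pten/backends/cpu/cpu_context.h"  // assumed header path
    #include "unsupported/Eigen/CXX11/Tensor"

    void UseExternallyOwnedDevice() {
      Eigen::DefaultDevice device;   // caller-owned resource
      pten::CPUContextResource res;
      res.device = &device;          // non-null: CPUImpl must not delete it
      pten::CPUContext ctx(res);     // uses the caller's Eigen device
      // ... run CPU work through ctx ...
    }  // ctx is destroyed first, then device, matching the ownership rule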
@@ -36,7 +31,7 @@ struct CPUContext::CPUImpl {
   }
 
   ~CPUImpl() {
-    if (res_.device == nullptr) {
+    if (res_.device == nullptr && device_ != nullptr) {
       delete device_;
       device_ = nullptr;
     }
@@ -56,27 +51,28 @@ struct CPUContext::CPUImpl {
   }
 
   Place GetPlace() const { return place_; }
+
+  Eigen::DefaultDevice* device_{nullptr};
+  CPUContextResource res_;
+  CPUPlace place_;
 };
 
-CPUContext::CPUContext() : DeviceContext(), cpu_impl_(nullptr) {
+CPUContext::CPUContext() : DeviceContext() {
   cpu_impl_ = std::make_unique<CPUImpl>();
 }
 
-CPUContext::CPUContext(const CPUContext& other)
-    : DeviceContext(), cpu_impl_(nullptr) {
+CPUContext::CPUContext(const CPUContext& other) : DeviceContext() {
   cpu_impl_ = std::make_unique<CPUImpl>();
   cpu_impl_->SetEigenDevice(other.eigen_device());
 }
 
-CPUContext::CPUContext(CPUContext&& other)
-    : DeviceContext(), cpu_impl_(nullptr) {
+CPUContext::CPUContext(CPUContext&& other) : DeviceContext() {
   cpu_impl_ = std::move(other.cpu_impl_);
 }
 
 CPUContext::~CPUContext() = default;
 
-CPUContext::CPUContext(const CPUContextResource& ctx_res)
-    : DeviceContext(), cpu_impl_(nullptr) {
+CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() {
   cpu_impl_ = std::make_unique<CPUImpl>(ctx_res);
 }
diff --git a/paddle/pten/backends/xpu/CMakeLists.txt b/paddle/pten/backends/xpu/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..65341dd206fd30c318eb72cb74c4ad3ac4ae212b
--- /dev/null
+++ b/paddle/pten/backends/xpu/CMakeLists.txt
@@ -0,0 +1,2 @@
+cc_library(pten_xpu_info SRCS xpu_info.cc DEPS enforce xpulib pten_place)
+cc_library(xpu_context SRCS xpu_context.cc DEPS pten_device_context pten_xpu_info)
diff --git a/paddle/pten/backends/xpu/enforce_xpu.h b/paddle/pten/backends/xpu/enforce_xpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..38aeff198d44bf98dd44edb640f9f46a6d8bd123
--- /dev/null
+++ b/paddle/pten/backends/xpu/enforce_xpu.h
@@ -0,0 +1,194 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
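The new enforce_xpu.h below maps raw XPU, BKCL, and XDNN status codes to readable strings. A hedged sketch of how a call site might surface them through the existing enforce machinery (CheckXpuStatus is illustrative, not part of this patch):

    #include "paddle/pten/backends/xpu/enforce_xpu.h"

    inline void CheckXpuStatus(int stat) {
      if (stat != XPU_SUCCESS) {
        // build_xpu_error_msg(stat) yields e.g. "XPU Error <N>, <reason>".
        PADDLE_THROW(paddle::platform::errors::External(
            "%s", pten::backends::xpu::build_xpu_error_msg(stat)));
      }
    }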
+
+#pragma once
+
+#include "paddle/pten/backends/xpu/xpu_header.h"
+#include "xpu/bkcl.h"
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace pten {
+namespace backends {
+namespace xpu {
+
+// Note: XPU runtime APIs return int, not XPUError_t
+inline const char* xpuGetErrorString(int stat) {
+  switch (stat) {
+    case XPU_SUCCESS:
+      return "Success";
+    case XPUERR_INVALID_DEVICE:
+      return "Invalid XPU device";
+    case XPUERR_UNINIT:
+      return "XPU runtime not properly inited";
+    case XPUERR_NOMEM:
+      return "Device memory not enough";
+    case XPUERR_NOCPUMEM:
+      return "CPU memory not enough";
+    case XPUERR_INVALID_PARAM:
+      return "Invalid parameter";
+    case XPUERR_NOXPUFUNC:
+      return "Cannot get XPU Func";
+    case XPUERR_LDSO:
+      return "Error loading dynamic library";
+    case XPUERR_LDSYM:
+      return "Error loading func from dynamic library";
+    case XPUERR_SIMULATOR:
+      return "Error from XPU Simulator";
+    case XPUERR_NOSUPPORT:
+      return "Operation not supported";
+    case XPUERR_ABNORMAL:
+      return "Device abnormal due to previous error";
+    case XPUERR_KEXCEPTION:
+      return "Exception in kernel execution";
+    case XPUERR_TIMEOUT:
+      return "Kernel execution timed out";
+    case XPUERR_BUSY:
+      return "Resource busy";
+    case XPUERR_USEAFCLOSE:
+      return "Use a stream after closed";
+    case XPUERR_UCECC:
+      return "Uncorrectable ECC";
+    case XPUERR_OVERHEAT:
+      return "Overheat";
+    case XPUERR_UNEXPECT:
+      return "Execution error, reach unexpected control flow";
+    case XPUERR_DEVRESET:
+      return "Device is being reset, try again later";
+    case XPUERR_HWEXCEPTION:
+      return "Hardware module exception";
+    case XPUERR_HBM_INIT:
+      return "Error init HBM";
+    case XPUERR_DEVINIT:
+      return "Error init device";
+    case XPUERR_PEERRESET:
+      return "Device is being reset, try again later";
+    case XPUERR_MAXDEV:
+      return "Device count exceed limit";
+    case XPUERR_NOIOC:
+      return "Unknown IOCTL command";
+    case XPUERR_DMATIMEOUT:
+      return "DMA timed out, a reboot maybe needed";
+    case XPUERR_DMAABORT:
+      return "DMA aborted due to error, possibly wrong address or hardware "
+             "state";
+    case XPUERR_MCUUNINIT:
+      return "Firmware not initialized";
+    case XPUERR_OLDFW:
+      return "Firmware version too old (<15), please update.";
+    case XPUERR_PCIE:
+      return "Error in PCIE";
+    case XPUERR_FAULT:
+      return "Error copy between kernel and user space";
+    case XPUERR_INTERRUPTED:
+      return "Execution interrupted by user";
+    default:
+      return "unknown error";
+  }
+}
+
+inline const char* bkclGetErrorString(BKCLResult_t stat) {
+  switch (stat) {
+    case BKCL_SUCCESS:
+      return "BKCL_SUCCESS";
+    case BKCL_INVALID_ARGUMENT:
+      return "BKCL_INVALID_ARGUMENT";
+    case BKCL_RUNTIME_ERROR:
+      return "BKCL_RUNTIME_ERROR";
+    case BKCL_SYSTEM_ERROR:
+      return "BKCL_SYSTEM_ERROR";
+    case BKCL_INTERNAL_ERROR:
+      return "BKCL_INTERNAL_ERROR";
+    default:
+      return "Unknown BKCL status";
+  }
+}
+
+inline const char* xdnnGetErrorString(int stat) {
+  switch (stat) {
+    case baidu::xpu::api::Error_t::SUCCESS:
+      return "XDNN_SUCCESS";
+    case baidu::xpu::api::Error_t::INVALID_PARAM:
+      return "XDNN_INVALID_PARAM";
+    case baidu::xpu::api::Error_t::RUNTIME_ERROR:
+      return "XDNN_RUNTIME_ERROR";
+    case baidu::xpu::api::Error_t::NO_ENOUGH_WORKSPACE:
+      return "XDNN_NO_ENOUGH_WORKSPACE";
+    case baidu::xpu::api::Error_t::NOT_IMPLEMENT:
+      return "XDNN_NOT_IMPLEMENT";
+    default:
+      return "Unknown XDNN status";
+  }
+}
+
+inline std::string build_xpu_error_msg(int stat) {
+  std::string msg("XPU Error <" + std::to_string(stat) + ">, ");
+  return msg + xpuGetErrorString(stat) + " ";
+}
+
+inline std::string build_xpu_error_msg(BKCLResult_t stat) {
+  std::string msg("BKCL Error, ");
+  return msg + bkclGetErrorString(stat) + " ";
+}
+
+inline std::string build_xpu_xdnn_error_msg(int stat, std::string msg) {
+  return msg + " XDNN Error, " + xdnnGetErrorString(stat) + " ";
+}
+
+namespace details {
+
+template <typename T>
+struct ExternalApiType {};
+
+#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \
+  template <>                                         \
+  struct ExternalApiType<type> {                      \
+    using Type = type;                                \
+    static constexpr Type kSuccess = success_value;   \
+  }
+
+DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS);
+DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS);
+
+#undef DEFINE_EXTERNAL_API_TYPE
+
+}  // namespace details
+
+#define PADDLE_ENFORCE_XPU_SUCCESS(COND)                           \
+  do {                                                             \
+    auto __cond__ = (COND);                                        \
+    using __XPU_STATUS_TYPE__ = decltype(__cond__);                \
+    constexpr auto __success_type__ =                              \
+        ::pten::backends::xpu::details::ExternalApiType<           \
+            __XPU_STATUS_TYPE__>::kSuccess;                        \
+    if (UNLIKELY(__cond__ != __success_type__)) {                  \
+      auto __summary__ = paddle::platform::errors::External(       \
+          ::pten::backends::xpu::build_xpu_error_msg(__cond__));   \
+      __THROW_ERROR_INTERNAL__(__summary__);                       \
+    }                                                              \
+  } while (0)
+
+#define PADDLE_ENFORCE_XDNN_SUCCESS(COND, MSG)                             \
+  do {                                                                     \
+    auto __cond__ = (COND);                                                \
+    if (UNLIKELY(__cond__ != baidu::xpu::api::Error_t::SUCCESS)) {         \
+      auto __summary__ = paddle::platform::errors::External(               \
+          ::pten::backends::xpu::build_xpu_xdnn_error_msg(__cond__, MSG)); \
+      __THROW_ERROR_INTERNAL__(__summary__);                               \
+    }                                                                      \
+  } while (0)
+
+}  // namespace xpu
+}  // namespace backends
+}  // namespace pten
diff --git a/paddle/pten/backends/xpu/forwards.h b/paddle/pten/backends/xpu/forwards.h
new file mode 100644
index 0000000000000000000000000000000000000000..805a74865b6d8c62019f593160c19cc661962b01
--- /dev/null
+++ b/paddle/pten/backends/xpu/forwards.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// Forward-declares.
+#pragma once
+
+// Forward declaration of xpu context.
+namespace baidu {
+namespace xpu {
+namespace api {
+
+struct Context;
+typedef void* BKCLContext_t;
+
+}  // namespace api
+}  // namespace xpu
+}  // namespace baidu
diff --git a/paddle/pten/backends/xpu/xpu_context.cc b/paddle/pten/backends/xpu/xpu_context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..af4478662a53b8b657bab02e21eb9282fd4189ac
--- /dev/null
+++ b/paddle/pten/backends/xpu/xpu_context.cc
@@ -0,0 +1,169 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/backends/xpu/xpu_context.h"
+#include <memory>
+#include "paddle/pten/api/ext/exception.h"
+
+#include "xpu/runtime.h"
+#include "xpu/runtime_ex.h"
+#include "xpu/xdnn.h"
+
+namespace xpu = baidu::xpu::api;
+
+namespace pten {
+
+struct XPUContext::XPUImpl {
+  void SetL3Cache() {
+    const int MAX_XPU_NUM = 16;
+    static void* l3ptrs[MAX_XPU_NUM] = {nullptr};
+
+    int l3_size = 13.5 * 1024 * 1024;
+    if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) {
+      l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE"));
+    }
+
+    auto selected_xpus = backends::xpu::GetXPUSelectedDevices();
+    for (unsigned int i = 0; i < selected_xpus.size(); i++) {
+      if (place_.GetDeviceId() == selected_xpus[i]) {
+        if (l3ptrs[place_.GetDeviceId()] == nullptr) {
+          xpu_malloc(static_cast<void**>(&l3ptrs[place_.GetDeviceId()]),
+                     l3_size,
+                     XPU_MEM_L3);
+        }
+        if (l3ptrs[place_.GetDeviceId()] != nullptr) {
+          context_->_l3_mgr.set(l3ptrs[place_.GetDeviceId()], l3_size);
+          VLOG(3) << "xpu place " << place_.GetDeviceId() << " set l3 size "
+                  << l3_size;
+        }
+        break;
+      }
+    }
+  }
+
+  XPUImpl() {
+    context_ = xpu::create_context();
+    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
+  }
+
+  explicit XPUImpl(XPUPlace place) : place_(place) {
+    backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
+
+    LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
+                            << static_cast<int>(place_.device);
+
+    context_ = xpu::create_context();
+    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
+    SetL3Cache();
+  }
+
+  // Users need to manage external resources.
+  explicit XPUImpl(const XPUContextResource& ctx_res,
+                   const XPUPlace& place = XPUPlace(0))
+      : res_(ctx_res), place_(place) {
+    context_ = res_.context;
+    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
+    SetL3Cache();
+  }
+
+  ~XPUImpl() {
+    if (res_.context == nullptr && context_ != nullptr) {
+      xpu::destroy_context(context_);
+      context_ = nullptr;
+    }
+  }
+
+  Place GetPlace() const { return place_; }
+
+  backends::xpu::XPUVersion GetXpuVersion() const { return xpu_version_; }
+
+  xpu::Context* GetXContext() const {
+    PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
+    return context_;
+  }
+
+  xpu::BKCLContext_t GetBkclContext() const { return bkcl_context_; }
+
+  void Wait() const {
+    backends::xpu::SetXPUDeviceId(place_.GetDeviceId());
+    PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
+    xpu_wait(context_->xpu_stream);
+  }
+
+  void SetXContext(xpu::Context* context) {
+    if (context == nullptr) {
+      return;
+    }
+    res_.context = context;
+    context_ = context;
+  }
+
+  void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; }
+
+  XPUContextResource res_;
+  XPUPlace place_;
+  backends::xpu::XPUVersion xpu_version_;
+  xpu::Context* context_{nullptr};
+  // NOTE: Distributed communicator; the distributed framework manages its
+  // resources, XPUContext only holds references.
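+  // (For example, a collective-training component would create the BKCL
+  // communicator itself and inject it through SetBkclContext(); note that
+  // ~XPUImpl above never destroys bkcl_context_.)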
+  xpu::BKCLContext_t bkcl_context_{nullptr};
+};
+
+XPUContext::XPUContext() : DeviceContext() {
+  impl_ = std::make_unique<XPUImpl>();
+}
+
+XPUContext::XPUContext(const XPUPlace& place) {
+  impl_ = std::make_unique<XPUImpl>(place);
+}
+
+XPUContext::XPUContext(const XPUContext& other) : DeviceContext() {
+  impl_ = std::make_unique<XPUImpl>();
+  impl_->SetXContext(other.x_context());
+  impl_->SetBkclContext(other.bkcl_context());
+}
+
+XPUContext::XPUContext(XPUContext&& other) : DeviceContext() {
+  impl_ = std::move(other.impl_);
+}
+
+XPUContext::~XPUContext() = default;
+
+XPUContext::XPUContext(const XPUContextResource& ctx_res) : DeviceContext() {
+  impl_ = std::make_unique<XPUImpl>(ctx_res);
+}
+
+Place XPUContext::GetPlace() const { return impl_->GetPlace(); }
+
+backends::xpu::XPUVersion XPUContext::xpu_version() const {
+  return impl_->GetXpuVersion();
+}
+
+xpu::Context* XPUContext::x_context() const { return impl_->GetXContext(); }
+
+xpu::BKCLContext_t XPUContext::bkcl_context() const {
+  return impl_->GetBkclContext();
+}
+
+void XPUContext::Wait() const { impl_->Wait(); }
+
+void XPUContext::set_x_context(xpu::Context* context) {
+  impl_->SetXContext(context);
+}
+
+void XPUContext::set_bkcl_context(xpu::BKCLContext_t context) {
+  impl_->SetBkclContext(context);
+}
+
+}  // namespace pten
diff --git a/paddle/pten/backends/xpu/xpu_context.h b/paddle/pten/backends/xpu/xpu_context.h
index 94d2a1532f6365bdb4e916adc54a32f3b5f492f3..4ae5786211dd21718a0e72d53f742fd6ae599170 100644
--- a/paddle/pten/backends/xpu/xpu_context.h
+++ b/paddle/pten/backends/xpu/xpu_context.h
@@ -14,13 +14,60 @@ limitations under the License. */
 
 #pragma once
 
-#ifdef PADDLE_WITH_XPU
+#include <memory>
+#include "paddle/pten/backends/xpu/forwards.h"
+#include "paddle/pten/common/place.h"
+#include "paddle/pten/core/device_context.h"
 
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/backends/xpu/xpu_header.h"
+#include "paddle/pten/backends/xpu/xpu_info.h"
+
+namespace xpu = baidu::xpu::api;
 
 namespace pten {
 
-using XPUContext = paddle::platform::XPUDeviceContext;
-}  // namespace pten
-#endif  // PADDLE_WITH_XPU
+struct XPUContextResource {
+  xpu::Context* context{nullptr};
+};
+
+class XPUContext : public DeviceContext {
+ public:
+  // NOTE: DeviceContext holds resources. Used in training scenarios.
+  XPUContext();
+
+  explicit XPUContext(const XPUPlace&);
+
+  // NOTE: Shares the same underlying resources; please make sure the
+  // resources are not released early.
+  XPUContext(const XPUContext&);
+
+  XPUContext(XPUContext&&);
+
+  virtual ~XPUContext();
+
+  Place GetPlace() const override;
+
+  backends::xpu::XPUVersion xpu_version() const;
+
+  xpu::Context* x_context() const;
+
+  // Return bkcl context.
+  xpu::BKCLContext_t bkcl_context() const;
+
+  // Wait for completion of all operations in the stream.
+  void Wait() const override;
+
+ public:
+  // NOTE: External users manage the resources. Used in inference scenarios.
+  explicit XPUContext(const XPUContextResource&);
+
+  void set_x_context(xpu::Context*);
+
+  void set_bkcl_context(xpu::BKCLContext_t context);
+
+ private:
+  struct XPUImpl;
+  std::unique_ptr<XPUImpl> impl_;
+};
+
+}  // namespace pten
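A sketch of the two ownership modes this class supports (illustration only, not part of the patch; `external_context` stands for an xpu::Context created elsewhere):

  // Training: the context creates and owns its xpu::Context.
  pten::XPUContext owned(pten::XPUPlace(0));

  // Inference: the caller supplies the xpu::Context via XPUContextResource;
  // XPUContext borrows it and never destroys it (see ~XPUImpl above).
  pten::XPUContextResource res;
  res.context = external_context;
  pten::XPUContext borrowed(res);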
diff --git a/paddle/pten/backends/xpu/xpu_header.h b/paddle/pten/backends/xpu/xpu_header.h
new file mode 100644
index 0000000000000000000000000000000000000000..99e4a06720f22b2993b395ab4ce7ec9585bf3ea2
--- /dev/null
+++ b/paddle/pten/backends/xpu/xpu_header.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+#include <map>
+#include <string>
+#include <unordered_map>
+
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/common/bfloat16.h"
+#include "paddle/pten/common/float16.h"
+
+#include "xpu/runtime.h"
+#include "xpu/runtime_ex.h"
+#include "xpu/xdnn.h"
+
+namespace xpu = baidu::xpu::api;
+
+static std::map<int, std::string> XPUAPIErrorMsg = {
+    {xpu::Error_t::SUCCESS, "xpu api success"},
+    {xpu::Error_t::INVALID_PARAM, "xpu api invalid param"},
+    {xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"},
+    {xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}};
+
+template <typename T>
+class XPUTypeTrait {
+ public:
+  using Type = T;
+};
+
+template <>
+class XPUTypeTrait<pten::dtype::float16> {
+ public:
+  using Type = float16;
+};
+
+template <>
+class XPUTypeTrait<pten::dtype::bfloat16> {
+ public:
+  using Type = bfloat16;
+};
+
+#endif
diff --git a/paddle/pten/backends/xpu/xpu_info.cc b/paddle/pten/backends/xpu/xpu_info.cc
new file mode 100644
index 0000000000000000000000000000000000000000..01d23be848bde82445498bab23ff56ce971660f8
--- /dev/null
+++ b/paddle/pten/backends/xpu/xpu_info.cc
@@ -0,0 +1,199 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/pten/backends/xpu/xpu_info.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <string>
+
+#include "paddle/pten/backends/xpu/enforce_xpu.h"
+#include "paddle/pten/backends/xpu/xpu_context.h"
+#include "paddle/pten/backends/xpu/xpu_header.h"
+#include "paddle/pten/common/place.h"
+
+// TODO(wilber): The pten computing library requires a component to manage
+// flags.
+#include "paddle/fluid/platform/flags.h"
+
+PADDLE_DEFINE_EXPORTED_string(
+    selected_xpus,
+    "",
+    "A list of device ids separated by comma, like: 0,1,2,3. "
+    "This option is useful when doing multi process training and "
+    "each process has only one device (XPU). If you want to use "
+    "all visible devices, set this to an empty string. NOTE: the "
+    "reason for doing this is that we want to use P2P communication "
+    "between XPU devices, while XPU_VISIBLE_DEVICES only supports "
+    "shared memory.");
+
+namespace pten {
+class XPUContext;
+
+namespace backends {
+namespace xpu {
+
+/**************************** Version Management **************************/
+
+//! Get the version of XPU Driver
+int GetDriverVersion() {
+  uint32_t driver_version_major = 0;
+  uint32_t driver_version_minor = 0;
+  PADDLE_ENFORCE_XPU_SUCCESS(
+      xpu_get_driver_version(&driver_version_major, &driver_version_minor));
+  int driver_version = driver_version_major * 10 + driver_version_minor;
+  return driver_version;
+}
+
+//! Get the version of XPU Runtime
+int GetRuntimeVersion() {
+  uint32_t runtime_version_major = 0;
+  uint32_t runtime_version_minor = 0;
+  PADDLE_ENFORCE_XPU_SUCCESS(
+      xpu_get_runtime_version(&runtime_version_major, &runtime_version_minor));
+  int runtime_version = runtime_version_major * 10 + runtime_version_minor;
+  return runtime_version;
+}
+
+/**************************** Device Management **************************/
+
+static int GetDeviceCountImpl() {
+  const auto* xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES");
+  if (xpu_visible_devices != nullptr) {
+    std::string xpu_visible_devices_str(xpu_visible_devices);
+    if (std::all_of(xpu_visible_devices_str.begin(),
+                    xpu_visible_devices_str.end(),
+                    [](char ch) { return ch == ' '; })) {
+      VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected.";
+      return 0;
+    }
+  }
+
+  int count = 0;
+  PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_count(&count));
+  return count;
+}
+
+int GetXPUDeviceCount() {
+  static auto dev_cnt = GetDeviceCountImpl();
+  return dev_cnt;
+}
+
+int GetXPUCurrentDeviceId() {
+  int dev_id;
+  PADDLE_ENFORCE_XPU_SUCCESS(xpu_current_device(&dev_id));
+  if (dev_id >= 64) {
+    // if dev_id >= 64, the device is a simulator device; subtract 64 to get
+    // the real dev_id
+    dev_id -= 64;
+  }
+  return dev_id;
+}
+
+void SetXPUDeviceId(int id) {
+  PADDLE_ENFORCE_LT(
+      id,
+      GetXPUDeviceCount(),
+      paddle::platform::errors::InvalidArgument(
+          "id must be less than XPU count"));
+  PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id));
+}
+
+static inline std::vector<std::string> Split(std::string const& original,
+                                             char separator) {
+  std::vector<std::string> results;
+  std::string token;
+  std::istringstream is(original);
+  while (std::getline(is, token, separator)) {
+    if (!token.empty()) {
+      results.push_back(token);
+    }
+  }
+  return results;
+}
+
+//! Get a list of device ids from environment variable or use all.
+std::vector<int> GetXPUSelectedDevices() {
+  // use user specified XPUs in single-node multi-process mode.
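+  // e.g. FLAGS_selected_xpus="0,2" selects devices 0 and 2; when the flag is
+  // empty, every detected device is used (see the fallback branch below).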
+  std::vector<int> devices;
+  if (!FLAGS_selected_xpus.empty()) {
+    auto devices_str = Split(FLAGS_selected_xpus, ',');
+    for (auto id : devices_str) {
+      devices.push_back(atoi(id.c_str()));
+    }
+  } else {
+    int count = GetXPUDeviceCount();
+    for (int i = 0; i < count; ++i) {
+      devices.push_back(i);
+    }
+  }
+  return devices;
+}
+
+/**************************** Memory Management **************************/
+
+void MemcpySyncH2D(void* dst,
+                   const void* src,
+                   size_t count,
+                   const pten::XPUPlace& dst_place) {
+  XPUDeviceGuard guard(dst_place.device);
+  PADDLE_ENFORCE_XPU_SUCCESS(
+      xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE));
+}
+
+void MemcpySyncD2H(void* dst,
+                   const void* src,
+                   size_t count,
+                   const pten::XPUPlace& src_place,
+                   const pten::XPUContext& dev_ctx) {
+  XPUDeviceGuard guard(src_place.GetDeviceId());
+  dev_ctx.Wait();
+  PADDLE_ENFORCE_XPU_SUCCESS(
+      xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST));
+}
+
+// If src.device == dst.device and you need synchronization, call xpu_wait()
+// after this function returns.
+void MemcpySyncD2D(void* dst,
+                   const pten::XPUPlace& dst_place,
+                   const void* src,
+                   const pten::XPUPlace& src_place,
+                   size_t count,
+                   const pten::XPUContext& dev_ctx) {
+  int dev_id = GetXPUCurrentDeviceId();
+  if (dst_place.device == dev_id && src_place.device == dev_id) {
+    PADDLE_ENFORCE_XDNN_SUCCESS(
+        baidu::xpu::api::copy(dev_ctx.x_context(),
+                              static_cast<const int8_t*>(src),
+                              static_cast<int8_t*>(dst),
+                              count),
+        "copy ");
+  } else {
+    PADDLE_ENFORCE_XPU_SUCCESS(
+        xpu_memcpy_peer(dst_place.device, dst, src_place.device, src, count));
+  }
+}
+
+/**************************** Others **************************/
+
+XPUVersion get_xpu_version(int dev_id) {
+  uint64_t v = 0;
+  PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id));
+
+  if (v == K100 || v == K200) {
+    VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n";
+    return XPU1;
+  } else {
+    VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n";
+    return XPU2;
+  }
+}
+
+}  // namespace xpu
+}  // namespace backends
+}  // namespace pten
diff --git a/paddle/pten/backends/xpu/xpu_info.h b/paddle/pten/backends/xpu/xpu_info.h
new file mode 100644
index 0000000000000000000000000000000000000000..8cf836ba16dc6a4ff1e5408bb92b8e60758895b1
--- /dev/null
+++ b/paddle/pten/backends/xpu/xpu_info.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/pten/common/place.h"
+
+namespace pten {
+
+class XPUContext;
+
+namespace backends {
+namespace xpu {
+
+/***** Version Management *****/
+
+//! Get the version of XPU Driver
+int GetDriverVersion();
+
+//! Get the version of XPU Runtime
+int GetRuntimeVersion();
+
+/***** Device Management *****/
+
+//! Get the total number of XPU devices in system.
+int GetXPUDeviceCount();
+
+//! Set the XPU device id for next execution.
+void SetXPUDeviceId(int device_id);
+
+//! Get the current XPU device id in system.
+int GetXPUCurrentDeviceId();
+
+//! Get a list of device ids from environment variable or use all.
+std::vector<int> GetXPUSelectedDevices();
+
+/***** Memory Management *****/
+
+//! Copy memory from address src to dst synchronously.
+void MemcpySyncH2D(void *dst,
+                   const void *src,
+                   size_t count,
+                   const pten::XPUPlace &dst_place);
+void MemcpySyncD2H(void *dst,
+                   const void *src,
+                   size_t count,
+                   const pten::XPUPlace &src_place,
+                   const pten::XPUContext &dev_ctx);
+void MemcpySyncD2D(void *dst,
+                   const pten::XPUPlace &dst_place,
+                   const void *src,
+                   const pten::XPUPlace &src_place,
+                   size_t count,
+                   const pten::XPUContext &dev_ctx);
+
+class XPUDeviceGuard {
+ public:
+  explicit inline XPUDeviceGuard(int dev_id) {
+    int prev_id = GetXPUCurrentDeviceId();
+    if (prev_id != dev_id) {
+      prev_id_ = prev_id;
+      SetXPUDeviceId(dev_id);
+    }
+  }
+
+  inline ~XPUDeviceGuard() {
+    if (prev_id_ != -1) {
+      SetXPUDeviceId(prev_id_);
+    }
+  }
+
+  XPUDeviceGuard(const XPUDeviceGuard &o) = delete;
+  XPUDeviceGuard &operator=(const XPUDeviceGuard &o) = delete;
+
+ private:
+  int prev_id_{-1};
+};
+
+enum XPUVersion { XPU1, XPU2 };
+XPUVersion get_xpu_version(int dev_id);
+
+}  // namespace xpu
+}  // namespace backends
+}  // namespace pten
diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt
index b281f95f36bbd9100987a7d92c03822131ff0200..0c5437ff6d07abf2f5b4536fd314455839807e00 100644
--- a/paddle/pten/core/CMakeLists.txt
+++ b/paddle/pten/core/CMakeLists.txt
@@ -9,23 +9,26 @@ else()
   cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place)
 endif()
 
-cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce convert_utils)
-cc_library(kernel_context SRCS kernel_context.cc DEPS enforce pten_context)
+cc_library(errors SRCS errors.cc)
+set(pten_enforce_deps errors flags)
+cc_library(pten_enforce INTERFACE SRCS enforce.cc DEPS ${pten_enforce_deps})
 
-cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS enforce)
-cc_library(tensor_meta SRCS tensor_meta.cc DEPS enforce mixed_vector)
-cc_library(lod_utils SRCS lod_utils.cc DEPS enforce mixed_vector)
-cc_library(dense_tensor SRCS dense_tensor.cc DEPS convert_utils tensor_meta tensor_base)
+cc_library(kernel_factory SRCS kernel_factory.cc DEPS pten_enforce convert_utils)
+cc_library(kernel_context SRCS kernel_context.cc DEPS pten_enforce pten_context)
+
+cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS pten_enforce)
+cc_library(tensor_meta SRCS tensor_meta.cc DEPS pten_enforce mixed_vector)
+cc_library(lod_utils SRCS lod_utils.cc DEPS pten_enforce mixed_vector)
+cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS convert_utils tensor_meta tensor_base)
 
 cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base )
 
 cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor)
 cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor)
+cc_library(selected_rows SRCS selected_rows.cc DEPS dense_tensor mixed_vector pten_enforce ddim)
 
 cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
 
 cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
-cc_library(selected_rows SRCS selected_rows.cc DEPS dense_tensor mixed_vector enforce ddim)
-
 # Will remove once we implemented MKLDNN_Tensor
 if(WITH_MKLDNN)
   add_dependencies(dense_tensor mkldnn)
diff --git a/paddle/pten/core/compat/CMakeLists.txt b/paddle/pten/core/compat/CMakeLists.txt
index 253f60daf1f890caccdeb02908c1b4fb3d6c62da..0c081edb81ccf740ed74c377070f4847650a1ff2 100644
--- a/paddle/pten/core/compat/CMakeLists.txt
+++ b/paddle/pten/core/compat/CMakeLists.txt
@@ -1 +1,2 @@
-cc_library(arg_map_context SRCS arg_map_context.cc DEPS enforce)
+cc_library(arg_map_context SRCS arg_map_context.cc DEPS pten_enforce)
+cc_library(op_utils SRCS op_utils.cc DEPS arg_map_context enforce convert_utils)
diff --git a/paddle/pten/core/compat/arg_map_context.cc b/paddle/pten/core/compat/arg_map_context.cc
index 3914a8a684eda937cf54283f72a04bec67cf64af..73fa0b300cf96cb653129148ea86883a8536ebe0 100644
--- a/paddle/pten/core/compat/arg_map_context.cc
+++ b/paddle/pten/core/compat/arg_map_context.cc
@@ -15,9 +15,9 @@ limitations under the License. */
 #include "paddle/pten/core/compat/arg_map_context.h"
 
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/pten/core/enforce.h"
 
 namespace pten {
-
 std::ostream& operator<<(std::ostream& os, KernelSignature signature) {
   os << "Kernel Signature - name: " << signature.name << "; inputs: "
      << paddle::string::join_strings(std::get<0>(signature.args), ", ")
diff --git a/paddle/pten/core/compat/op_utils.cc b/paddle/pten/core/compat/op_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..12c2d74737d5d988c3708b2d47f9cad2a0d78e08
--- /dev/null
+++ b/paddle/pten/core/compat/op_utils.cc
@@ -0,0 +1,29 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/compat/op_utils.h"
+
+namespace pten {
+
+DefaultKernelSignatureMap& DefaultKernelSignatureMap::Instance() {
+  static DefaultKernelSignatureMap g_default_kernel_sig_map;
+  return g_default_kernel_sig_map;
+}
+
+OpUtilsMap& OpUtilsMap::Instance() {
+  static OpUtilsMap g_op_utils_map;
+  return g_op_utils_map;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/core/compat/op_utils.h b/paddle/pten/core/compat/op_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..505ef13891aeea1454c8807cfee59223ce49cbab
--- /dev/null
+++ b/paddle/pten/core/compat/op_utils.h
@@ -0,0 +1,166 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+
+#include "paddle/pten/core/compat/arg_map_context.h"
+#include "paddle/pten/core/infermeta_utils.h"
+#include "paddle/pten/core/kernel_def.h"
+#include "paddle/pten/core/macros.h"
+#include "paddle/utils/flat_hash_map.h"
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace pten {
+
+class DefaultKernelSignatureMap {
+ public:
+  static DefaultKernelSignatureMap& Instance();
+
+  bool Has(const std::string& op_type) const { return map_.count(op_type) > 0; }
+
+  const KernelSignature& Get(const std::string& op_type) const {
+    auto it = map_.find(op_type);
+    PADDLE_ENFORCE_NE(
+        it,
+        map_.end(),
+        paddle::platform::errors::NotFound(
+            "Operator `%s`'s kernel signature is not registered.", op_type));
+    return it->second;
+  }
+
+  void Insert(std::string op_type, KernelSignature signature) {
+    PADDLE_ENFORCE_NE(
+        Has(op_type),
+        true,
+        paddle::platform::errors::AlreadyExists(
+            "Operator (%s)'s Kernel Signature has been registered.", op_type));
+    map_.insert({std::move(op_type), std::move(signature)});
+  }
+
+ private:
+  DefaultKernelSignatureMap() = default;
+
+  paddle::flat_hash_map<std::string, KernelSignature> map_;
+
+  DISABLE_COPY_AND_ASSIGN(DefaultKernelSignatureMap);
+};
+
+class OpUtilsMap {
+ public:
+  static OpUtilsMap& Instance();
+
+  bool Contains(const std::string& op_type) const {
+    return name_map_.count(op_type) || arg_mapping_fn_map_.count(op_type);
+  }
+
+  void InsertApiName(std::string op_type, std::string api_name) {
+    PADDLE_ENFORCE_EQ(
+        name_map_.count(op_type),
+        0UL,
+        paddle::platform::errors::AlreadyExists(
+            "Operator (%s)'s api name has been registered.", op_type));
+    name_map_.insert({std::move(op_type), std::move(api_name)});
+  }
+
+  void InsertArgumentMappingFn(std::string op_type, ArgumentMappingFn fn) {
+    PADDLE_ENFORCE_EQ(
+        arg_mapping_fn_map_.count(op_type),
+        0UL,
+        paddle::platform::errors::AlreadyExists(
+            "Operator (%s)'s argument mapping function has been registered.",
+            op_type));
+    arg_mapping_fn_map_.insert({std::move(op_type), std::move(fn)});
+  }
+
+  std::string GetApiName(const std::string& op_type) const {
+    auto it = name_map_.find(op_type);
+    if (it == name_map_.end()) {
+      return "deprecated";
+    } else {
+      return it->second;
+    }
+  }
+
+  ArgumentMappingFn GetArgumentMappingFn(const std::string& op_type) const {
+    auto it = arg_mapping_fn_map_.find(op_type);
+    if (it == arg_mapping_fn_map_.end()) {
+      auto func =
+          [op_type](const ArgumentMappingContext& ctx) -> KernelSignature {
+        return DefaultKernelSignatureMap::Instance().Get(op_type);
+      };
+      return func;
+    } else {
+      return it->second;
+    }
+  }
+
+ private:
+  OpUtilsMap() = default;
+
+  paddle::flat_hash_map<std::string, std::string> name_map_;
+  paddle::flat_hash_map<std::string, ArgumentMappingFn> arg_mapping_fn_map_;
+
+  DISABLE_COPY_AND_ASSIGN(OpUtilsMap);
+};
+
+struct ApiNameRegistrar {
+  ApiNameRegistrar(const char* op_type, const char* api_name) {
+    OpUtilsMap::Instance().InsertApiName(op_type, api_name);
+  }
+};
+
+struct ArgumentMappingFnRegistrar {
+  ArgumentMappingFnRegistrar(const char* op_type,
+                             ArgumentMappingFn arg_mapping_fn) {
+    OpUtilsMap::Instance().InsertArgumentMappingFn(op_type,
+                                                   std::move(arg_mapping_fn));
+  }
+};
+
+#define PT_REGISTER_API_NAME(op_type, api_name)                             \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                        \
+      pt_register_api_name_ns_check_##op_type,                              \
+      "PT_REGISTER_API_NAME must be called in global namespace.");          \
+  static const ::pten::ApiNameRegistrar __registrar_api_name_for_##op_type( \
+      #op_type, #api_name);                                                 \
+  int TouchApiNameSymbol_##op_type() { return 0; }
+
+#define PT_DECLARE_API_NAME(op_type)                                        \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                        \
+      pt_declare_api_name_ns_check_##op_type,                               \
+      "PT_DECLARE_API_NAME must be called in global namespace.");           \
+  extern int TouchApiNameSymbol_##op_type();                                \
+  UNUSED static int __declare_api_name_symbol_for_##op_type =               \
+      TouchApiNameSymbol_##op_type()
+
+#define PT_REGISTER_ARG_MAPPING_FN(op_type, arg_mapping_fn)                 \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                        \
+      pt_register_arg_map_fn_ns_check_##op_type,                            \
+      "PT_REGISTER_ARG_MAPPING_FN must be called in global namespace.");    \
+  static const ::pten::ArgumentMappingFnRegistrar                           \
+      __registrar_arg_map_fn_for_##op_type(#op_type, arg_mapping_fn);       \
+  int TouchArgumentMappingFnSymbol_##op_type() { return 0; }
+
+#define PT_DECLARE_ARG_MAPPING_FN(op_type)                                  \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                        \
+      pt_declare_arg_map_fn_ns_check_##op_type,                             \
+      "PT_DECLARE_ARG_MAPPING_FN must be called in global namespace.");     \
+  extern int TouchArgumentMappingFnSymbol_##op_type();                      \
+  UNUSED static int __declare_arg_map_fn_symbol_for_##op_type =             \
+      TouchArgumentMappingFnSymbol_##op_type()
+
+}  // namespace pten
diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc
index b1a5015f010c2002b8e5dbb6fc9eac1269224ad1..15f9f0bda3c25e2b8a4125d1025d8b0a673f2dc5 100644
--- a/paddle/pten/core/dense_tensor.cc
+++ b/paddle/pten/core/dense_tensor.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -22,14 +22,6 @@ limitations under the License. */
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/core/convert_utils.h"
 
-namespace paddle {
-namespace framework {
-extern void TensorCopy(const pten::DenseTensor& src,
-                       const paddle::platform::Place& dst_place,
-                       pten::DenseTensor* dst);
-}
-}
-
 namespace pten {
 
 DenseTensor::DenseTensor(Allocator* a, const DenseTensorMeta& meta)
@@ -126,6 +118,19 @@
 void DenseTensor::set_meta(DenseTensorMeta&& meta) {
   meta_ = std::move(meta);
 }
 
+void DenseTensor::set_meta(const DenseTensorMeta& meta) {
+  PADDLE_ENFORCE(
+      meta.valid(),
+      paddle::platform::errors::InvalidArgument(
+          "Input meta is invalid, please check the meta attribute."));
+  meta_.dims = meta.dims;
+  meta_.dtype = meta.dtype;
+  meta_.is_scalar = meta.is_scalar;
+  meta_.layout = meta.layout;
+  meta_.lod = meta.lod;
+  meta_.offset = meta.offset;
+}
+
 /* @jim19930609: This interface will be further modified until we finalize the
    design for Allocator - Allocation
    For now, we have to temporarily accommodate two independent use cases:
@@ -167,370 +172,4 @@ DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128);
 
 #undef DATA_MEMBER_FUNC_INSTANTIATION
 
-/* --------------------------- */
-/*   From framework::Tensor    */
-/* --------------------------- */
-DenseTensor::DenseTensor() {
-  inplace_version_counter_ = std::make_shared<TensorInplaceVersion>(0);
-  meta_.dtype = paddle::experimental::DataType::FLOAT32;
-  meta_.offset = 0;
-}
-
-DenseTensor::DenseTensor(paddle::framework::proto::VarType::Type dtype) {
-  inplace_version_counter_ = std::make_shared<TensorInplaceVersion>(0);
-  meta_.dtype = TransToPtenDataType(dtype);
-  meta_.offset = 0;
-}
-
-size_t DenseTensor::memory_size() const {
-  return holder_ == nullptr ? 0UL : holder_->size() - meta_.offset;
-}
-
-void DenseTensor::check_memory_size() const {
-  PADDLE_ENFORCE_NOT_NULL(holder_,
-                          paddle::platform::errors::PreconditionNotMet(
-                              "Tensor holds no memory. "
-                              "Call Tensor::mutable_data firstly."));
-  PADDLE_ENFORCE_LE(
-      numel() * SizeOf(dtype()),
-      memory_size(),
-      paddle::platform::errors::PreconditionNotMet(
-          "Tensor's dimension is out of bound. "
-          "Tensor's dimension must be equal or less than the size of its "
-          "memory. "
-          "But received Tensor's dimension is %d, memory's size is %d.",
-          numel() * SizeOf(dtype()),
-          memory_size()));
-}
-
-const paddle::platform::Place& DenseTensor::place() const {
-  PADDLE_ENFORCE_NOT_NULL(
-      holder_,
-      paddle::platform::errors::PreconditionNotMet(
-          "Tensor not initialized yet when DenseTensor::place() is called."));
-  return holder_->place();
-}
-
-paddle::framework::proto::VarType::Type DenseTensor::type() const {
-  return TransToProtoVarType(meta_.dtype);
-}
-
-paddle::framework::proto::VarType::Type DenseTensor::saved_type() const {
-  return TransToProtoVarType(meta_.dtype);
-}
-
-void DenseTensor::set_layout(const paddle::framework::DataLayout layout) {
-  meta_.layout = layout;
-}
-
-void DenseTensor::ResetHolder(const std::shared_ptr& holder) {
-  PADDLE_ENFORCE_EQ(
-      meta_.offset,
-      0,
-      paddle::platform::errors::Fatal(
-          "Only the offset is supported to zero when the holder is reset."));
-
-  if (holder_) {
-    PADDLE_ENFORCE_LE(
-        numel() * SizeOf(dtype()) + meta_.offset,
-        holder->size(),
-        paddle::platform::errors::InvalidArgument(
-            "The size of Holder is not enough to store the Tensor."));
-  }
-  holder_ = holder;
-}
-
-void DenseTensor::ResetHolderWithType(
-    const std::shared_ptr& holder,
-    paddle::framework::proto::VarType::Type type) {
-  set_type(type);
-  ResetHolder(holder);
-}
-
-void DenseTensor::set_type(paddle::framework::proto::VarType::Type type) {
-  meta_.dtype = TransToPtenDataType(type);
-}
-
-void* DenseTensor::mutable_data(const paddle::platform::Place& place,
-                                paddle::framework::proto::VarType::Type type,
-                                size_t requested_size) {
-  set_type(type);
-  PADDLE_ENFORCE_GE(
-      numel(),
-      0,
-      paddle::platform::errors::PreconditionNotMet(
-          "The Tensor's element number must be equal or greater than zero. "
-          "The Tensor's shape is [",
-          dims(),
-          "] now"));
-  size_t size = numel() * SizeOf(dtype());
-  if (requested_size && (requested_size > size)) {
-    size = requested_size;
-  }
-
-  /* some versions of boost::variant don't have operator!= */
-  if (holder_ == nullptr || !(holder_->place() == place) ||
-      holder_->size() < size + meta_.offset) {
-    holder_.reset();
-    holder_ = paddle::memory::AllocShared(place, size);
-    meta_.offset = 0;
-  }
-  return reinterpret_cast<void*>(
-      reinterpret_cast<uintptr_t>(holder_->ptr()) + meta_.offset);
-}
-
-void* DenseTensor::mutable_data(const paddle::platform::Place& place,
-                                size_t requested_size) {
-  return mutable_data(place, type(), requested_size);
-}
-
-void* DenseTensor::mutable_data(const paddle::platform::Place& place,
-                                paddle::framework::proto::VarType::Type type,
-                                const paddle::platform::Stream& stream) {
-  set_type(type);
-  PADDLE_ENFORCE_GE(
-      numel(),
-      0,
-      paddle::platform::errors::PreconditionNotMet(
-          "The Tensor's element number must be equal or greater than zero. "
-          "The Tensor's shape is [",
-          dims(),
-          "] now"));
-  size_t size = numel() * SizeOf(dtype());
-
-  /* some versions of boost::variant don't have operator!= */
-  if (holder_ == nullptr || !(holder_->place() == place) ||
-      holder_->size() < size + meta_.offset ||
-      !(paddle::platform::is_gpu_place(place) &&
-        paddle::memory::InSameStream(holder_, stream))) {
-    holder_.reset();
-    holder_ = paddle::memory::AllocShared(place, size, stream);
-    meta_.offset = 0;
-  }
-  return reinterpret_cast<void*>(
-      reinterpret_cast<uintptr_t>(holder_->ptr()) + meta_.offset);
-}
-
-/* @jim19930609: The following "mutable_data" only supports specific dtypes
-   defined in OpProto. This part needs another clean up once the data type
-   across Fluid and Pten gets unified.
-   */
-template <typename T>
-inline T* DenseTensor::mutable_data(const DDim& dims,
-                                    const paddle::platform::Place& place,
-                                    size_t requested_size) {
-  static_assert(std::is_pod<T>::value, "T must be POD");
-  meta_.dims = dims;
-  return mutable_data<T>(place, requested_size);
-}
-
-template <typename T>
-inline T* DenseTensor::mutable_data(const paddle::platform::Place& place,
-                                    size_t requested_size) {
-  static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(
-      place, paddle::framework::DataTypeTrait<T>::DataType(), requested_size));
-}
-
-void DenseTensor::ShareBufferWith(const DenseTensor& tensor) {
-  holder_ = tensor.holder_;
-  meta_.offset = tensor.meta().offset;
-  meta_.dtype = tensor.dtype();
-}
-
-#define LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(dtype) \
-  template dtype* DenseTensor::mutable_data(         \
-      const DDim& dims,                              \
-      const paddle::platform::Place& place,          \
-      size_t requested_size);                        \
-  template dtype* DenseTensor::mutable_data(         \
-      const paddle::platform::Place& place, size_t requested_size);
-
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(bool)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int8_t)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(uint8_t)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int16_t)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int32_t)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int64_t)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(float)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(double)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::bfloat16)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::float16)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex64)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128)
-
-#undef LEGACY_DATA_MEMBER_FUNC_INSTANTIATION
-
-/* ------------------------------ */
-/*   From framework::LoDTensor    */
-/* ------------------------------ */
-
-DenseTensor::DenseTensor(intrusive_ptr<Storage> storage,
-                         const DenseTensorMeta& meta)
-    : meta_(meta), holder_(storage->move_data_shared()) {}
-
-DenseTensor::DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta)
-    : meta_(std::move(meta)), holder_(storage->move_data_shared()) {}
-
-DenseTensor::DenseTensor(const LoD& lod) : DenseTensor() { meta_.lod = lod; }
-
-void DenseTensor::set_lod(const LoD& lod) { meta_.lod = lod; }
-
-LoD* DenseTensor::mutable_lod() { return &meta_.lod; }
-
-std::pair<size_t, size_t> DenseTensor::lod_element(size_t level,
-                                                   size_t elem) const {
-  PADDLE_ENFORCE_LT(
-      level,
-      NumLevels(),
-      paddle::platform::errors::InvalidArgument(
-          "The input level of LoD is invalid, it should be less than LoD "
-          "size. The input level is %zu, the LoD size is %zu.",
-          level,
-          NumLevels()));
-
-  PADDLE_ENFORCE_LT(elem,
-                    NumElements(level),
-                    paddle::platform::errors::InvalidArgument(
-                        "The input element of LoD is invalid, it should be "
-                        "less than the number of elements in its level. "
-                        "The input element is %zu, the number of elements in "
-                        "its level is %zu.",
-                        elem,
-                        NumElements(level)));
-
-  return std::make_pair((meta_.lod)[level][elem], (meta_.lod)[level][elem + 1]);
-}
-
-size_t DenseTensor::NumLevels() const { return meta_.lod.size(); }
-
-size_t DenseTensor::NumElements(size_t level) const {
-  PADDLE_ENFORCE_LT(
-      level,
-      NumLevels(),
-      paddle::platform::errors::InvalidArgument(
-          "The input level of LoD is invalid, it should be less than LoD "
-          "size. The input level is %zu, the LoD size is %zu.",
-          level,
-          NumLevels()));
-
-  // the last offset is the end of last element
-  return (meta_.lod)[level].size() - 1;
-}
-
-DenseTensor& DenseTensor::Resize(const DDim& dims) {
-  meta_.dims = dims;
-  return *this;
-}
-
-DenseTensor DenseTensor::Slice(int64_t begin_idx, int64_t end_idx) const {
-  check_memory_size();
-  PADDLE_ENFORCE_GE(begin_idx,
-                    0,
-                    paddle::platform::errors::OutOfRange(
-                        "The start row index must be greater than or equal "
-                        "to 0. But received the start index is %d.",
-                        begin_idx));
-  PADDLE_ENFORCE_LE(end_idx,
-                    meta_.dims[0],
-                    paddle::platform::errors::OutOfRange(
-                        "The end row index is out of bound."));
-  PADDLE_ENFORCE_LT(
-      begin_idx,
-      end_idx,
-      paddle::platform::errors::InvalidArgument(
-          "The start row index must be less than the end row index. "
-          "But received the start index = %d, the end index = %d.",
-          begin_idx,
-          end_idx));
-
-  if (meta_.dims[0] == 1) {
-    return *this;
-  } else {
-    size_t base = numel() / meta_.dims[0];
-    DenseTensor dst;
-    dst.holder_ = holder_;
-    dst.set_layout(meta_.layout);
-    dst.meta_.dtype = meta_.dtype;
-    DDim dst_dims = meta_.dims;
-    dst_dims[0] = end_idx - begin_idx;
-    dst.Resize(dst_dims);
-    dst.meta_.offset = meta_.offset + begin_idx * base * SizeOf(dtype());
-    return dst;
-  }
-}
-
-std::vector<DenseTensor> DenseTensor::Split(int64_t split_size,
-                                            int64_t axis) const {
-  check_memory_size();
-
-  PADDLE_ENFORCE_GE(meta_.dims.size(),
-                    0,
-                    paddle::platform::errors::OutOfRange(
-                        "split expects at least a 1-dimensional tensor"));
-
-  PADDLE_ENFORCE_GE(
-      split_size,
-      0,
-      paddle::platform::errors::OutOfRange(
-          "split expects split_size be non-negative, but got split_size is %d",
-          split_size));
-
-  int64_t numel_size = meta_.dims[axis];
-
-  int64_t num_splits = 1;
-  if (split_size != 0) {
-    num_splits =
-        std::max<int64_t>((numel_size + split_size - 1) / split_size, 1);
-  }
-
-  std::vector<DenseTensor> splits(num_splits);
-  int64_t last_split_size = split_size - (split_size * num_splits - numel_size);
-
-  for (int64_t i = 0; i < num_splits; ++i) {
-    int64_t length = i < num_splits - 1 ? split_size : last_split_size;
-    splits[i] = Slice(i * split_size, i * split_size + length);
-  }
-  return splits;
-}
-
-std::vector<DenseTensor> DenseTensor::Chunk(int64_t chunks,
-                                            int64_t axis) const {
-  check_memory_size();
-  PADDLE_ENFORCE_GE(meta_.dims.size(),
-                    0,
-                    paddle::platform::errors::OutOfRange(
-                        "split expects at least a 1-dimensional tensor"));
-  PADDLE_ENFORCE_GE(
-      chunks,
-      0,
-      paddle::platform::errors::OutOfRange(
-          "chunks expects to be greater than 0, but got chunks is %d", chunks));
-
-  int64_t numel_size = meta_.dims[axis];
-  int64_t split_size = (numel_size + chunks - 1) / chunks;
-  return Split(split_size, axis);
-}
-
-DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) {
-  src.check_memory_size();
-  // Preserve LoD
-  auto lod = meta_.lod;
-  *this = src;
-  meta_.lod = lod;
-  return *this;
-}
-
-DenseTensor& DenseTensor::ShareInplaceVersionCounterWith(
-    const DenseTensor& src) {
-  PADDLE_ENFORCE_NOT_NULL(
-      inplace_version_counter_,
-      paddle::platform::errors::PreconditionNotMet(
-          "Tensor does not hold inplace_version_counter_."));
-
-  inplace_version_counter_ = src.inplace_version_counter_;
-  return *this;
-}
-
 }  // namespace pten
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h
index 88c459e6d87eaee4cd52111c42458868698eda43..2823441f97da2a784d6fb175429a0496e50d6aaa 100644
--- a/paddle/pten/core/dense_tensor.h
+++ b/paddle/pten/core/dense_tensor.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -33,25 +33,6 @@ namespace pten {
 
 class CompatibleDenseTensorUtils;
 
-/* --------------------------- */
-/*   From framework::Tensor    */
-/* --------------------------- */
-/* Temporarily put TensorInplaceVersion inside DenseTensor.
-   Will move to AutogradMeta as soon as we switch to Eager Dygraph.
-   */
-class TensorInplaceVersion {
- public:
-  explicit TensorInplaceVersion(uint32_t inplace_version = 0)
-      : inplace_version_(inplace_version) {}
-  bool IsUnique() const { return inplace_version_ == 0; }
-  void Bump() { ++inplace_version_; }
-  uint32_t CurrentVersion() const { return inplace_version_; }
-  void SetInplaceVersionToZero() { inplace_version_ = 0; }
-
- private:
-  uint32_t inplace_version_;
-};
-
 /// \brief The Dense tensor store values in a contiguous sequential block
 /// of memory where all values are represented. Tensors or multi-dimensional
 /// arrays are used in math operators.
@@ -90,6 +71,8 @@ class DenseTensor : public TensorBase,
 
   DenseTensor& operator=(DenseTensor&& other);
 
+  DenseTensor();
+
   /// \brief Destroy the tensor object and release exclusive resources.
   virtual ~DenseTensor() = default;
 
@@ -131,6 +114,8 @@ class DenseTensor : public TensorBase,
   /// \param meta The meta information of the tensor.
   void set_meta(DenseTensorMeta&& meta);
 
+  void set_meta(const DenseTensorMeta& meta);
+
   /// \brief Test whether the metadata is valid.
   /// \return Whether the metadata is valid.
   bool valid() const noexcept override { return meta_.valid(); }
@@ -177,181 +162,6 @@ class DenseTensor : public TensorBase,
   DenseTensorMeta meta_;
   std::shared_ptr<paddle::memory::Allocation> holder_;
 
-  /* --------------------------- */
-  /*   From framework::Tensor    */
-  /* --------------------------- */
-  /* The following members & interfaces were copied from framework::Tensor,
-     so as to facilitate the unification of different Tensors
-
-     Will be adjusted/removed/moved in the near future
-   */
- public:
-  /* @jim19930609: The way default constructor handles allocator might change,
-     according to the final design of Allocation - Allocator.
-   */
-  DenseTensor();
-
-  /* @jim19930609: Remove dependency on protobuf after Tensor Unification.
-   */
-  explicit DenseTensor(paddle::framework::proto::VarType::Type dtype);
-
-  /// \brief Use existing storage space to create dense tensor. This interface
-  /// can be used to deliberately create an uninitialized dense tensor.
-  /// \param storage The existing storage.
-  /// \param meta The meta data of dense tensor.
-  DenseTensor(intrusive_ptr<Storage> storage, const DenseTensorMeta& meta);
-
-  /// \brief Use existing storage space to create dense tensor. This interface
-  /// can be used to deliberately create an uninitialized dense tensor.
-  /// \param storage The existing storage.
-  /// \param meta The meta data of dense tensor.
-  DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta);
-
-  inline bool IsInitialized() const { return holder_ != nullptr; }
-
-  template <typename T>
-  T* data();
-
-  void* data();
-
-  template <typename T>
-  T* mutable_data(const paddle::platform::Place& place,
-                  size_t requested_size = 0);
-
-  template <typename T>
-  T* mutable_data(const DDim& dims,
-                  const paddle::platform::Place& place,
-                  size_t requested_size = 0);
-
-  void* mutable_data(const paddle::platform::Place& place,
-                     paddle::framework::proto::VarType::Type type,
-                     size_t requested_size = 0);
-
-  void* mutable_data(const paddle::platform::Place& place,
-                     size_t requested_size = 0);
-
-  void* mutable_data(const paddle::platform::Place& place,
-                     paddle::framework::proto::VarType::Type type,
-                     const paddle::platform::Stream& stream);
-
-  /* @jim19930609: Remove dependency on protobuf after Tensor Unification.
-   */
-  paddle::framework::proto::VarType::Type type() const;
-
-  /* @jim19930609: Remove dependency on protobuf after Tensor Unification.
-   */
-  paddle::framework::proto::VarType::Type saved_type() const;
-
-  // memory size returns the holding memory size in byte.
-  size_t memory_size() const;
-
-  void check_memory_size() const;
-
-  void set_layout(const paddle::framework::DataLayout layout);
-
-  void clear() {
-    holder_.reset();
-    meta_.offset = 0;
-  }
-
-  void ShareBufferWith(const DenseTensor& tensor);
-
-  void ShareDataTypeWith(const DenseTensor& tensor) {
-    meta_.dtype = tensor.meta().dtype;
-  }
-
-  bool IsSharedBufferWith(const DenseTensor& src) const {
-    return holder_ && holder_ == src.Holder();
-  }
-
-  const std::shared_ptr<paddle::memory::Allocation>& Holder() const {
-    return holder_;
-  }
-
-  void set_offset(size_t offset) { meta_.offset = offset; }
-  size_t offset() const { return meta_.offset; }
-
-  std::shared_ptr<paddle::memory::Allocation> MoveMemoryHolder() {
-    return std::move(holder_);
-  }
-
-  void ResetHolder(const std::shared_ptr<paddle::memory::Allocation>& holder);
-
-  void ResetHolderWithType(
-      const std::shared_ptr<paddle::memory::Allocation>& holder,
-      paddle::framework::proto::VarType::Type type);
-
-  void set_type(paddle::framework::proto::VarType::Type type);
-
-  TensorInplaceVersion& InplaceVersionCounter() {
-    return *inplace_version_counter_;
-  }
-
-  /*! The internal of two tensors share the same memory block. */
-  DenseTensor& ShareDataWith(const DenseTensor& src);
-
-  /*! The internal of two tensors share the same inplace version counter. */
-  DenseTensor& ShareInplaceVersionCounterWith(const DenseTensor& src);
-
-  DenseTensor Slice(int64_t begin_idx, int64_t end_idx) const;
-
-  std::vector<DenseTensor> Split(int64_t split_size, int64_t axis) const;
-
-  std::vector<DenseTensor> Chunk(int64_t chunks, int64_t axis) const;
-
- protected:
-  std::shared_ptr<TensorInplaceVersion> inplace_version_counter_;
-
-/* @jim19930609: This is a hack
-   In general, it is badly designed to fuse MKLDNN-specific objects into a
-   generic Tensor.
-   We temporarily leave them here to unblock Tensor Unification progress.
-   In the final state, we should come up with a MKLDNN_Tensor and move the
-   following codes there.
-   */
-#ifdef PADDLE_WITH_MKLDNN
-
- public:
-  inline dnnl::memory::format_tag format() const { return format_; }
-
-  inline void set_format(const dnnl::memory::format_tag format) {
-    format_ = format;
-  }
-
- protected:
-  /**
-   * @brief the detail format of memory block which have layout as kMKLDNN
-   *
-   * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
-   *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
-   *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
-   *       this field.
-   */
-
-  dnnl::memory::format_tag format_ = dnnl::memory::format_tag::undef;
-#endif
-
-  /* ------------------------------ */
-  /*   From framework::LoDTensor    */
-  /* ------------------------------ */
-  /* The following members & interfaces were copied from framework::Tensor,
-     so as to facilitate the unification of different Tensors
-
-     Will be adjusted/removed/moved in the near future
-   */
- public:
-  explicit DenseTensor(const LoD& lod);
-
-  void set_lod(const LoD& lod);
-
-  LoD* mutable_lod();
-
-  /*
-   * Get the start offset and end offset of an element from LoD.
-   */
-  std::pair<size_t, size_t> lod_element(size_t level, size_t elem) const;
-
-  size_t NumLevels() const;
-
-  size_t NumElements(size_t level = 0) const;
+#include "paddle/pten/core/dense_tensor.inl"
 };
-
 }  // namespace pten
diff --git a/paddle/pten/core/dense_tensor.inl b/paddle/pten/core/dense_tensor.inl
new file mode 100644
index 0000000000000000000000000000000000000000..754baeb73c90c2b494bf774588219c877a2fb8e9
--- /dev/null
+++ b/paddle/pten/core/dense_tensor.inl
@@ -0,0 +1,197 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/* --------------------------- */
+/*   From framework::Tensor    */
+/* --------------------------- */
+/* The following members & interfaces were copied from framework::Tensor,
+   so as to facilitate the unification of different Tensors
+
+   Will be adjusted/removed/moved in the near future
+*/
+public:
+/* Temporarily put InplaceVersion inside DenseTensor.
+Will move to AutogradMeta as soon as we switch to Eager Dygraph.
+*/
+class InplaceVersion {
+public:
+  bool IsUnique() const { return inplace_version_ == 0; }
+  void Bump() { ++inplace_version_; }
+  uint32_t CurrentVersion() const { return inplace_version_; }
+  void SetInplaceVersionToZero() { inplace_version_ = 0; }
+
+private:
+  uint32_t inplace_version_{0};
+};
+
+/* @jim19930609: Remove dependency on protobuf after Tensor Unification.
+*/
+explicit DenseTensor(paddle::framework::proto::VarType::Type dtype);
+
+/// \brief Use existing storage space to create dense tensor. This interface
+/// can be used to deliberately create an uninitialized dense tensor.
+/// \param storage The existing storage.
+/// \param meta The meta data of dense tensor.
+DenseTensor(intrusive_ptr<Storage> storage, const DenseTensorMeta& meta);
+
+/// \brief Use existing storage space to create dense tensor. This interface
+/// can be used to deliberately create an uninitialized dense tensor.
+/// \param storage The existing storage.
+/// \param meta The meta data of dense tensor.
+DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta);
+
+inline bool IsInitialized() const { return holder_ != nullptr; }
+
+template <typename T>
+T* data();
+
+void* data();
+
+template <typename T>
+T* mutable_data(const paddle::platform::Place& place,
+                size_t requested_size = 0);
+
+template <typename T>
+T* mutable_data(const DDim& dims,
+                const paddle::platform::Place& place,
+                size_t requested_size = 0);
+
+void* mutable_data(const paddle::platform::Place& place,
+                   paddle::framework::proto::VarType::Type type,
+                   size_t requested_size = 0);
+
+void* mutable_data(const paddle::platform::Place& place,
+                   size_t requested_size = 0);
+
+void* mutable_data(const paddle::platform::Place& place,
+                   paddle::framework::proto::VarType::Type type,
+                   const paddle::platform::Stream& stream);
+
+/* @jim19930609: Remove dependency on protobuf after Tensor Unification.
+*/
+paddle::framework::proto::VarType::Type type() const;
+
+/* @jim19930609: Remove dependency on protobuf after Tensor Unification.
+*/
+paddle::framework::proto::VarType::Type saved_type() const;
+
+// memory size returns the holding memory size in byte.
+size_t memory_size() const;
+
+void check_memory_size() const;
+
+void set_layout(const paddle::framework::DataLayout layout);
+
+void clear() {
+  holder_.reset();
+  meta_.offset = 0;
+}
+
+void ShareBufferWith(const DenseTensor& tensor);
+
+void ShareDataTypeWith(const DenseTensor& tensor) {
+  meta_.dtype = tensor.meta().dtype;
+}
+
+bool IsSharedBufferWith(const DenseTensor& src) const {
+  return holder_ && holder_ == src.Holder();
+}
+
+const std::shared_ptr<paddle::memory::Allocation>& Holder() const {
+  return holder_;
+}
+
+void set_offset(size_t offset) { meta_.offset = offset; }
+size_t offset() const { return meta_.offset; }
+
+std::shared_ptr<paddle::memory::Allocation> MoveMemoryHolder() {
+  return std::move(holder_);
+}
+
+void ResetHolder(const std::shared_ptr<paddle::memory::Allocation>& holder);
+
+void ResetHolderWithType(
+    const std::shared_ptr<paddle::memory::Allocation>& holder,
+    paddle::framework::proto::VarType::Type type);
+
+void set_type(paddle::framework::proto::VarType::Type type);
+
+InplaceVersion& InplaceVersionCounter() {
+  return *inplace_version_counter_;
+}
+
+/*! The internal of two tensors share the same memory block. */
+DenseTensor& ShareDataWith(const DenseTensor& src);
+
+/*! The internal of two tensors share the same inplace version counter. */
+DenseTensor& ShareInplaceVersionCounterWith(const DenseTensor& src);
+
+DenseTensor Slice(int64_t begin_idx, int64_t end_idx) const;
+
+std::vector<DenseTensor> Split(int64_t split_size, int64_t axis) const;
+
+std::vector<DenseTensor> Chunk(int64_t chunks, int64_t axis) const;
+
+protected:
+std::shared_ptr<InplaceVersion> inplace_version_counter_{
+    std::make_shared<InplaceVersion>()};
+
+/* @jim19930609: This is a hack
+In general, it is badly designed to fuse MKLDNN-specific objects into a
+generic Tensor.
+We temporarily leave them here to unblock Tensor Unification progress.
+In the final state, we should come up with a MKLDNN_Tensor and move the
+following codes there.
+*/
+#ifdef PADDLE_WITH_MKLDNN
+
+public:
+inline dnnl::memory::format_tag format() const { return format_; }
+
+inline void set_format(const dnnl::memory::format_tag format) {
+  format_ = format;
+}
+
+protected:
+/**
+ * @brief the detail format of memory block which have layout as kMKLDNN
+ *
+ * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
+ *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
+ *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
+ *       this field.
+ */
+
+dnnl::memory::format_tag format_ = dnnl::memory::format_tag::undef;
+#endif
+
+/* ------------------------------ */
+/*   From framework::LoDTensor    */
+/* ------------------------------ */
+/* The following members & interfaces were copied from framework::Tensor,
+   so as to facilitate the unification of different Tensors
+
+   Will be adjusted/removed/moved in the near future
+*/
+public:
+explicit DenseTensor(const LoD& lod);
+
+void set_lod(const LoD& lod);
+
+LoD* mutable_lod();
+
+/*
+* Get the start offset and end offset of an element from LoD.
+*/
+std::pair<size_t, size_t> lod_element(size_t level, size_t elem) const;
+
+size_t NumLevels() const;
+
+size_t NumElements(size_t level = 0) const;
diff --git a/paddle/pten/core/dense_tensor_impl.cc b/paddle/pten/core/dense_tensor_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f825d3619b92bcfd8d66ea47d9f176630ccbb525
--- /dev/null
+++ b/paddle/pten/core/dense_tensor_impl.cc
@@ -0,0 +1,394 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/dense_tensor.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/pten/common/bfloat16.h"
+#include "paddle/pten/common/complex.h"
+#include "paddle/pten/common/float16.h"
+
+#include "paddle/pten/api/lib/utils/storage.h"
+#include "paddle/pten/core/convert_utils.h"
+
+namespace pten {
+/* --------------------------- */
+/*   From framework::Tensor    */
+/* --------------------------- */
+DenseTensor::DenseTensor() {
+  meta_.dtype = paddle::experimental::DataType::FLOAT32;
+  meta_.offset = 0;
+}
+
+DenseTensor::DenseTensor(paddle::framework::proto::VarType::Type dtype) {
+  meta_.dtype = TransToPtenDataType(dtype);
+  meta_.offset = 0;
+}
+
+size_t DenseTensor::memory_size() const {
0UL : holder_->size() - meta_.offset;
+}
+
+void DenseTensor::check_memory_size() const {
+ PADDLE_ENFORCE_NOT_NULL(holder_,
+ paddle::platform::errors::PreconditionNotMet(
+ "Tensor holds no memory. "
+ "Call Tensor::mutable_data first."));
+ PADDLE_ENFORCE_LE(
+ numel() * SizeOf(dtype()),
+ memory_size(),
+ paddle::platform::errors::PreconditionNotMet(
+ "Tensor's dimension is out of bound. "
+ "Tensor's dimension must be less than or equal to the size of its "
+ "memory. "
+ "But received Tensor's dimension is %d, memory's size is %d.",
+ numel() * SizeOf(dtype()),
+ memory_size()));
+}
+
+const paddle::platform::Place& DenseTensor::place() const {
+ PADDLE_ENFORCE_NOT_NULL(
+ holder_,
+ paddle::platform::errors::PreconditionNotMet(
+ "Tensor not initialized yet when DenseTensor::place() is called."));
+ return holder_->place();
+}
+
+paddle::framework::proto::VarType::Type DenseTensor::type() const {
+ return TransToProtoVarType(meta_.dtype);
+}
+
+paddle::framework::proto::VarType::Type DenseTensor::saved_type() const {
+ return TransToProtoVarType(meta_.dtype);
+}
+
+void DenseTensor::set_layout(const paddle::framework::DataLayout layout) {
+ meta_.layout = layout;
+}
+
+void DenseTensor::ResetHolder(
+ const std::shared_ptr<paddle::memory::Allocation>& holder) {
+ PADDLE_ENFORCE_EQ(
+ meta_.offset,
+ 0,
+ paddle::platform::errors::Fatal(
+ "Only an offset of zero is supported when the holder is reset."));
+
+ if (holder_) {
+ // TODO(zyfncg): The static_cast<> in this check will be reverted once
+ // SetAllocationForOutputTenosr is deleted.
+ // Currently numel() may return -1, which casts to a very large number
+ // when compared with a value of unsigned long type, making the check
+ // fail; the cast below is a temporary workaround for this problem.
+ PADDLE_ENFORCE_LE(
+ numel() * static_cast<int64_t>(SizeOf(dtype())),
+ static_cast<int64_t>(holder->size()),
+ paddle::platform::errors::InvalidArgument(
+ "The size of Holder is not enough to store the Tensor."));
+ }
+ holder_ = holder;
+}
+
+void DenseTensor::ResetHolderWithType(
+ const std::shared_ptr<paddle::memory::Allocation>& holder,
+ paddle::framework::proto::VarType::Type type) {
+ set_type(type);
+ ResetHolder(holder);
+}
+
+void DenseTensor::set_type(paddle::framework::proto::VarType::Type type) {
+ meta_.dtype = TransToPtenDataType(type);
+}
+
+void* DenseTensor::mutable_data(const paddle::platform::Place& place,
+ paddle::framework::proto::VarType::Type type,
+ size_t requested_size) {
+ set_type(type);
+ PADDLE_ENFORCE_GE(
+ numel(),
+ 0,
+ paddle::platform::errors::PreconditionNotMet(
+ "The Tensor's element number must be equal or greater than zero. 
" + "The Tensor's shape is [", + dims(), + "] now")); + size_t size = numel() * SizeOf(dtype()); + if (requested_size && (requested_size > size)) { + size = requested_size; + } + + /* some versions of boost::variant don't have operator!= */ + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + meta_.offset) { + holder_.reset(); + holder_ = paddle::memory::AllocShared(place, size); + meta_.offset = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + meta_.offset); +} + +void* DenseTensor::mutable_data(const paddle::platform::Place& place, + size_t requested_size) { + return mutable_data(place, type(), requested_size); +} + +void* DenseTensor::mutable_data(const paddle::platform::Place& place, + paddle::framework::proto::VarType::Type type, + const paddle::platform::Stream& stream) { + set_type(type); + PADDLE_ENFORCE_GE( + numel(), + 0, + paddle::platform::errors::PreconditionNotMet( + "The Tensor's element number must be equal or greater than zero. " + "The Tensor's shape is [", + dims(), + "] now")); + size_t size = numel() * SizeOf(dtype()); + + /* some versions of boost::variant don't have operator!= */ + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + meta_.offset || + !(paddle::platform::is_gpu_place(place) && + paddle::memory::InSameStream(holder_, stream))) { + holder_.reset(); + holder_ = paddle::memory::AllocShared(place, size, stream); + meta_.offset = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + meta_.offset); +} + +/* @jim19930609: The following "mutable_data" only supports specific dtypes + defined in OpProto. This part need another clean up once the data type across + Fluid + and Pten get unified. + */ +template +inline T* DenseTensor::mutable_data(const DDim& dims, + const paddle::platform::Place& place, + size_t requested_size) { + static_assert(std::is_pod::value, "T must be POD"); + meta_.dims = dims; + return mutable_data(place, requested_size); +} + +template +inline T* DenseTensor::mutable_data(const paddle::platform::Place& place, + size_t requested_size) { + static_assert(std::is_pod::value, "T must be POD"); + return reinterpret_cast(mutable_data( + place, paddle::framework::DataTypeTrait::DataType(), requested_size)); +} + +void DenseTensor::ShareBufferWith(const DenseTensor& tensor) { + holder_ = tensor.holder_; + meta_.offset = tensor.meta().offset; + meta_.dtype = tensor.dtype(); +} + +#define LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(dtype) \ + template dtype* DenseTensor::mutable_data( \ + const DDim& dims, \ + const paddle::platform::Place& place, \ + size_t requested_size); \ + template dtype* DenseTensor::mutable_data( \ + const paddle::platform::Place& place, size_t requested_size); + +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(bool) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int8_t) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(uint8_t) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int16_t) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int32_t) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int64_t) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(float) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(double) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::bfloat16) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::float16) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex64) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128) + +#undef LEGACY_DATA_MEMBER_FUNC_INSTANTIATION + +/* ------------------------------ */ +/* From 
framework::LoDTensor */
+/* ------------------------------ */
+
+DenseTensor::DenseTensor(intrusive_ptr<Storage> storage,
+ const DenseTensorMeta& meta)
+ : meta_(meta), holder_(storage->move_data_shared()) {}
+
+DenseTensor::DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta)
+ : meta_(std::move(meta)), holder_(storage->move_data_shared()) {}
+
+DenseTensor::DenseTensor(const LoD& lod) : DenseTensor() { meta_.lod = lod; }
+
+void DenseTensor::set_lod(const LoD& lod) { meta_.lod = lod; }
+
+LoD* DenseTensor::mutable_lod() { return &meta_.lod; }
+
+std::pair<size_t, size_t> DenseTensor::lod_element(size_t level,
+ size_t elem) const {
+ PADDLE_ENFORCE_LT(
+ level,
+ NumLevels(),
+ paddle::platform::errors::InvalidArgument(
+ "The input level of LoD is invalid, it should be less than LoD "
+ "size. The input level is %zu, the LoD size is %zu.",
+ level,
+ NumLevels()));
+
+ PADDLE_ENFORCE_LT(elem,
+ NumElements(level),
+ paddle::platform::errors::InvalidArgument(
+ "The input element of LoD is invalid, it should be "
+ "less than the number of elements in its level. "
+ "The input element is %zu, the number of elements in "
+ "its level is %zu.",
+ elem,
+ NumElements(level)));
+
+ return std::make_pair((meta_.lod)[level][elem], (meta_.lod)[level][elem + 1]);
+}
+
+size_t DenseTensor::NumLevels() const { return meta_.lod.size(); }
+
+size_t DenseTensor::NumElements(size_t level) const {
+ PADDLE_ENFORCE_LT(
+ level,
+ NumLevels(),
+ paddle::platform::errors::InvalidArgument(
+ "The input level of LoD is invalid, it should be less than LoD "
+ "size. The input level is %zu, the LoD size is %zu.",
+ level,
+ NumLevels()));
+
+ // the last offset is the end of last element
+ return (meta_.lod)[level].size() - 1;
+}
+
+DenseTensor& DenseTensor::Resize(const DDim& dims) {
+ meta_.dims = dims;
+ return *this;
+}
+
+DenseTensor DenseTensor::Slice(int64_t begin_idx, int64_t end_idx) const {
+ check_memory_size();
+ PADDLE_ENFORCE_GE(begin_idx,
+ 0,
+ paddle::platform::errors::OutOfRange(
+ "The start row index must be greater than or equal to 0. "
+ "But received the start index is %d.",
+ begin_idx));
+ PADDLE_ENFORCE_LE(end_idx,
+ meta_.dims[0],
+ paddle::platform::errors::OutOfRange(
+ "The end row index is out of bound."));
+ PADDLE_ENFORCE_LT(
+ begin_idx,
+ end_idx,
+ paddle::platform::errors::InvalidArgument(
+ "The start row index must be less than the end row index. "
+ "But received the start index = %d, the end index = %d.", + begin_idx, + end_idx)); + + if (meta_.dims[0] == 1) { + return *this; + } else { + size_t base = numel() / meta_.dims[0]; + DenseTensor dst; + dst.holder_ = holder_; + dst.set_layout(meta_.layout); + dst.meta_.dtype = meta_.dtype; + DDim dst_dims = meta_.dims; + dst_dims[0] = end_idx - begin_idx; + dst.Resize(dst_dims); + dst.meta_.offset = meta_.offset + begin_idx * base * SizeOf(dtype()); + return dst; + } +} + +std::vector DenseTensor::Split(int64_t split_size, + int64_t axis) const { + check_memory_size(); + + PADDLE_ENFORCE_GE(meta_.dims.size(), + 0, + paddle::platform::errors::OutOfRange( + "split expects at least a 1-dimensional tensor")); + + PADDLE_ENFORCE_GE( + split_size, + 0, + paddle::platform::errors::OutOfRange( + "split expects split_size be non-negative, but got split_size is %d", + split_size)); + + int64_t numel_size = meta_.dims[axis]; + + int64_t num_splits = 1; + if (split_size != 0) { + num_splits = + std::max((numel_size + split_size - 1) / split_size, 1); + } + + std::vector splits(num_splits); + int64_t last_split_size = split_size - (split_size * num_splits - numel_size); + + for (int64_t i = 0; i < num_splits; ++i) { + int64_t length = i < num_splits - 1 ? split_size : last_split_size; + splits[i] = Slice(i * split_size, i * split_size + length); + } + return splits; +} + +std::vector DenseTensor::Chunk(int64_t chunks, + int64_t axis) const { + check_memory_size(); + PADDLE_ENFORCE_GE(meta_.dims.size(), + 0, + paddle::platform::errors::OutOfRange( + "split expects at least a 1-dimensional tensor")); + PADDLE_ENFORCE_GE( + chunks, + 0, + paddle::platform::errors::OutOfRange( + "chunks expects to be greater than 0, but got chunks is %d", chunks)); + + int64_t numel_size = meta_.dims[axis]; + int64_t split_size = (numel_size + chunks - 1) / chunks; + return Split(split_size, axis); +} + +DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) { + src.check_memory_size(); + // Preserve LoD + auto lod = meta_.lod; + *this = src; + meta_.lod = lod; + return *this; +} + +DenseTensor& DenseTensor::ShareInplaceVersionCounterWith( + const DenseTensor& src) { + PADDLE_ENFORCE_NOT_NULL( + inplace_version_counter_, + paddle::platform::errors::PreconditionNotMet( + "Tensor does not hold inplace_version_counter_.")); + + inplace_version_counter_ = src.inplace_version_counter_; + return *this; +} +} // namespace pten diff --git a/paddle/pten/core/device_context.cc b/paddle/pten/core/device_context.cc index 7b2c4a2cf170f18fefb4df3e3a1dca23230f9ae8..7566b351bf63401acba3bad247b10bd7bb3c9cf1 100644 --- a/paddle/pten/core/device_context.cc +++ b/paddle/pten/core/device_context.cc @@ -13,28 +13,45 @@ // limitations under the License. 
#include "paddle/pten/core/device_context.h" +#include "paddle/pten/api/ext/exception.h" namespace pten { struct DeviceContext::Impl { - Allocator* allocator_{nullptr}; - Impl() = default; ~Impl() = default; - void SetAllocator(Allocator* allocator) { allocator_ = allocator; } + void SetDeviceAllocator(Allocator* allocator) { + device_allocator_ = allocator; + } + + void SetHostAllocator(Allocator* allocator) { host_allocator_ = allocator; } + + const Allocator& GetDeviceAllocator() const { + PD_CHECK(device_allocator_ != nullptr, "the device_allocator is nullptr."); + return *device_allocator_; + } - const Allocator& GetAllocator() const { return *allocator_; } + const Allocator& GetHostAllocator() const { + PD_CHECK(host_allocator_ != nullptr, "the host_allocator is nullptr."); + return *host_allocator_; + } // TODO(Wilber): Add impl. It seems that tensorbase not have interface to // communicate with allocator. - void Alloc(TensorBase* tensor) {} + void HostAlloc(TensorBase* tensor) {} + void DeviceAlloc(TensorBase* tensor) {} + + Allocator* device_allocator_{nullptr}; + Allocator* host_allocator_{nullptr}; }; DeviceContext::DeviceContext() { impl_ = std::make_unique(); } DeviceContext::DeviceContext(const DeviceContext& other) { - impl_->SetAllocator(const_cast(&other.GetAllocator())); + impl_->SetDeviceAllocator( + const_cast(&other.GetDeviceAllocator())); + impl_->SetHostAllocator(const_cast(&other.GetHostAllocator())); } DeviceContext::DeviceContext(DeviceContext&& other) { @@ -43,14 +60,26 @@ DeviceContext::DeviceContext(DeviceContext&& other) { DeviceContext::~DeviceContext() = default; -void DeviceContext::SetAllocator(Allocator* allocator) { - impl_->SetAllocator(allocator); +void DeviceContext::SetHostAllocator(Allocator* allocator) { + impl_->SetHostAllocator(allocator); +} + +void DeviceContext::SetDeviceAllocator(Allocator* allocator) { + impl_->SetDeviceAllocator(allocator); +} + +const Allocator& DeviceContext::GetHostAllocator() const { + return impl_->GetHostAllocator(); } -const Allocator& DeviceContext::GetAllocator() const { - return impl_->GetAllocator(); +const Allocator& DeviceContext::GetDeviceAllocator() const { + return impl_->GetDeviceAllocator(); } -void DeviceContext::Alloc(TensorBase* tensor) { impl_->Alloc(tensor); } +void DeviceContext::HostAlloc(TensorBase* tensor) { impl_->HostAlloc(tensor); } + +void DeviceContext::DeviceAlloc(TensorBase* tensor) { + impl_->DeviceAlloc(tensor); +} } // namespace pten diff --git a/paddle/pten/core/device_context.h b/paddle/pten/core/device_context.h index 1ee2e21494bf544c130ede20ea84c11ae94ca812..c658a24c3527d50efacc9b2b768ac8f07c07b338 100644 --- a/paddle/pten/core/device_context.h +++ b/paddle/pten/core/device_context.h @@ -57,19 +57,38 @@ class DeviceContext { * * @param allocator */ - void SetAllocator(Allocator*); + void SetDeviceAllocator(Allocator*); /** - * @brief Get the const Allocator object. + * @brief Get the const deveice-releated Allocator object. * * @return Allocator */ - const Allocator& GetAllocator() const; + const Allocator& GetDeviceAllocator() const; /** - * @brief Allocate memory for tensor. + * @brief Allocate device memory for tensor. */ - void Alloc(pten::TensorBase*); + void DeviceAlloc(pten::TensorBase*); + + /** + * @brief Set the host Allocator object. + * + * @param allocator + */ + void SetHostAllocator(Allocator*); + + /** + * @brief Get the const host Allocator object. 
+ * + * @return Allocator + */ + const Allocator& GetHostAllocator() const; + + /** + * @brief Allocate host memory for tensor. + */ + void HostAlloc(pten::TensorBase*); // TODO(wilber): Just for the convenience of migrating the code, it will be // modified or removed later. diff --git a/paddle/pten/core/enforce.cc b/paddle/pten/core/enforce.cc new file mode 100644 index 0000000000000000000000000000000000000000..ce23565a8874f1afae8aa1f4feb2f217da5f8ed8 --- /dev/null +++ b/paddle/pten/core/enforce.cc @@ -0,0 +1,15 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/core/enforce.h" diff --git a/paddle/pten/core/enforce.h b/paddle/pten/core/enforce.h new file mode 100644 index 0000000000000000000000000000000000000000..97433f1a6d5fc3d209528d8c419e9737e85cd4ad --- /dev/null +++ b/paddle/pten/core/enforce.h @@ -0,0 +1,558 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef __GNUC__ +#include // for __cxa_demangle +#endif // __GNUC__ + +#if !defined(_WIN32) +#include // dladdr +#include // sleep, usleep +#else // _WIN32 +#ifndef NOMINMAX +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#endif +#include // GetModuleFileName, Sleep +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) +#include +#endif + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "paddle/pten/core/errors.h" +#include "paddle/utils/string/printf.h" +#include "paddle/utils/string/to_string.h" + +// Note: these headers for simplify demangle type string +#include "paddle/pten/core/type_defs.h" + +namespace pten { +class ErrorSummary; +} // namespace pten + +DECLARE_int32(call_stack_level); +namespace pten { +namespace enforce { +/** HELPER MACROS AND FUNCTIONS **/ + +#ifndef PADDLE_MAY_THROW +#define PADDLE_MAY_THROW noexcept(false) +#endif + +// Because most enforce conditions would evaluate to true, we can use +// __builtin_expect to instruct the C++ compiler to generate code that +// always forces branch prediction of true. +// This generates faster binary code. __builtin_expect is since C++11. +// For more details, please check https://stackoverflow.com/a/43870188/724872. 
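To make the __builtin_expect hint described above concrete, here is a small usage sketch of the UNLIKELY macro defined next (illustrative only; checked_div is a hypothetical function, not part of this diff):

    // The zero-divisor branch is marked unlikely, so the compiler lays the
    // division out as the fall-through (hot) path.
    int checked_div(int a, int b) {
      if (UNLIKELY(b == 0)) {
        return 0;  // cold path, rarely taken
      }
      return a / b;  // hot path
    }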
+#if !defined(_WIN32) +#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) +#else +// there is no equivalent intrinsics in msvc. +#define UNLIKELY(condition) (condition) +#endif + +#if !defined(_WIN32) +#define LIKELY(condition) __builtin_expect(static_cast(condition), 1) +#else +// there is no equivalent intrinsics in msvc. +#define LIKELY(condition) (condition) +#endif + +#if defined _WIN32 && defined PADDLE_ON_INFERENCE && defined PADDLE_NO_PYTHON +#define HANDLE_THE_ERROR try { +#define END_HANDLE_THE_ERROR \ + } \ + catch (const std::exception& e) { \ + std::cout << e.what() << std::endl; \ + throw; \ + } +#else +#define HANDLE_THE_ERROR +#define END_HANDLE_THE_ERROR +#endif + +#ifdef __GNUC__ +inline std::string demangle(std::string name) { + int status = -4; // some arbitrary value to eliminate the compiler warning + std::unique_ptr res{ + abi::__cxa_demangle(name.c_str(), NULL, NULL, &status), std::free}; + return (status == 0) ? res.get() : name; +} +#else +inline std::string demangle(std::string name) { return name; } +#endif + +namespace details { +template +inline constexpr bool IsArithmetic() { + return std::is_arithmetic::value; +} + +template +struct TypeConverterImpl { + using Type1 = typename std::common_type::type; + using Type2 = Type1; +}; + +template +struct TypeConverterImpl { + using Type1 = T1; + using Type2 = T2; +}; + +template +struct TypeConverter { + static constexpr bool kIsArithmetic = + IsArithmetic() && IsArithmetic(); + using Type1 = typename TypeConverterImpl::Type1; + using Type2 = typename TypeConverterImpl::Type2; +}; + +template +using CommonType1 = typename std::add_lvalue_reference< + typename std::add_const::Type1>::type>::type; + +template +using CommonType2 = typename std::add_lvalue_reference< + typename std::add_const::Type2>::type>::type; + +// Here, we use SFINAE to check whether T can be converted to std::string +template +struct CanToString { + private: + using YesType = uint8_t; + using NoType = uint16_t; + + template + static YesType Check(decltype(std::cout << std::declval())) { + return 0; + } + + template + static NoType Check(...) 
{
+ return 0;
+ }
+
+ public:
+ static constexpr bool kValue =
+ std::is_same<YesType, decltype(Check<T>(std::cout))>::value;
+};
+
+template <bool kStatus>
+struct BinaryCompareMessageConverter {
+ template <typename T>
+ static std::string Convert(const char* expression, const T& value) {
+ return expression + std::string(":") + paddle::string::to_string(value);
+ }
+};
+
+template <>
+struct BinaryCompareMessageConverter<false> {
+ template <typename T>
+ static const char* Convert(const char* expression, const T& value) {
+ return expression;
+ }
+};
+} // namespace details
+
+template <typename T>
+inline std::string ReplaceComplexTypeStr(std::string str,
+ const std::string& type_name) {
+ auto demangle_type_str = demangle(typeid(T).name());
+ size_t start_pos = 0;
+ while ((start_pos = str.find(demangle_type_str, start_pos)) !=
+ std::string::npos) {
+ str.replace(start_pos, demangle_type_str.length(), type_name);
+ start_pos += type_name.length();
+ }
+ return str;
+}
+
+#define __REPLACE_COMPLEX_TYPE_STR__(__TYPENAME, __STR) \
+ do { \
+ __STR = \
+ pten::enforce::ReplaceComplexTypeStr<__TYPENAME>(__STR, #__TYPENAME); \
+ } while (0)
+
+inline std::string SimplifyDemangleStr(std::string str) {
+ // the order is important: the more complex types have to be replaced first
+ __REPLACE_COMPLEX_TYPE_STR__(paddle::framework::AttributeMap, str);
+ __REPLACE_COMPLEX_TYPE_STR__(paddle::framework::Attribute, str);
+ __REPLACE_COMPLEX_TYPE_STR__(paddle::imperative::NameVariableWrapperMap, str);
+ __REPLACE_COMPLEX_TYPE_STR__(paddle::imperative::NameVarBaseMap, str);
+ __REPLACE_COMPLEX_TYPE_STR__(std::string, str);
+ return str;
+}
+
+inline std::string GetCurrentTraceBackString(bool for_signal = false) {
+ std::ostringstream sout;
+
+ if (!for_signal) {
+ sout << "\n\n--------------------------------------\n";
+ sout << "C++ Traceback (most recent call last):";
+ sout << "\n--------------------------------------\n";
+ }
+#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL)
+ static constexpr int TRACE_STACK_LIMIT = 100;
+
+ void* call_stack[TRACE_STACK_LIMIT];
+ auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
+ auto symbols = backtrace_symbols(call_stack, size);
+ Dl_info info;
+ int idx = 0;
+ // `for_signal` is used to drop the stack frames introduced by capturing
+ // the stack trace itself after a signal error occurred; those frames are
+ // not related to the signal error, so remove them to avoid misleading
+ // users and developers
+ int end_idx = for_signal ?
2 : 0; + for (int i = size - 1; i >= end_idx; --i) { + if (dladdr(call_stack[i], &info) && info.dli_sname) { + auto demangled = demangle(info.dli_sname); + std::string path(info.dli_fname); + // C++ traceback info are from core.so + if (path.substr(path.length() - 3).compare(".so") == 0) { + sout << paddle::string::Sprintf( + "%-3d %s\n", idx++, SimplifyDemangleStr(demangled)); + } + } + } + free(symbols); +#else + sout << "Not support stack backtrace yet.\n"; +#endif + return sout.str(); +} + +template +inline std::string GetErrorSumaryString(StrType&& what, + const char* file, + int line) { + std::ostringstream sout; + if (FLAGS_call_stack_level > 1) { + sout << "\n----------------------\nError Message " + "Summary:\n----------------------\n"; + } + sout << paddle::string::Sprintf( + "%s (at %s:%d)", std::forward(what), file, line) + << std::endl; + return sout.str(); +} + +template +inline std::string GetTraceBackString(StrType&& what, + const char* file, + int line) { + if (FLAGS_call_stack_level > 1) { + // FLAGS_call_stack_level>1 means showing c++ call stack + return GetCurrentTraceBackString() + GetErrorSumaryString(what, file, line); + } else { + return GetErrorSumaryString(what, file, line); + } +} + +inline std::string SimplifyErrorTypeFormat(const std::string& str) { + std::ostringstream sout; + size_t type_end_pos = str.find(":", 0); + if (type_end_pos == std::string::npos) { + sout << str; + } else { + // Remove "Error:", add "()"" + sout << "(" << str.substr(0, type_end_pos - 5) << ")" + << str.substr(type_end_pos + 1); + } + return sout.str(); +} + +inline bool is_error(bool stat) { return !stat; } + +// Note: This Macro can only be used within enforce.h +#define __THROW_ERROR_INTERNAL__(__ERROR_SUMMARY) \ + do { \ + HANDLE_THE_ERROR \ + throw ::pten::enforce::EnforceNotMet(__ERROR_SUMMARY, __FILE__, __LINE__); \ + END_HANDLE_THE_ERROR \ + } while (0) + +/** ENFORCE EXCEPTION AND MACROS **/ + +struct EnforceNotMet : public std::exception { + public: + EnforceNotMet(std::exception_ptr e, const char* file, int line) { + try { + std::rethrow_exception(e); + } catch (EnforceNotMet& e) { + code_ = e.code(); + err_str_ = GetTraceBackString(e.what(), file, line); + simple_err_str_ = SimplifyErrorTypeFormat(err_str_); + } catch (std::exception& e) { + err_str_ = GetTraceBackString(e.what(), file, line); + simple_err_str_ = SimplifyErrorTypeFormat(err_str_); + } + } + + EnforceNotMet(const std::string& str, const char* file, int line) + : err_str_(GetTraceBackString(str, file, line)) { + simple_err_str_ = SimplifyErrorTypeFormat(err_str_); + } + + EnforceNotMet(const pten::ErrorSummary& error, const char* file, int line) + : code_(error.code()), + err_str_(GetTraceBackString(error.to_string(), file, line)) { + simple_err_str_ = SimplifyErrorTypeFormat(err_str_); + } + + const char* what() const noexcept override { + if (FLAGS_call_stack_level > 1) { + return err_str_.c_str(); + } else { + return simple_err_str_.c_str(); + } + } + + pten::ErrorCode code() const { return code_; } + + const std::string& error_str() const { return err_str_; } + + const std::string& simple_error_str() const { return simple_err_str_; } + + void set_error_str(std::string str) { + if (FLAGS_call_stack_level > 1) { + err_str_ = str; + } else { + simple_err_str_ = str; + } + } + + private: + // Used to determine the final type of exception thrown + pten::ErrorCode code_ = pten::ErrorCode::LEGACY; + // Complete error message + // e.g. 
InvalidArgumentError: ***
+ std::string err_str_;
+ // Simple error message used when no C++ stack and no Python compile stack are shown
+ // e.g. (InvalidArgument) ***
+ std::string simple_err_str_;
+};
+
+#define PADDLE_THROW(...) \
+ do { \
+ HANDLE_THE_ERROR \
+ throw ::pten::enforce::EnforceNotMet( \
+ ::pten::ErrorSummary(__VA_ARGS__), __FILE__, __LINE__); \
+ END_HANDLE_THE_ERROR \
+ } while (0)
+
+#if defined(__CUDA_ARCH__)
+// For cuda, the assertions can affect performance and it is therefore
+// recommended to disable them in production code
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion
+#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \
+ do { \
+ if (!(_IS_NOT_ERROR)) { \
+ printf("Error: %s:%d Assertion `%s` failed. " __FORMAT "\n", \
+ __FILE__, \
+ __LINE__, \
+ #_IS_NOT_ERROR, \
+ ##__VA_ARGS__); \
+ asm("trap;"); \
+ } \
+ } while (0)
+#elif defined(__HIPCC__)
+#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \
+ do { \
+ if (!(_IS_NOT_ERROR)) { \
+ printf("Error: %s:%d Assertion `%s` failed. " __FORMAT "\n", \
+ __FILE__, \
+ __LINE__, \
+ #_IS_NOT_ERROR, \
+ ##__VA_ARGS__); \
+ abort(); \
+ } \
+ } while (0)
+#else
+#define PADDLE_ENFORCE(COND, ...) \
+ do { \
+ auto __cond__ = (COND); \
+ if (UNLIKELY(::pten::is_error(__cond__))) { \
+ __THROW_ERROR_INTERNAL__(pten::ErrorSummary(__VA_ARGS__)); \
+ } \
+ } while (0)
+#endif
+
+/*
+ * Some enforce helpers here, usage:
+ * int a = 1;
+ * int b = 2;
+ * PADDLE_ENFORCE_EQ(a, b);
+ *
+ * will raise an exception with a message described as follows:
+ * "Expected input a == b, but received a(1) != b(2)."
+ * with detailed stack information.
+ *
+ * extra messages are also supported, for example:
+ * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
+ */
+
+#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \
+ do { \
+ if (UNLIKELY(nullptr == (__VAL))) { \
+ auto __summary__ = pten::ErrorSummary(__VA_ARGS__); \
+ auto __message__ = ::paddle::string::Sprintf( \
+ "%s\n [Hint: " #__VAL " should not be null.]", \
+ __summary__.error_message()); \
+ __THROW_ERROR_INTERNAL__( \
+ pten::ErrorSummary(__summary__.code(), __message__)); \
+ } \
+ } while (0)
+
+#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...) \
+ do { \
+ auto __val1 = (__VAL1); \
+ auto __val2 = (__VAL2); \
+ using __TYPE1__ = decltype(__val1); \
+ using __TYPE2__ = decltype(__val2); \
+ using __COMMON_TYPE1__ = \
+ ::pten::details::CommonType1<__TYPE1__, __TYPE2__>; \
+ using __COMMON_TYPE2__ = \
+ ::pten::details::CommonType2<__TYPE1__, __TYPE2__>; \
+ bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \
+ static_cast<__COMMON_TYPE2__>(__val2)); \
+ if (UNLIKELY(!__is_not_error)) { \
+ auto __summary__ = pten::ErrorSummary(__VA_ARGS__); \
+ constexpr bool __kCanToString__ = \
+ ::pten::details::CanToString<__TYPE1__>::kValue && \
+ ::pten::details::CanToString<__TYPE2__>::kValue; \
+ auto __message__ = ::paddle::string::Sprintf( \
+ "%s\n [Hint: Expected %s " #__CMP \
+ " %s, but received %s " #__INV_CMP " %s.]", \
+ __summary__.error_message(), \
+ #__VAL1, \
+ #__VAL2, \
+ ::pten::details::BinaryCompareMessageConverter< \
+ __kCanToString__>::Convert(#__VAL1, __val1), \
+ ::pten::details::BinaryCompareMessageConverter< \
+ __kCanToString__>::Convert(#__VAL2, __val2)); \
+ __THROW_ERROR_INTERNAL__( \
+ pten::ErrorSummary(__summary__.code(), __message__)); \
+ } \
+ } while (0)
+
+#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...)
\
+ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
+#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
+ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__)
+#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \
+ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__)
+#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \
+ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__)
+#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \
+ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
+#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
+ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
+
+/** EXTENDED TOOL FUNCTIONS WITH CHECKING **/
+
+/*
+ * Summary: This macro is used to get Variable or internal type
+ * data (such as LoDTensor or SelectedRows) of the Input and
+ * Output in op, generally used when calling scope.FindVar(Input/
+ * Output("Name")) or ctx.Input<LoDTensor>().
+ * First, this macro checks whether the obtained pointer is null,
+ * and then returns the data if it is not null.
+ *
+ * Note: This macro is only suitable for specific scenarios and
+ * is not intended to be widely used. If it cannot meet the
+ * requirements, please use the other PADDLE_ENFORCE** check macros.
+ *
+ * Parameters:
+ * __PTR: pointer
+ * __ROLE: (string), Input or Output
+ * __NAME: (string), Input or Output name
+ * __OP_TYPE: (string), the op type
+ *
+ * Return: The data pointed to by the pointer.
+ *
+ * Examples:
+ * GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
+ */
+#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \
+ (([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type { \
+ auto* __ptr = (__PTR); \
+ if (UNLIKELY(nullptr == __ptr)) { \
+ auto __summary__ = pten::errors::NotFound( \
+ "Unable to get %s data of %s %s in operator %s. " \
+ "Possible reasons are:\n" \
+ " 1. The %s is not the %s of operator %s;\n" \
+ " 2. The %s has no corresponding variable passed in;\n" \
+ " 3. The %s corresponding variable is not initialized.", \
+ pten::demangle( \
+ typeid(std::add_lvalue_reference<decltype(*__ptr)>::type) \
+ .name()), \
+ __ROLE, \
+ __NAME, \
+ __OP_TYPE, \
+ __NAME, \
+ __ROLE, \
+ __OP_TYPE, \
+ __NAME, \
+ __NAME); \
+ auto __message__ = ::paddle::string::Sprintf( \
+ "%s\n [Hint: pointer " #__PTR " should not be null.]", \
+ __summary__.error_message()); \
+ __THROW_ERROR_INTERNAL__( \
+ pten::ErrorSummary(__summary__.code(), __message__)); \
+ } \
+ return *__ptr; \
+ })())
+
+/*
+ * Summary: This macro is used to check whether op has specified
+ * Input or Output Variables. Because op's Input and Output
+ * checks are written similarly, this macro abstracts them.
+ * + * Parameters: + *     __EXPR: (bool), the bool expression + * __ROLE: (string), Input or Output + * __NAME: (string), Input or Output name + * __OP_TYPE: (string), the op type + * + * Examples: + * OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul"); + */ +#define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE) \ + do { \ + PADDLE_ENFORCE_EQ( \ + __EXPR, \ + true, \ + pten::errors::NotFound( \ + "No %s(%s) found for %s operator.", __ROLE, __NAME, __OP_TYPE)); \ + } while (0) + +} // namespace enforce +using namespace enforce; // NOLINT +} // namespace pten diff --git a/paddle/fluid/platform/errors.cc b/paddle/pten/core/errors.cc similarity index 63% rename from paddle/fluid/platform/errors.cc rename to paddle/pten/core/errors.cc index 94a182f96567889c3093a2eca0d7ac013599c471..c567cfe66465cc90a313b88d4627dceafa627798 100644 --- a/paddle/fluid/platform/errors.cc +++ b/paddle/pten/core/errors.cc @@ -12,54 +12,50 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/errors.h" +#include "paddle/pten/core/errors.h" #include -namespace paddle { -namespace platform { - -typedef ::paddle::platform::error::Code Code; - -std::string error_name(Code code) { +namespace pten { +std::string error_name(ErrorCode code) { switch (code) { - case paddle::platform::error::LEGACY: + case ErrorCode::LEGACY: return "Error"; break; - case paddle::platform::error::INVALID_ARGUMENT: + case ErrorCode::INVALID_ARGUMENT: return "InvalidArgumentError"; break; - case paddle::platform::error::NOT_FOUND: + case ErrorCode::NOT_FOUND: return "NotFoundError"; break; - case paddle::platform::error::OUT_OF_RANGE: + case ErrorCode::OUT_OF_RANGE: return "OutOfRangeError"; break; - case paddle::platform::error::ALREADY_EXISTS: + case ErrorCode::ALREADY_EXISTS: return "AlreadyExistsError"; break; - case paddle::platform::error::RESOURCE_EXHAUSTED: + case ErrorCode::RESOURCE_EXHAUSTED: return "ResourceExhaustedError"; break; - case paddle::platform::error::PRECONDITION_NOT_MET: + case ErrorCode::PRECONDITION_NOT_MET: return "PreconditionNotMetError"; break; - case paddle::platform::error::PERMISSION_DENIED: + case ErrorCode::PERMISSION_DENIED: return "PermissionDeniedError"; break; - case paddle::platform::error::EXECUTION_TIMEOUT: + case ErrorCode::EXECUTION_TIMEOUT: return "ExecutionTimeoutError"; break; - case paddle::platform::error::UNIMPLEMENTED: + case ErrorCode::UNIMPLEMENTED: return "UnimplementedError"; break; - case paddle::platform::error::UNAVAILABLE: + case ErrorCode::UNAVAILABLE: return "UnavailableError"; break; - case paddle::platform::error::FATAL: + case ErrorCode::FATAL: return "FatalError"; break; - case paddle::platform::error::EXTERNAL: + case ErrorCode::EXTERNAL: return "ExternalError"; break; default: @@ -74,6 +70,4 @@ std::string ErrorSummary::to_string() const { result += error_message(); return result; } - -} // namespace platform -} // namespace paddle +} // namespace pten diff --git a/paddle/pten/core/errors.h b/paddle/pten/core/errors.h new file mode 100644 index 0000000000000000000000000000000000000000..56bbeef644f9e1dd1087a3923d302180975b7573 --- /dev/null +++ b/paddle/pten/core/errors.h @@ -0,0 +1,146 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "paddle/utils/string/printf.h"
+
+namespace pten {
+enum ErrorCode {
+ // Legacy error.
+ // Error type string: "Error"
+ LEGACY = 0,
+
+ // Client specified an invalid argument.
+ // Error type string: "InvalidArgumentError"
+ INVALID_ARGUMENT = 1,
+
+ // Some requested entity (e.g., file or directory) was not found.
+ // Error type string: "NotFoundError"
+ NOT_FOUND = 2,
+
+ // Operation tried to iterate past the valid input range. E.g., seeking or
+ // reading past end of file.
+ // Error type string: "OutOfRangeError"
+ OUT_OF_RANGE = 3,
+
+ // Some entity that we attempted to create (e.g., file or directory)
+ // already exists.
+ // Error type string: "AlreadyExistsError"
+ ALREADY_EXISTS = 4,
+
+ // Some resource has been exhausted, perhaps a per-user quota, or
+ // perhaps the entire file system is out of space.
+ // Error type string: "ResourceExhaustedError"
+ RESOURCE_EXHAUSTED = 5,
+
+ // Operation was rejected because the system is not in a state
+ // required for the operation's execution.
+ // Error type string: "PreconditionNotMetError"
+ PRECONDITION_NOT_MET = 6,
+
+ // The caller does not have permission to execute the specified
+ // operation.
+ // Error type string: "PermissionDeniedError"
+ PERMISSION_DENIED = 7,
+
+ // Deadline expired before operation could complete.
+ // Error type string: "ExecutionTimeoutError"
+ EXECUTION_TIMEOUT = 8,
+
+ // Operation is not implemented or not supported/enabled in this service.
+ // Error type string: "UnimplementedError"
+ UNIMPLEMENTED = 9,
+
+ // The service is currently unavailable. This is most likely a
+ // transient condition and may be corrected by retrying with
+ // a backoff.
+ // Error type string: "UnavailableError"
+ UNAVAILABLE = 10,
+
+ // Fatal errors. This means some invariant expected by the underlying
+ // system has been broken. If you see one of these errors,
+ // something is very broken.
+ // Error type string: "FatalError"
+ FATAL = 11,
+
+ // Third-party library error.
+ // Error type string: "ExternalError"
+ EXTERNAL = 12,
+};
+
+class ErrorSummary {
+ public:
+ // Note(chenweihang): Final deprecated constructor.
+ // This constructor is kept to stay compatible with
+ // the existing untyped PADDLE_ENFORCE_* and
+ // PADDLE_ENFORCE macros.
+ // Note(chenweihang): Windows openblas needs this
+ // constructor for compiling PADDLE_ENFORCE in *.cu;
+ // this is a bug, but because of it we can't remove
+ // this constructor now.
+ template <typename... Args>
+ explicit ErrorSummary(Args... args) {
+ code_ = pten::ErrorCode::LEGACY;
+ msg_ = paddle::string::Sprintf(args...);
+ }
+
+ // Note(chenweihang): The only recommended constructor.
+ // PADDLE_ENFORCE without a type or without an error
+ // message is no longer supported.
+ explicit ErrorSummary(ErrorCode code, std::string msg)
+ : code_(code), msg_(msg) {}
+
+ ErrorCode code() const { return code_; }
+
+ const std::string& error_message() const { return msg_; }
+
+ std::string to_string() const;
+
+ private:
+ ErrorCode code_;
+ std::string msg_;
+};
+
+namespace errors {
+
+#define REGISTER_ERROR(FUNC, CONST, ...)
\ + template \ + ::pten::ErrorSummary FUNC(Args... args) { \ + return ::pten::ErrorSummary(::pten::CONST, \ + ::paddle::string::Sprintf(args...)); \ + } + +REGISTER_ERROR(InvalidArgument, ErrorCode::INVALID_ARGUMENT) +REGISTER_ERROR(NotFound, ErrorCode::NOT_FOUND) +REGISTER_ERROR(OutOfRange, ErrorCode::OUT_OF_RANGE) +REGISTER_ERROR(AlreadyExists, ErrorCode::ALREADY_EXISTS) +REGISTER_ERROR(ResourceExhausted, ErrorCode::RESOURCE_EXHAUSTED) +REGISTER_ERROR(PreconditionNotMet, ErrorCode::PRECONDITION_NOT_MET) +REGISTER_ERROR(PermissionDenied, ErrorCode::PERMISSION_DENIED) +REGISTER_ERROR(ExecutionTimeout, ErrorCode::EXECUTION_TIMEOUT) +REGISTER_ERROR(Unimplemented, ErrorCode::UNIMPLEMENTED) +REGISTER_ERROR(Unavailable, ErrorCode::UNAVAILABLE) +REGISTER_ERROR(Fatal, ErrorCode::FATAL) +REGISTER_ERROR(External, ErrorCode::EXTERNAL) + +#undef REGISTER_ERROR + +} // namespace errors +} // namespace pten diff --git a/paddle/pten/core/infermeta_utils.h b/paddle/pten/core/infermeta_utils.h index c6812dee92b6a77534faa6a8853e322e285d2c6d..bfc9d29e63709f7ad6eff498953027003c677edf 100644 --- a/paddle/pten/core/infermeta_utils.h +++ b/paddle/pten/core/infermeta_utils.h @@ -151,7 +151,7 @@ struct InferMetaFnImpl { struct InferMetaFnCallHelper { template static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { - const MetaConfig& arg = ctx->GetMetaConfig(); + MetaConfig arg = ctx->GetMetaConfig(); InferMetaFnCallHelper::template Call( ctx, pargs..., arg); } diff --git a/paddle/pten/core/kernel_alias_name.h b/paddle/pten/core/kernel_alias_name.h index 8e089970f9139e2ec2fb2d84644e9982018bbbd1..e473861dcf09c88de3936ca8849aedc6dac744d6 100644 --- a/paddle/pten/core/kernel_alias_name.h +++ b/paddle/pten/core/kernel_alias_name.h @@ -21,6 +21,7 @@ namespace pten { // the key is sorted by key's alphabet const std::unordered_map kernel_alias_name_map = { {"elementwise_add", "add_raw"}, + {"elementwise_add_grad", "add_grad"}, {"elementwise_div", "divide_raw"}, {"elementwise_mul", "muliply_raw"}, {"elementwise_sub", "subtract_raw"}, diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h index 5dd2bf367b3b83fbef585239af6a11c552821398..def1019e204cd85da56f1a45e162f3e9c4251af3 100644 --- a/paddle/pten/core/kernel_context.h +++ b/paddle/pten/core/kernel_context.h @@ -24,7 +24,7 @@ // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" namespace pten { @@ -123,7 +123,7 @@ class KernelContext { try { return paddle::any_cast(attrs_.at(idx)); } catch (paddle::bad_any_cast&) { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( + PADDLE_THROW(pten::errors::InvalidArgument( "Attribute cast error in Op Kernel Context.")); } } diff --git a/paddle/pten/core/kernel_factory.cc b/paddle/pten/core/kernel_factory.cc index f10b58506f728ed39b62ec6c6efad621ab8ce926..06049b237d57946d89dd3793211a2f6af85f610f 100644 --- a/paddle/pten/core/kernel_factory.cc +++ b/paddle/pten/core/kernel_factory.cc @@ -15,7 +15,7 @@ #include "paddle/pten/core/kernel_factory.h" // See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" namespace pten { @@ -64,8 +64,8 @@ const Kernel& KernelFactory::SelectKernelOrThrowError( auto iter = kernels_.find(kernel_name); PADDLE_ENFORCE_NE(iter, kernels_.end(), - paddle::platform::errors::NotFound( - "The kernel `%s` is not registered.", kernel_name)); + pten::errors::NotFound("The kernel `%s` is not registered.", + kernel_name)); auto kernel_iter = iter->second.find(kernel_key); // TODO(chenweihang): polish refind impl here @@ -78,7 +78,7 @@ const Kernel& KernelFactory::SelectKernelOrThrowError( PADDLE_ENFORCE_NE( kernel_iter, iter->second.end(), - paddle::platform::errors::NotFound( + pten::errors::NotFound( "The kernel with key %s of kernel `%s` is not registered.", kernel_key, kernel_name)); diff --git a/paddle/pten/core/kernel_factory.h b/paddle/pten/core/kernel_factory.h index bd26d86a34a0942da61f08c040fdd6a0ec47a2cf..8a100451cd4a8f99064e4a5b129e2c50413befa4 100644 --- a/paddle/pten/core/kernel_factory.h +++ b/paddle/pten/core/kernel_factory.h @@ -27,7 +27,7 @@ #include "paddle/pten/core/kernel_def.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/small_vector.h" diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h index e1160ea6b7d5dfa0bcb086247e3cecc99a3fdb78..800c01f6916821e75a5335de7b1efd91cd6ea9f8 100644 --- a/paddle/pten/core/kernel_registry.h +++ b/paddle/pten/core/kernel_registry.h @@ -26,7 +26,7 @@ #include "paddle/pten/core/kernel_utils.h" #include "paddle/pten/core/macros.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" namespace pten { diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h index 60201151c62a23130878d93cc0992f9b6e79c02e..85fe2f22836e61bf7348fa0bbe36c9efb2b02331 100644 --- a/paddle/pten/core/kernel_utils.h +++ b/paddle/pten/core/kernel_utils.h @@ -22,7 +22,7 @@ #include "paddle/pten/core/kernel_def.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" namespace pten { diff --git a/paddle/pten/core/meta_tensor.cc b/paddle/pten/core/meta_tensor.cc index f52d771b73bb90312a1080fea80aa476bcd90d95..a8229b568a617160ba4d1870f9c6954fb0697de6 100644 --- a/paddle/pten/core/meta_tensor.cc +++ b/paddle/pten/core/meta_tensor.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/pten/core/meta_tensor.h b/paddle/pten/core/meta_tensor.h index 442ff4137de4267e863c169df3dceb4deca2757a..1435e1c3912d0cc661beb839c354171272fbfac5 100644 --- a/paddle/pten/core/meta_tensor.h +++ b/paddle/pten/core/meta_tensor.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/paddle/pten/core/selected_rows.cc b/paddle/pten/core/selected_rows.cc index 6f64602bdcf4d9f70d57a76677a1796b373808ac..1dfcfa49347b50d305c2b37ccc4379eedb08a107 100644 --- a/paddle/pten/core/selected_rows.cc +++ b/paddle/pten/core/selected_rows.cc @@ -13,9 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/pten/core/selected_rows.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/data_type.h" +#include "paddle/pten/core/utils/data_type.h" namespace pten { @@ -191,16 +189,16 @@ void SelectedRows::Get(const pten::DenseTensor& ids, int64_t index = AutoGrownIndex(id, auto_grown, is_test); if (index < 0) { VLOG(5) << "id " << id << " not in the table, return 0"; - paddle::framework::VisitDataType( - value_->type(), + pten::VisitDataType( + value_->dtype(), TensorFillVisitor(value, i * value_width, value_width, 0.0)); } else { - paddle::framework::VisitDataType(value_->type(), - TensorCopyVisitor(value, - i * value_width, - *value_.get(), - index * value_width, - value_width)); + pten::VisitDataType(value_->dtype(), + TensorCopyVisitor(value, + i * value_width, + *value_.get(), + index * value_width, + value_width)); } } } diff --git a/paddle/pten/core/selected_rows.h b/paddle/pten/core/selected_rows.h index f5be0a906dbdbb5339f995430a95a4be106a4a62..e12f59d02f2ba21054700248404640730614b277 100644 --- a/paddle/pten/core/selected_rows.h +++ b/paddle/pten/core/selected_rows.h @@ -24,15 +24,16 @@ limitations under the License. */ #include "paddle/pten/common/place.h" #include "paddle/pten/core/ddim.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/enforce.h" #include "paddle/pten/core/utils/rw_lock.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/enforce.h" namespace pten { -class SelectedRows { +class SelectedRows : public TensorBase, + public TypeInfoTraits { /* * @brief We can use the SelectedRows structure to reproduce a sparse table. * A sparse table is a key-value structure that the key is an `int64_t`, @@ -51,21 +52,19 @@ class SelectedRows { public: SelectedRows(const std::vector& rows, const int64_t& height) : rows_(rows), height_(height) { - value_.reset(new pten::DenseTensor()); + value_.reset(new DenseTensor()); rwlock_.reset(new RWLock); } SelectedRows() { height_ = 0; - value_.reset(new pten::DenseTensor()); + value_.reset(new DenseTensor()); rwlock_.reset(new RWLock); } - const pten::Place& place() const { return value_->place(); } + const DenseTensor& value() const { return *value_; } - const pten::DenseTensor& value() const { return *value_; } - - pten::DenseTensor* mutable_value() { return value_.get(); } + DenseTensor* mutable_value() { return value_.get(); } int64_t height() const { return height_; } @@ -109,8 +108,8 @@ class SelectedRows { * @return a list of pair which contains the non-exists key and the index in * the value */ - void Get(const pten::DenseTensor& ids, - pten::DenseTensor* value, + void Get(const DenseTensor& ids, + DenseTensor* value, bool auto_grown = false, bool is_test = false); @@ -149,6 +148,41 @@ class SelectedRows { return pten::framework::make_ddim(dims); } + /// \brief Returns the name of the class for type traits. + /// \return The name of the class. + static const char* name() { return "SelectedRows"; } + + /// \brief Returns the number of elements contained in tensor. 
+ /// \return The number of elements contained in tensor. + int64_t numel() const override { return value_->numel(); }; + + /// \brief Returns the dims of the tensor. + /// \return The dims of the tensor. + const DDim& dims() const noexcept override { + return value_->dims(); + // return paddle::framework::make_ddim(dims); + } + + /// \brief Returns the data type of the tensor. + /// \return The data type of the tensor. + DataType dtype() const noexcept override { return value_->dtype(); } + + /// \brief Returns the data layout of the tensor. + /// \return The data layout of the tensor. + DataLayout layout() const noexcept override { return value_->layout(); } + + /// \brief Returns the data place of the tensor. + /// \return The data place of the tensor. + const Place& place() const override { return value_->place(); }; + + /// \brief Test whether the metadata is valid. + /// \return Whether the metadata is valid. + bool valid() const noexcept override { return value_->valid(); } + + /// \brief Test whether the storage is allocated. + /// return Whether the storage is allocated. + bool initialized() const override { return value_->initialized(); } + private: // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. // SelectedRows are simply concated when adding together. Until a @@ -156,7 +190,7 @@ class SelectedRows { paddle::framework::Vector rows_; std::unordered_map id_to_index_; // should not be used when rows_ has duplicate member - std::unique_ptr value_{nullptr}; + std::unique_ptr value_{nullptr}; int64_t height_; // height indicates the underline tensor's height std::unique_ptr rwlock_{nullptr}; }; diff --git a/paddle/pten/core/tensor_status.h b/paddle/pten/core/tensor_status.h deleted file mode 100644 index e426a27eabb882adf447d610c957173a46903c49..0000000000000000000000000000000000000000 --- a/paddle/pten/core/tensor_status.h +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/common/backend.h" -#include "paddle/pten/common/data_type.h" -#include "paddle/pten/common/layout.h" -namespace pten { -class TensorInplaceVersion { - public: - explicit TensorInplaceVersion(uint32_t inplace_version = 0) - : inplace_version_(inplace_version) {} - bool IsUnique() const { return inplace_version_ == 0; } - void Bump() { ++inplace_version_; } - uint32_t CurrentVersion() const { return inplace_version_; } - - private: - uint32_t inplace_version_; -}; - -/** - * The Status data member of DenseTensor. - * - * Here the `static` represents information describing the status of Tensor, - * such as version counter, or other bool status members. - * - * Note: TensorStatus is a struct, the members are named like - * ordinary nonmember variables, such as `type` instead of `type_`. - * And we direct access its members, in addition to constructor, destructor - * and functions for setting data members, can not provide other functions. 
- * - * Note: polish impl later - */ -struct TensorStatus { - TensorStatus() = default; - TensorStatus(const TensorStatus&) = default; - TensorStatus(TensorStatus&&) = default; - - TensorStatus& operator=(const TensorStatus&) = delete; - TensorStatus& operator=(TensorStatus&&) = delete; - - TensorInplaceVersion inplace_version_counter{0}; - - /** - * For Scalar Tensor design - */ - bool is_scalar{false}; -}; - -} // namespace pten diff --git a/paddle/pten/core/type_defs.h b/paddle/pten/core/type_defs.h new file mode 100644 index 0000000000000000000000000000000000000000..13e7bb51c2e1bada6957108faace579b3cb76ecc --- /dev/null +++ b/paddle/pten/core/type_defs.h @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace paddle { +namespace framework { +// The order should be as same as framework.proto +// NOTE(xiongkun): we extract from framework/typedef.h to ensure we can transfer +// enforce.h +class BlockDesc; +using Attribute = boost::variant, + std::vector, + std::vector, + bool, + std::vector, + BlockDesc*, + int64_t, + std::vector, + std::vector, + std::vector>; +using AttributeMap = std::unordered_map; +} // namespace framework + +namespace imperative { + +class VariableWrapper; +class SavedVariableWrapperList; +class VarBase; +class OpBase; +class GradOpNode; +class Tracer; + +using WeakNameVarBaseMap = + std::map>>; + +namespace details { +template +struct NameVarMapTrait {}; + +template <> +struct NameVarMapTrait { + using Type = std::map>>; +}; + +template <> +struct NameVarMapTrait { + using Type = std::map; +}; +} // namespace details + +template +using NameVarMap = typename details::NameVarMapTrait::Type; + +using NameVarBaseMap = NameVarMap; +using NameVariableWrapperMap = NameVarMap; + +using VariableWrapperList = std::vector>; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/pten/core/utils/data_type.h b/paddle/pten/core/utils/data_type.h new file mode 100644 index 0000000000000000000000000000000000000000..ee223afb3b03c0e2b770097e4313ce31c45927ea --- /dev/null +++ b/paddle/pten/core/utils/data_type.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include + +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/enforce.h" +#include "paddle/pten/kernels/funcs/eigen/extensions.h" + +namespace pten { + +#define _PtenForEachDataTypeHelper_(callback, cpp_type, data_type) \ + callback(cpp_type, data_type); + +#define _PtenForEachDataType_(callback) \ + _PtenForEachDataTypeHelper_(callback, float, DataType::FLOAT32); \ + _PtenForEachDataTypeHelper_( \ + callback, ::paddle::platform::float16, DataType::FLOAT16); \ + _PtenForEachDataTypeHelper_( \ + callback, ::paddle::platform::bfloat16, DataType::BFLOAT16); \ + _PtenForEachDataTypeHelper_(callback, double, DataType::FLOAT64); \ + _PtenForEachDataTypeHelper_(callback, int, DataType::INT32); \ + _PtenForEachDataTypeHelper_(callback, int64_t, DataType::INT64); \ + _PtenForEachDataTypeHelper_(callback, bool, DataType::BOOL); \ + _PtenForEachDataTypeHelper_(callback, uint8_t, DataType::UINT8); \ + _PtenForEachDataTypeHelper_(callback, int16_t, DataType::INT16); \ + _PtenForEachDataTypeHelper_(callback, int8_t, DataType::INT8); \ + _PtenForEachDataTypeHelper_( \ + callback, ::paddle::platform::complex<float>, DataType::COMPLEX64); \ + _PtenForEachDataTypeHelper_( \ + callback, ::paddle::platform::complex<double>, DataType::COMPLEX128); + +template <typename Visitor> +inline void VisitDataType(pten::DataType type, Visitor visitor) { +#define PtenVisitDataTypeCallback(cpp_type, data_type) \ + do { \ + if (type == data_type) { \ + visitor.template apply<cpp_type>(); \ + return; \ + } \ + } while (0) + + _PtenForEachDataType_(PtenVisitDataTypeCallback); +#undef PtenVisitDataTypeCallback + PADDLE_THROW(pten::errors::Unimplemented( + "Unsupported pten::DataType(%d) as data type.", + static_cast<int>(type))); +} +} // namespace pten diff --git a/paddle/pten/core/utils/intrusive_ptr.h b/paddle/pten/core/utils/intrusive_ptr.h index ed9a21e7f3a8a6c169a4f83572cfc9be6ff3a8d6..40f1dba4f64ad7688753042cb28ed2115d73f8aa 100644 --- a/paddle/pten/core/utils/intrusive_ptr.h +++ b/paddle/pten/core/utils/intrusive_ptr.h @@ -16,7 +16,7 @@ limitations under the License.
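A brief usage sketch of the VisitDataType dispatcher defined above: the macro table expands to one if-branch per (cpp_type, data_type) pair and invokes visitor.template apply<cpp_type>(). Any struct with a templated apply<T>() member works; the CountBytesVisitor name below is hypothetical, a minimal sketch assuming only the header introduced in this patch.

#include <cstddef>
#include <iostream>

#include "paddle/pten/core/utils/data_type.h"

// Hypothetical visitor: maps the runtime pten::DataType to a static C++ type
// and records sizeof() of that type.
struct CountBytesVisitor {
  std::size_t* bytes;
  template <typename T>
  void apply() const {
    *bytes = sizeof(T);
  }
};

int main() {
  std::size_t bytes = 0;
  pten::VisitDataType(pten::DataType::FLOAT32, CountBytesVisitor{&bytes});
  std::cout << bytes << std::endl;  // prints 4
  return 0;
}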
*/ #include #include "glog/logging.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" namespace pten { @@ -62,7 +62,7 @@ class intrusive_ptr { T& operator*() const { PADDLE_ENFORCE_NOT_NULL( px, - paddle::platform::errors::PreconditionNotMet( + pten::errors::PreconditionNotMet( "The pointer must be non-null before the dereference operation.")); return *px; } @@ -70,7 +70,7 @@ class intrusive_ptr { T* operator->() const { PADDLE_ENFORCE_NOT_NULL( px, - paddle::platform::errors::PreconditionNotMet( + pten::errors::PreconditionNotMet( "The pointer must be non-null before the dereference operation.")); return px; } diff --git a/paddle/pten/infermeta/CMakeLists.txt b/paddle/pten/infermeta/CMakeLists.txt index 8e50d9d2c90d435eddd75f110ca7de38e11c9044..2216d38708b0b4746e55481ca63299b96b496eb6 100644 --- a/paddle/pten/infermeta/CMakeLists.txt +++ b/paddle/pten/infermeta/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(infermeta SRCS nullary.cc unary.cc binary.cc multiary.cc DEPS convert_utils) +cc_library(infermeta SRCS nullary.cc unary.cc binary.cc multiary.cc DEPS convert_utils infermeta_utils) cc_library(backward_infermeta SRCS backward.cc DEPS convert_utils) diff --git a/paddle/pten/infermeta/binary.cc b/paddle/pten/infermeta/binary.cc index 083fb0fca21881bcfbf078d31fb23687d07864f2..cb605db78d962e2deff1295686c5e95945f02531 100644 --- a/paddle/pten/infermeta/binary.cc +++ b/paddle/pten/infermeta/binary.cc @@ -131,8 +131,13 @@ DenseTensorMeta MatmulInferMeta(const DenseTensorMeta& x_meta, } DenseTensorMeta ElementwiseInferMeta(const DenseTensorMeta& x_meta, - const DenseTensorMeta& y_meta, - int axis) { + const DenseTensorMeta& y_meta) { + return ElementwiseRawInferMeta(x_meta, y_meta, -1); +} + +DenseTensorMeta ElementwiseRawInferMeta(const DenseTensorMeta& x_meta, + const DenseTensorMeta& y_meta, + int axis) { DenseTensorMeta return_meta(x_meta.dtype, x_meta.dims, x_meta.layout); if (x_meta.dims != y_meta.dims) { auto x_dims = x_meta.dims; diff --git a/paddle/pten/infermeta/binary.h b/paddle/pten/infermeta/binary.h index c86fc12a20abef6db422b93c1aa258e008688e0c..658211e48ac0a44c57d83ce63154e481a90ce69c 100644 --- a/paddle/pten/infermeta/binary.h +++ b/paddle/pten/infermeta/binary.h @@ -42,6 +42,10 @@ DenseTensorMeta MatmulInferMeta(const DenseTensorMeta& x_meta, bool trans_y); DenseTensorMeta ElementwiseInferMeta(const DenseTensorMeta& x_meta, - const DenseTensorMeta& y_meta, - int axis); + const DenseTensorMeta& y_meta); + +DenseTensorMeta ElementwiseRawInferMeta(const DenseTensorMeta& x_meta, + const DenseTensorMeta& y_meta, + int axis); + } // namespace pten diff --git a/paddle/pten/infermeta/unary.cc b/paddle/pten/infermeta/unary.cc index 27e1dc9511df231ba3c81f9a1ece7dbaafdb2450..fec50d528dfc42f357c006ef895549465a02f3e7 100644 --- a/paddle/pten/infermeta/unary.cc +++ b/paddle/pten/infermeta/unary.cc @@ -12,12 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -// See Note [ Why still include the fluid headers? 
] #include "paddle/pten/infermeta/unary.h" + #include + +#include "paddle/pten/core/infermeta_utils.h" + namespace pten { +void UnchangedInferMetaNew(MetaConfig config, + const MetaTensor& x, + MetaTensor* out) { + out->set_dims(x.dims()); + out->share_lod(x); +} + DenseTensorMeta UnchangedInferMeta(const DenseTensorMeta& x_meta) { return x_meta; } @@ -232,6 +241,16 @@ DenseTensorMeta ReshapeInferMeta(const DenseTensorMeta& x_meta, return InferMetaFromVecValue(x_meta, shape.GetData()); } +/* Why not use ReduceInferMeta directly? + Because we need to make the InferMeta function's args follow the design of + api.yaml. +*/ +DenseTensorMeta SumInferMeta(const DenseTensorMeta& x_meta, + const std::vector<int64_t>& axis, + DataType dtype, + bool keep_dim) { + return ReduceInferMeta(x_meta, axis, keep_dim, dtype); +} + DenseTensorMeta ReduceInferMeta(const DenseTensorMeta& x_meta, const std::vector<int64_t>& axis, bool keep_dim, diff --git a/paddle/pten/infermeta/unary.h b/paddle/pten/infermeta/unary.h index ae42cbd5dd2c6d764bd10660834f24aa002baeab..670c70de84ccfdd01288ad5ad02b0b0ce5226c24 100644 --- a/paddle/pten/infermeta/unary.h +++ b/paddle/pten/infermeta/unary.h @@ -16,23 +16,27 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? ] #include "paddle/pten/common/scalar_array.h" +#include "paddle/pten/core/infermeta_utils.h" +#include "paddle/pten/core/meta_tensor.h" #include "paddle/pten/core/tensor_meta.h" namespace pten { +class MetaConfig; + // Common InferMeta functions for unary operators; the format is like: // -// 1. DenseTensorMeta [OpName]InferMeta(const DenseTensorMeta& x_meta, ...) -// {} -// 2. std::pair [OpName]InferMeta(const -// DenseTensorMeta& -// x_meta, ...) {} -// 3. std::tuple -// [OpName]InferMeta(const -// DenseTensorMeta& x_meta, ...) -// NOTE: The name "InferMeta" may be not appropriate. "InferMeta" may be good. -// Because functions in this file -// not only can infer shape, but alse need infer lod or other useful data. +// void [OpName]InferMeta(const MetaTensor& x, ..., MetaTensor* out) {} +// +// NOTE: The name "InferShape" may not be appropriate; "InferMeta" may be +// better, because the functions in this file not only infer shape but also +// need to infer the lod or other useful metadata.
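To make the new format concrete, here is a hypothetical InferMeta function in the void [OpName]InferMeta(..., MetaTensor* out) style described above. The function name is illustrative; only dims()/set_dims() on MetaTensor are assumed (matching UnchangedInferMetaNew), and make_ddim is assumed to be reachable under paddle::framework as elsewhere in this patch.

// Hypothetical example: infer the output meta of an op that flattens x to
// rank 1. Only metadata flows through; no allocation happens here.
void FlattenTo1DInferMeta(MetaConfig config,
                          const MetaTensor& x,
                          MetaTensor* out) {
  int64_t numel = 1;
  for (int i = 0; i < x.dims().size(); ++i) {
    numel *= x.dims()[i];
  }
  out->set_dims(paddle::framework::make_ddim({numel}));
}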
+ +// TODO(chenweihang): update all InferMeta function formats in the next PR; +// for now, UnchangedInferMetaNew is added to test the new format. +void UnchangedInferMetaNew(MetaConfig config, + const MetaTensor& x, + MetaTensor* out); DenseTensorMeta UnchangedInferMeta(const DenseTensorMeta& x_meta); @@ -58,4 +62,9 @@ DenseTensorMeta ReduceInferMeta(const DenseTensorMeta& x_meta, const std::vector<int64_t>& axis, bool keep_dim, DataType dtype = DataType::UNDEFINED); + +DenseTensorMeta SumInferMeta(const DenseTensorMeta& x_meta, + const std::vector<int64_t>& axis, + DataType dtype, + bool keep_dim); } // namespace pten diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt index 999f72a7e6b657613067cd9311a774b6d3a69b8d..615b80be592a081c044e458e46b52a3cb866c369 100644 --- a/paddle/pten/kernels/CMakeLists.txt +++ b/paddle/pten/kernels/CMakeLists.txt @@ -9,7 +9,7 @@ add_subdirectory(funcs) set_property(GLOBAL PROPERTY PTEN_KERNELS "") set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} pten_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) diff --git a/paddle/pten/kernels/cpu/elementwise.h b/paddle/pten/kernels/cpu/elementwise.h index 6bfde977ce51789d1d62736338f1098a8d4783a7..179a1881189222e18f2dde14c35c14caadc831f4 100644 --- a/paddle/pten/kernels/cpu/elementwise.h +++ b/paddle/pten/kernels/cpu/elementwise.h @@ -706,4 +706,94 @@ void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx, } } +// NOTE(dzhwinter): Only used in elementwise_add and elementwise_sub. +// The explicit gradient can cut X, Y and Out off from the gradient op. +// In elementwise_add and elementwise_sub, we use dout as a fake X, Y and Out +// to reuse the elementwise code.
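Spelling out the reasoning behind that NOTE: for elementwise add and sub the local derivatives are constants, so the backward pass never needs the forward values, and dout can be passed wherever X, Y or Out is expected:

\[
z = x + y \;\Rightarrow\; \frac{\partial L}{\partial x} = \frac{\partial L}{\partial z}, \quad \frac{\partial L}{\partial y} = \frac{\partial L}{\partial z}; \qquad z = x - y \;\Rightarrow\; \frac{\partial L}{\partial y} = -\frac{\partial L}{\partial z}.
\]

When an operand was broadcast in the forward pass, its gradient is additionally summed over the broadcast axes, which is the case the ElemwiseGradComputeWithBroadcast branch below handles.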
+template <typename T, typename DX_OP, typename DY_OP> +void ElemwiseExplicitGradCompute(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DX_OP dx_op, + DY_OP dy_op) { + const DDim& x_dim = x.dims(); + const DDim& y_dim = y.dims(); + if (x.dims() == y.dims()) { + pten::funcs::ElemwiseGradComputeNoBroadcast<CPUContext, T, DX_OP, DY_OP>( + dev_ctx, + x_dim, + y_dim, + dout, + dout, + out, + dout, + axis, + dx, + dy, + dx_op, + dy_op); + } else { + ElemwiseGradComputeWithBroadcast<T, DX_OP, DY_OP>(dev_ctx, + x_dim, + y_dim, + dout, + dout, + out, + dout, + axis, + dx, + dy, + dx_op, + dy_op); + } +} + +// Add Grad + +template <typename T> +struct IdentityGrad { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } +}; + +template <typename T> +typename std::enable_if<std::is_floating_point<T>::value>::type +elementwise_add_grad(const CPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + auto blas = paddle::operators::math::GetBlas<CPUContext, T>(ctx); + if (dx) { + blas.VCOPY( + dout.numel(), dout.data<T>(), dx->mutable_data<T>(ctx.GetPlace())); + } + + if (dy) { + blas.VCOPY( + dout.numel(), dout.data<T>(), dy->mutable_data<T>(ctx.GetPlace())); + } +} + +template <typename T> +typename std::enable_if<!std::is_floating_point<T>::value>::type +elementwise_add_grad(const CPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + ElemwiseExplicitGradCompute<T, IdentityGrad<T>, IdentityGrad<T>>( + ctx, x, y, out, dout, axis, dx, dy, IdentityGrad<T>(), IdentityGrad<T>()); +} + } // namespace pten diff --git a/paddle/pten/kernels/cpu/elementwise_grad_kernel.cc b/paddle/pten/kernels/cpu/elementwise_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..4a940c2be15c00e129611a6da7e5d5a6d1545a27 --- /dev/null +++ b/paddle/pten/kernels/cpu/elementwise_grad_kernel.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
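The two elementwise_add_grad overloads above dispatch on std::is_floating_point: for float and double the gradient of add is a plain copy of dout (BLAS VCOPY), while every other type falls back to the generic functor path. A self-contained sketch of the same enable_if pattern, with hypothetical names:

#include <iostream>
#include <type_traits>

// Fast path: chosen when T is float or double, mirroring the VCOPY overload.
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value>::type AddGradPath() {
  std::cout << "floating point: copy dout via BLAS VCOPY\n";
}

// Generic path: chosen for all other types (int, complex, ...), mirroring the
// ElemwiseExplicitGradCompute overload.
template <typename T>
typename std::enable_if<!std::is_floating_point<T>::value>::type AddGradPath() {
  std::cout << "other type: generic ElemwiseExplicitGradCompute path\n";
}

int main() {
  AddGradPath<double>();  // prints the fast-path message
  AddGradPath<int>();     // prints the generic-path message
  return 0;
}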
+ +#include "paddle/pten/kernels/elementwise_grad_kernel.h" + +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/copy_kernel.h" +#include "paddle/pten/kernels/cpu/elementwise.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" +#include "paddle/pten/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace pten { + +template +void AddGradFunc(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { + elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy); + } else { + ElemwiseExplicitGradCompute, IdentityGrad>( + dev_ctx, + x, + y, + out, + dout, + axis, + dx, + dy, + IdentityGrad(), + IdentityGrad()); + } +} + +template +void AddGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + pten::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); +} + +template +void AddDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout) { + pten::AddDoubleGradImpl( + dev_ctx, + y, + ddx, + ddy, + dout, + axis, + ddout, + ElementwiseCompute, T>, + ElementwiseCompute, T>); +} + +template +void AddTripleGradKernel(const Context& dev_ctx, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_ddout, + int axis, + DenseTensor* d_ddx, + DenseTensor* d_ddy) { + pten::AddGradImpl( + dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); +} + +} // namespace pten + +PT_REGISTER_KERNEL(add_grad, + CPU, + ALL_LAYOUT, + pten::AddGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(add_double_grad, + CPU, + ALL_LAYOUT, + pten::AddDoubleGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(add_triple_grad, + CPU, + ALL_LAYOUT, + pten::AddTripleGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/elementwise_grad_kernel.h b/paddle/pten/kernels/elementwise_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..067eebc9e15b95c870a7fac4c03bb52d79fed511 --- /dev/null +++ b/paddle/pten/kernels/elementwise_grad_kernel.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace pten { + +template +void AddGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void AddDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout); + +template +void AddTripleGradKernel(const Context& dev_ctx, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_ddout, + int axis, + DenseTensor* d_ddx, + DenseTensor* d_ddy); + +} // namespace pten diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index 1c18e9f7998adc777c1f267ecf66ba1ad673112b..9ea27fd9c5b8d5f9b9a4d6fb0d6cb608d13f5984 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -14,18 +14,20 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/transform.h" #include "paddle/pten/backends/all_context.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/empty_kernel.h" #if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/function_traits.h" +#include "paddle/pten/kernels/primitive/kernel_primitives.h" -namespace kps = paddle::operators::kernel_primitives; +namespace kps = pten::kps; #endif @@ -360,6 +362,43 @@ inline void get_mid_dims(const DDim &x_dims, } } +// for broadcast backwards +static inline std::vector GetReduceDim(const paddle::framework::DDim &in, + const paddle::framework::DDim &out, + int axis) { + axis = + (axis == -1 ? std::abs(static_cast(out.size() - in.size())) : axis); + std::vector dims; + for (int i = 0; i < axis; ++i) { + dims.push_back(i); + } + for (int i = 0; i < in.size(); ++i) { + if (out[i + axis] != in[i]) { + dims.push_back(i + axis); + } + } + for (int i = axis + in.size(); i < out.size(); ++i) { + dims.push_back(i); + } + return dims; +} + +template +static inline void GetDoubleGradSafeTensor(const DeviceContext &dev_ctx, + const DenseTensor &x, + const DenseTensor *ddx, + DenseTensor *ddx_safe) { + if (ddx) { + *ddx_safe = *ddx; + } else { + auto meta = pten::DenseTensorMeta(x.dtype(), x.dims(), x.layout()); + *ddx_safe = pten::Empty(dev_ctx, std::move(meta)); + ddx_safe->mutable_data(dev_ctx.GetPlace()); + paddle::operators::math::SetConstant set_zero; + set_zero(dev_ctx, ddx_safe, static_cast(0)); + } +} + template mutable_data(dev_ctx.GetPlace())}); } +inline void ElementwiseGradPreProcess(const DenseTensor &dout, + DenseTensor *dx) { + if (dx != nullptr) { + dx->set_lod(dout.lod()); + } +} + #if defined(__NVCC__) || defined(__HIPCC__) template diff --git a/paddle/pten/kernels/funcs/elementwise_functor.h b/paddle/pten/kernels/funcs/elementwise_functor.h index 6d139d68530befe57bc0094eb3d5537cf00e660b..0b279d5325905885d69943629523f130b2411aff 100644 --- a/paddle/pten/kernels/funcs/elementwise_functor.h +++ b/paddle/pten/kernels/funcs/elementwise_functor.h @@ -14,8 +14,8 @@ limitations under the License. 
*/ #pragma once -#include "paddle/fluid/platform/enforce.h" #include "paddle/pten/common/float16.h" +#include "paddle/pten/core/enforce.h" #include "paddle/pten/core/hostdevice.h" namespace pten { diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index f4d8e442fcdebfe76cfa89df82d3132a7a65fae4..9a3ae7f12dfcd62a1a18154971fa99ab72c5561d 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -14,9 +14,11 @@ limitations under the License. */ #pragma once +#include "paddle/pten/kernels/copy_kernel.h" #include "paddle/pten/kernels/funcs/common_shape.h" #include "paddle/pten/kernels/funcs/cuda_kernel_config.h" #include "paddle/pten/kernels/funcs/elementwise_base.h" +#include "paddle/pten/kernels/gpu/reduce.h" #ifdef __HIPCC__ constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; @@ -578,6 +580,20 @@ void LaunchElementwiseCudaKernel(const KPDevice &ctx, } } +template +void ElementwiseCompute(const GPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + int axis, + Functor func, + DenseTensor *z) { + std::vector ins = {&x, &y}; + std::vector outs = {z}; + z->mutable_data(dev_ctx.GetPlace()); + pten::LaunchElementwiseCudaKernel( + dev_ctx, ins, &outs, axis, func); +} + // BACKWARD CODE // Suppose only has contiguous dims @@ -1938,4 +1954,130 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, } } +template +static __global__ void SimpleElemwiseAddGradCUDAKernel( + const T *__restrict__ dout, int size, int vec_size, T *dx, T *dy) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + int loop = size / vec_size; + int remainder = size % vec_size; + const float4 *dout_vec = reinterpret_cast(dout); + float4 *dx_vec = reinterpret_cast(dx); + float4 *dy_vec = reinterpret_cast(dy); + float4 tmp_loop; + + for (int i = tid; i < loop; i += stride) { + tmp_loop = dout_vec[i]; + dx_vec[i] = tmp_loop; + dy_vec[i] = tmp_loop; + } + + if (tid == loop && remainder != 0) { + T tmp_rem; + while (remainder) { + int idx = size - remainder; + remainder--; + tmp_rem = dout[idx]; + dx[idx] = tmp_rem; + dy[idx] = tmp_rem; + } + } +} + +template +void default_elementwise_add_grad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int axis = -1) { + auto *dout_data = dout.data(); + + // dx + if (dx != nullptr) { + auto *dx_data = dx->mutable_data(ctx.GetPlace()); + if (dx->dims() == dout.dims()) { + if (dx_data != dout_data) { + pten::Copy(ctx, dout, false, dx); + } + } else { + // For inplace strategy, dx will be stored in addr of dout, which makes + // the result of dy wrong. 
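// Worked example (values are illustrative) of the funcs::GetReduceDim call
// used just below: with x.dims() = [3, 1], out.dims() = [2, 3, 4] and
// axis = -1, the leading offset becomes |3 - 2| = 1, so dim 0 is reduced
// because x lacks it, and dim 2 is reduced because x's size-1 dim was
// broadcast from 1 to 4. The result is reduce_dims = {0, 2}, i.e. the
// gradient is dx = sum(dout, dims = {0, 2}).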
+ if (dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x.dims(), ctx.GetPlace()); + } + std::vector reduce_dims = + funcs::GetReduceDim(x.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceFunctorImpl>( + dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + } + } + // dy + if (dy != nullptr) { + auto *dy_data = dy->mutable_data(ctx.GetPlace()); + if (dy->dims() == dout.dims()) { + if (dy_data != dout_data) { + pten::Copy(ctx, dout, false, dy); + } + } else { + std::vector reduce_dims = + funcs::GetReduceDim(y.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceFunctorImpl>( + dout, dy, kps::IdentityFunctor(), reduce_dims, stream); + } + } +} + +template +void elementwise_add_grad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy) { + auto *dx_data = dx->mutable_data(ctx.GetPlace()); + auto *dy_data = dy->mutable_data(ctx.GetPlace()); + auto *dout_data = dout.data(); + if (dx_data == dout_data && dy_data != dout_data) { + VLOG(4) << "Special case when dx_data is the same as dout_data, " + "only need copy dout to dy"; + pten::Copy(ctx, dout, false, dy); + } else if (dx_data != dout_data && dy_data == dout_data) { + VLOG(4) << "Special case when dy_data is the same as dout_data, " + "only need copy dout to dx"; + pten::Copy(ctx, dout, false, dx); + } else if (dx_data != dout_data && dy_data != dout_data) { + auto size = x.numel(); + int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); + dim3 grid_size = + dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) / + PREDEFINED_BLOCK_SIZE, + 1); + SimpleElemwiseAddGradCUDAKernel< + T><<>>( + dout.data(), + size, + vec_size, + dx->mutable_data(ctx.GetPlace()), + dy->mutable_data(ctx.GetPlace())); + } else { + VLOG(4) << "Special case when dy_data is the same as dout_data, " + "and dx_data is the same as dout_data, do not need " + "any operator"; + } +} + } // namespace pten diff --git a/paddle/pten/kernels/gpu/elementwise_grad_kernel.cu b/paddle/pten/kernels/gpu/elementwise_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..76af94f42fde2a10033169015e2acb7fed3c46a7 --- /dev/null +++ b/paddle/pten/kernels/gpu/elementwise_grad_kernel.cu @@ -0,0 +1,121 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
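To make the vectorized launch arithmetic in elementwise_add_grad above concrete, a small host-side sketch; it assumes sizeof(float4) == 16 and PREDEFINED_BLOCK_SIZE == 512 as used by the kernel, and the numbers are illustrative:

#include <algorithm>
#include <cstdio>

int main() {
  const long long size = 10000;  // dout.numel()
  // vec_size = max(sizeof(float4) / sizeof(T), 1): four floats per float4.
  const int vec_size = std::max(static_cast<int>(16 / sizeof(float)), 1);
  const int block = 512;  // PREDEFINED_BLOCK_SIZE
  // Each thread handles vec_size elements; round up twice for the grid.
  const long long grid =
      ((size + vec_size - 1) / vec_size + block - 1) / block;
  std::printf("loop=%lld remainder=%lld grid=%lld\n",
              size / vec_size, size % vec_size, grid);
  // Prints: loop=2500 remainder=0 grid=5
  return 0;
}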
+ +#include "paddle/pten/kernels/elementwise_grad_kernel.h" + +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/copy_kernel.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" +#include "paddle/pten/kernels/gpu/elementwise.h" +#include "paddle/pten/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace pten { + +template +void AddGradFunc(const GPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { + elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy); + } else { + default_elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy, axis); + } +} + +template +void AddGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + pten::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); +} + +template +void AddDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout) { + pten::AddDoubleGradImpl( + dev_ctx, + y, + ddx, + ddy, + dout, + axis, + ddout, + ElementwiseCompute, T>, + ElementwiseCompute, T>); +} + +template +void AddTripleGradKernel(const Context& dev_ctx, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_ddout, + int axis, + DenseTensor* d_ddx, + DenseTensor* d_ddy) { + pten::AddGradImpl( + dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); +} + +} // namespace pten + +PT_REGISTER_KERNEL(add_grad, + GPU, + ALL_LAYOUT, + pten::AddGradKernel, + float, + double, + int, + int64_t, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(add_double_grad, + GPU, + ALL_LAYOUT, + pten::AddDoubleGradKernel, + float, + double, + int, + int64_t, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(add_triple_grad, + GPU, + ALL_LAYOUT, + pten::AddTripleGradKernel, + float, + double, + int, + int64_t, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index d06dc1c43f6d41bc988283b2cfa9be072f4a69c8..996d85d3f42a7996e481a4887a0d0f4fa2587893 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -27,10 +27,10 @@ limitations under the License. 
*/ namespace cub = hipcub; #endif -#include "paddle/fluid/platform/enforce.h" #include "paddle/pten/common/complex.h" #include "paddle/pten/common/float16.h" #include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/enforce.h" #include "paddle/pten/core/kernel_registry.h" namespace pten { diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index 26f17bc00507e8bc401a50942ce951710b120d64..d864c76ea197408e4d035c816a32d5bb5ccb71c1 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -34,13 +34,13 @@ namespace cub = hipcub; #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/fast_divmod.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/pten/core/array.h" +#include "paddle/pten/core/enforce.h" +#include "paddle/pten/kernels/primitive/kernel_primitives.h" #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/backends/gpu/gpu_context.h" @@ -51,7 +51,7 @@ namespace cub = hipcub; #define REDUCE_SPLIT_BOUNDARY 512 #define REDUCE_VEC_SIZE 4 -namespace kps = paddle::operators::kernel_primitives; +namespace kps = pten::kps; namespace pten { namespace kernels { @@ -94,7 +94,7 @@ static inline void CheckReduceRank(int reduce_rank, int rank) { if (rank % 2 == 0) { PADDLE_ENFORCE_EQ(reduce_rank, rank / 2, - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "ReduceOp: invalid reduce rank. When rank = %d, " "reduce_rank must be %d, but got %d.", rank, @@ -106,7 +106,7 @@ static inline void CheckReduceRank(int reduce_rank, int rank) { PADDLE_ENFORCE_EQ( reduce_rank == lower_rank || reduce_rank == upper_rank, true, - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "ReduceOp: invalid reduce rank. When rank = %d, reduce_rank " "must be %d or %d, but got %d.", rank, @@ -122,7 +122,7 @@ static inline pten::framework::Array VectorToArray( const VectorLikeType& vec) { PADDLE_ENFORCE_LE(vec.size(), ElementCount, - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "Cub reduce Array: size not match. Received " "vec.size() %d > ElementCount %d.", vec.size(), @@ -149,7 +149,7 @@ static inline std::vector GetReduceDim(const std::vector& dims, for (auto e : dims) { PADDLE_ENFORCE_LT(e, dim_size, - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "ReduceOp: invalid axis, when x_dims is %d, " "axis[i] should less than x_dims, but got %d.", dim_size, @@ -1057,7 +1057,7 @@ static int reduce_num, const paddle::platform::Place& place, gpuStream_t stream) { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( + PADDLE_THROW(pten::errors::InvalidArgument( "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); } diff --git a/paddle/pten/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/pten/kernels/impl/elementwise_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..a74c9c0b6be10449a9f589d6437d828d65283096 --- /dev/null +++ b/paddle/pten/kernels/impl/elementwise_grad_kernel_impl.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" + +namespace pten { + +template +void AddGradImpl(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad, + GradFunc grad_func) { + pten::funcs::ElementwiseGradPreProcess(out_grad, x_grad); + auto* out = &out_grad; + // Special case when y_grad is not needed and x_grad doesn't reduce + if (x_grad != nullptr && y_grad == nullptr && + x_grad->dims() == out_grad.dims()) { + VLOG(4) << "Special case when y_grad is not needed and x_grad doesn't " + "reduce"; + pten::Copy(dev_ctx, out_grad, false, x_grad); + } else if (x_grad == nullptr && y_grad != nullptr && + y_grad->dims() == out_grad.dims()) { + VLOG(4) << "Special case when x_grad is not needed and y_grad doesn't " + "reduce"; + pten::Copy(dev_ctx, out_grad, false, y_grad); + } else { + grad_func(dev_ctx, x, y, *out, out_grad, x_grad, y_grad, axis); + } +} + +template +void AddDoubleGradImpl(const Context& dev_ctx, + const DenseTensor& y, + const paddle::optional& ddx, + const paddle::optional& ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout, + GradFunc grad_func, + GradInverseFunc grad_inverse_func) { + // ddOut = ddx + ddy + if (ddout) { + DenseTensor ddx_safe, ddy_safe; + funcs::GetDoubleGradSafeTensor( + dev_ctx, dout, ddx.get_ptr(), &ddx_safe); + funcs::GetDoubleGradSafeTensor( + dev_ctx, y, ddy.get_ptr(), &ddy_safe); + + ddout->mutable_data(dev_ctx.GetPlace()); + auto ddx_dims = ddx_safe.dims(); + auto ddy_dims = ddy_safe.dims(); + if (ddx_dims.size() >= ddy_dims.size()) { + grad_func( + dev_ctx, ddx_safe, ddy_safe, axis, funcs::AddFunctor(), ddout); + } else { + grad_inverse_func(dev_ctx, + ddx_safe, + ddy_safe, + axis, + funcs::InverseAddFunctor(), + ddout); + } + } +} + +} // namespace pten diff --git a/paddle/pten/kernels/math_kernel.cc b/paddle/pten/kernels/math_kernel.cc index 423282ab97ca44966e1e8722aafc6c6703a9094c..29a2b48fa7c4f12558c47dc1d6d87c758f0c492e 100644 --- a/paddle/pten/kernels/math_kernel.cc +++ b/paddle/pten/kernels/math_kernel.cc @@ -33,8 +33,8 @@ template void SumKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& dims, - bool keep_dim, DataType out_dtype, + bool keep_dim, DenseTensor* out) { bool reduce_all = false; SumRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out); diff --git a/paddle/pten/kernels/math_kernel.h b/paddle/pten/kernels/math_kernel.h index 95379baaf350434d48fff4ce2a7b1988f7f041d5..afef58669312ca3d051b161caa39c1ca5d26bf9b 100644 --- a/paddle/pten/kernels/math_kernel.h +++ b/paddle/pten/kernels/math_kernel.h @@ -50,8 +50,8 @@ template void SumKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& dims, - bool keep_dim, DataType out_dtype, + bool keep_dim, DenseTensor* out); template @@ -110,7 +110,7 @@ template 
DenseTensor Add(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1); + auto out_meta = ElementwiseRawInferMeta(x.meta(), y.meta(), -1); auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); AddKernel(dev_ctx, x, y, &dense_out); return dense_out; @@ -120,7 +120,7 @@ template DenseTensor Subtract(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1); + auto out_meta = ElementwiseRawInferMeta(x.meta(), y.meta(), -1); auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); SubtractKernel(dev_ctx, x, y, &dense_out); return dense_out; @@ -130,7 +130,7 @@ template DenseTensor Divide(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1); + auto out_meta = ElementwiseRawInferMeta(x.meta(), y.meta(), -1); auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); DivideKernel(dev_ctx, x, y, &dense_out); return dense_out; @@ -140,7 +140,7 @@ template DenseTensor Multiply(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1); + auto out_meta = ElementwiseRawInferMeta(x.meta(), y.meta(), -1); auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); MultiplyKernel(dev_ctx, x, y, &dense_out); return dense_out; @@ -163,10 +163,10 @@ DenseTensor Sum(const Context& dev_ctx, const std::vector& axis, DataType dtype, bool keep_dim) { - auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim, dtype); + auto out_meta = SumInferMeta(x.meta(), axis, dtype, keep_dim); auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); - SumKernel(dev_ctx, x, axis, keep_dim, dtype, &dense_out); + SumKernel(dev_ctx, x, axis, dtype, keep_dim, &dense_out); return dense_out; } diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives.h b/paddle/pten/kernels/primitive/compute_primitives.h similarity index 87% rename from paddle/fluid/operators/kernel_primitives/compute_primitives.h rename to paddle/pten/kernels/primitive/compute_primitives.h index 2320b9e0b2fbf47610365155558b869bd5d77b38..ac812c9c9f3eb3d8d97ef595ca3d1bdff3177e41 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives.h +++ b/paddle/pten/kernels/primitive/compute_primitives.h @@ -22,11 +22,10 @@ #endif #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/float16.h" -namespace paddle { -namespace operators { -namespace kernel_primitives { +namespace pten { +namespace kps { namespace details { #ifdef __HIPCC__ @@ -48,7 +47,7 @@ class MPTypeTrait { }; template <> -class MPTypeTrait { +class MPTypeTrait { public: using Type = float; }; @@ -158,9 +157,14 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { * in: The register pointer of in, the size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, +__device__ __forceinline__ void ElementwiseUnary(OutT* out, + const InT* in, OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX * NY; idx++) { @@ -193,9 +197,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, * in2: The register pointer of second input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). 
*/ -template -__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, +__device__ __forceinline__ void ElementwiseBinary(OutT* out, + const InT* in1, const InT* in2, OpFunc compute) { #pragma unroll @@ -231,12 +240,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, * in3: The register pointer of third input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, - const InT* in2, - const InT* in3, - OpFunc compute) { +__device__ __forceinline__ void ElementwiseTernary( + OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX * NY; ++idx) { out[idx] = static_cast(compute(in1[idx], in2[idx], in3[idx])); @@ -268,9 +279,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, * ins: A pointers of array consisting of multiple inputs. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], +__device__ __forceinline__ void ElementwiseAny(OutT* out, + InT (*ins)[NX * NY], OpFunc compute) { InT args[Arity]; #pragma unroll @@ -309,10 +326,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], * in2: The register pointer of second input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, - const InT* in2, OpFunc compute) { +__device__ __forceinline__ void CycleBinary(OutT* out, + const InT* in1, + const InT* in2, + OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX; idx++) { #pragma unroll @@ -350,9 +373,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, * reducer: Compute function which was declared like ReduceFunctor(). * reduce_last_dim: if the last dim gets involved in reduction. */ -template -__device__ __forceinline__ void Reduce(T* out, const T* in, +__device__ __forceinline__ void Reduce(T* out, + const T* in, ReduceFunctor reducer, bool reduce_last_dim) { int block_index = blockDim.y; @@ -386,6 +414,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in, } } -} // namespace kernel_primitives -} // namespace operators -} // namespace paddle +} // namespace kps +} // namespace pten diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h b/paddle/pten/kernels/primitive/compute_primitives_xpu2.h similarity index 85% rename from paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h rename to paddle/pten/kernels/primitive/compute_primitives_xpu2.h index 32355915809161ae1a4dcc275eba8a28966fb92e..d7282c089fc9cc332abc132941188c7804e68f80 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h +++ b/paddle/pten/kernels/primitive/compute_primitives_xpu2.h @@ -13,13 +13,13 @@ // limitations under the License. 
#pragma once +#include "paddle/pten/common/float16.h" #include "xpu/kernel/cluster_header.h" #include "xpu/kernel/debug.h" #include "xpu/kernel/math.h" -namespace paddle { -namespace operators { -namespace kernel_primitives { +namespace pten { +namespace kps { namespace details { // kGlobalMode: block reduce, each block gets an output; @@ -33,7 +33,7 @@ class MPTypeTrait { }; template <> -class MPTypeTrait { +class MPTypeTrait { public: using Type = float; }; @@ -102,9 +102,14 @@ __device__ void BlockXReduce(T* data, OpFunc reducer) { * in: The register pointer of in, the size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, +__device__ __forceinline__ void ElementwiseUnary(OutT* out, + const InT* in, OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX * NY; idx++) { @@ -137,9 +142,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, * in2: The register pointer of second input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, +__device__ __forceinline__ void ElementwiseBinary(OutT* out, + const InT* in1, const InT* in2, OpFunc compute) { #pragma unroll @@ -175,12 +185,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, * in3: The register pointer of third input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, - const InT* in2, - const InT* in3, - OpFunc compute) { +__device__ __forceinline__ void ElementwiseTernary( + OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX * NY; ++idx) { out[idx] = static_cast(compute(in1[idx], in2[idx], in3[idx])); @@ -212,9 +224,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, * ins: A pointers of array consisting of multiple inputs. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], +__device__ __forceinline__ void ElementwiseAny(OutT* out, + InT (*ins)[NX * NY], OpFunc compute) { __local__ InT args[Arity]; #pragma unroll @@ -253,10 +271,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], * in2: The register pointer of second input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, - const InT* in2, OpFunc compute) { +__device__ __forceinline__ void CycleBinary(OutT* out, + const InT* in1, + const InT* in2, + OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX; idx++) { #pragma unroll @@ -294,9 +318,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, * reducer: Compute function which was declared like ReduceFunctor(). * reduce_last_dim: if the last dim gets involved in reduction. 
*/ -template -__device__ __forceinline__ void Reduce(T* out, const T* in, +__device__ __forceinline__ void Reduce(T* out, + const T* in, ReduceFunctor reducer, bool reduce_last_dim) { if (Mode == kGlobalMode) { @@ -319,6 +348,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in, } } -} // namespace kernel_primitives -} // namespace operators -} // namespace paddle +} // namespace kps +} // namespace pten diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h b/paddle/pten/kernels/primitive/datamover_primitives.h similarity index 87% rename from paddle/fluid/operators/kernel_primitives/datamover_primitives.h rename to paddle/pten/kernels/primitive/datamover_primitives.h index 45697073cbf85b436a4db33b0a2d49d8b805fd63..2a8006f3ecbc427c3e0cf36a08457c2ecd5f84df 100644 --- a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h +++ b/paddle/pten/kernels/primitive/datamover_primitives.h @@ -22,9 +22,8 @@ #endif #include "paddle/pten/core/ddim.h" -namespace paddle { -namespace operators { -namespace kernel_primitives { +namespace pten { +namespace kps { namespace details { #define INT_BITS 32 @@ -103,11 +102,12 @@ struct BroadcastConfig { strides_in.resize(dim_size, 1); for (int i = 0; i < dim_size; ++i) { strides_in[i] = in_dims[i] == 1 ? 0 : strides_in[i]; - strides_in[i] = - (i != 0 && strides_in[i] != 0) - ? std::accumulate(in_dims.begin(), in_dims.begin() + i, 1, - std::multiplies()) - : strides_in[i]; + strides_in[i] = (i != 0 && strides_in[i] != 0) + ? std::accumulate(in_dims.begin(), + in_dims.begin() + i, + 1, + std::multiplies()) + : strides_in[i]; } memcpy(strides, strides_in.data(), kDims * sizeof(uint32_t)); @@ -144,11 +144,18 @@ struct BroadcastConfig { * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template -__device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, - int size_nx, int size_ny, - int stride_nx, int stride_ny) { +__device__ __forceinline__ void ReadData(Ty* dst, + const Tx* __restrict__ src, + int size_nx, + int size_ny, + int stride_nx, + int stride_ny) { int thread_offset = threadIdx.x; int left_size_nx = size_nx - thread_offset; @@ -244,7 +251,8 @@ __device__ __forceinline__ void Init(T* dst, T init_data) { * size: The current block needs to load size data continuously. */ template -__device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, +__device__ __forceinline__ void ReadData(T* dst, + const T* __restrict__ src, int num) { if (IsBoundary) { // blockDim.x * NX > num int thread_offset = threadIdx.x * NX; @@ -299,11 +307,19 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template __device__ __forceinline__ void ReadDataBc( - T* dst, const T* __restrict__ src, uint32_t block_offset, - details::BroadcastConfig config, int total_num_output, int stride_nx, + T* dst, + const T* __restrict__ src, + uint32_t block_offset, + details::BroadcastConfig config, + int total_num_output, + int stride_nx, int stride_ny) { uint32_t thread_offset = block_offset + threadIdx.x; uint32_t index_src = 0; @@ -361,12 +377,25 @@ __device__ __forceinline__ void ReadDataBc( * reduce_last_dim: Used to indicate whether the dimension of reduce contains * the lowest dimension. 
*/ -template -__device__ __forceinline__ void ReadDataReduce( - Ty* dst, const Tx* __restrict__ src, int block_offset, - const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx, - int stride_ny, Functor func, bool reduce_last_dim) { +template +__device__ __forceinline__ void ReadDataReduce(Ty* dst, + const Tx* __restrict__ src, + int block_offset, + const IndexCal& index_cal, + int size_nx, + int size_ny, + int stride_nx, + int stride_ny, + Functor func, + bool reduce_last_dim) { int thread_offset = 0; int left_idx = 0; if (reduce_last_dim) { @@ -430,7 +459,8 @@ __device__ __forceinline__ void ReadDataReduce( * size: The current block needs to load size elements continuously. */ template -__device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, +__device__ __forceinline__ void WriteData(T* dst, + T* __restrict__ src, int num) { if (IsBoundary) { int thread_offset = threadIdx.x * NX; @@ -483,11 +513,18 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template -__device__ __forceinline__ void WriteData(Ty* dst, const Tx* __restrict__ src, - int size_nx, int size_ny, - int stride_nx, int stride_ny) { +__device__ __forceinline__ void WriteData(Ty* dst, + const Tx* __restrict__ src, + int size_nx, + int size_ny, + int stride_nx, + int stride_ny) { int thread_offset = threadIdx.x; int left_size_nx = size_nx - thread_offset; @@ -589,11 +626,18 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) { * coordinate mapping relationship between output data and input data. * total_num_output: Total number of original output. */ -template __device__ __forceinline__ void ReadDataBc( - T* dst, const T* __restrict__ src, uint32_t block_offset, - details::BroadcastConfig config, int total_num_output) { + T* dst, + const T* __restrict__ src, + uint32_t block_offset, + details::BroadcastConfig config, + int total_num_output) { uint32_t thread_offset = block_offset + threadIdx.x * NX; uint32_t index_src = 0; @@ -616,6 +660,5 @@ __device__ __forceinline__ void ReadDataBc( } } -} // namespace kernel_primitives -} // namespace operators -} // namespace paddle +} // namespace kps +} // namespace pten diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h b/paddle/pten/kernels/primitive/datamover_primitives_xpu2.h similarity index 90% rename from paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h rename to paddle/pten/kernels/primitive/datamover_primitives_xpu2.h index 333899535894e0939086817c9fd6caad992f807f..d6586368c804126f896ba476cc1679d54a4c6eb8 100644 --- a/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h +++ b/paddle/pten/kernels/primitive/datamover_primitives_xpu2.h @@ -17,9 +17,8 @@ #include "xpu/kernel/debug.h" #include "xpu/kernel/math.h" -namespace paddle { -namespace operators { -namespace kernel_primitives { +namespace pten { +namespace kps { namespace details { template @@ -105,10 +104,17 @@ struct BroadcastConfig { * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. 
*/ -template -__device__ __inline__ void ReadData(Ty* dst, const Tx _global_ptr_* src, - int size_nx, int size_ny, int stride_nx, +__device__ __inline__ void ReadData(Ty* dst, + const Tx _global_ptr_* src, + int size_nx, + int size_ny, + int stride_nx, int stride_ny) { int thread_offset = core_id(); int left_size_nx = size_nx - thread_offset; @@ -205,7 +211,8 @@ __device__ __inline__ void Init(T* dst, T init_data) { * size: The current block needs to load size data continuously. */ template -__device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src, +__device__ __inline__ void ReadData(T* dst, + const T _global_ptr_* src, int num) { int thread_offset = core_id() * NX; __local__ T in_temp[1]; @@ -247,12 +254,18 @@ __device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src, * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template -__device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, +__device__ __inline__ void ReadDataBc(T* dst, + const T _global_ptr_* src, uint32_t block_offset, details::BroadcastConfig config, - int total_num_output, int stride_nx, + int total_num_output, + int stride_nx, int stride_ny) { uint32_t thread_offset = block_offset + core_id(); uint32_t index_src = 0; @@ -307,13 +320,21 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, * reduce_last_dim: Used to indicate whether the dimension of reduce contains * the lowest dimension. */ -template -__device__ __inline__ void ReadDataReduce(T* dst, const T _global_ptr_* src, +template +__device__ __inline__ void ReadDataReduce(T* dst, + const T _global_ptr_* src, int block_offset, const IndexCal& index_cal, - int size_nx, int size_ny, - int stride_nx, int stride_ny, + int size_nx, + int size_ny, + int stride_nx, + int stride_ny, bool reduce_last_dim) { __local__ Tx in_temp[1]; int thread_offset = 0; @@ -423,10 +444,17 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) { * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template -__device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src, - int size_nx, int size_ny, int stride_nx, +__device__ __inline__ void WriteData(Ty _global_ptr_* dst, + const Tx* src, + int size_nx, + int size_ny, + int stride_nx, int stride_ny) { int thread_offset = core_id(); int left_size_nx = size_nx - thread_offset; @@ -483,7 +511,8 @@ __device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src, } } in_temp[0] = static_cast(src[idx + idy * NX]); - LM2GM(in_temp, dst + thread_offset + idx * stride_nx + idy * stride_ny, + LM2GM(in_temp, + dst + thread_offset + idx * stride_nx + idy * stride_ny, sizeof(Ty)); } } @@ -537,9 +566,14 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) { * coordinate mapping relationship between output data and input data. * total_num_output: Total number of original output. 
*/ -template -__device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, +__device__ __inline__ void ReadDataBc(T* dst, + const T _global_ptr_* src, uint32_t block_offset, details::BroadcastConfig config, int total_num_output) { @@ -562,6 +596,5 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, } } -} // namespace kernel_primitives -} // namespace operators -} // namespace paddle +} // namespace kps +} // namespace pten diff --git a/paddle/pten/kernels/primitive/functor_primitives.h b/paddle/pten/kernels/primitive/functor_primitives.h new file mode 100644 index 0000000000000000000000000000000000000000..8d62d622701342e058a57ff31d12410e78eb1306 --- /dev/null +++ b/paddle/pten/kernels/primitive/functor_primitives.h @@ -0,0 +1,255 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/pten/common/float16.h" +#include "paddle/pten/core/enforce.h" +#include "paddle/pten/kernels/funcs/eigen/extensions.h" + +namespace pten { +namespace kps { +namespace details { + +static __device__ __forceinline__ pten::dtype::float16 Exp( + pten::dtype::float16 x) { + return ::Eigen::numext::exp(x); +} + +static __device__ __forceinline__ float Exp(float x) { return expf(x); } + +static __device__ __forceinline__ double Exp(double x) { return exp(x); } + +static __device__ __forceinline__ pten::dtype::float16 Log( + pten::dtype::float16 x) { + return ::Eigen::numext::log(x); +} + +static __device__ __forceinline__ float Log(float x) { return logf(x); } + +static __device__ __forceinline__ double Log(double x) { return log(x); } + +} // namespace details + +/******************************** Unary Functor *******************************/ + +/** + * @brief Default unary exp functor + */ +template +struct ExpFunctor { + HOSTDEVICE inline ExpFunctor() {} + + HOSTDEVICE explicit inline ExpFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx x) const { + return static_cast(details::Exp(x)); + } +}; + +/** + * @brief Default unary identity functor + */ +template +struct IdentityFunctor { + HOSTDEVICE inline IdentityFunctor() {} + + HOSTDEVICE explicit inline IdentityFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx x) const { + return static_cast(x); + } +}; + +/** + * @brief Default unary div functor. 
Divide by a constant
+ */
+template <typename Tx, typename Ty = Tx>
+struct DivideFunctor {
+ private:
+  using MPType = typename ::paddle::operators::details::MPTypeTrait<Tx>::Type;
+
+ public:
+  HOSTDEVICE inline DivideFunctor() { n_inv = static_cast<MPType>(1.0f); }
+
+  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((MPType)(1.0 / n)) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx x) const {
+    return static_cast<Ty>(static_cast<MPType>(x) * n_inv);
+  }
+
+ private:
+  MPType n_inv;
+};
+
+/**
+ * @brief Default unary inverse functor (returns the negation, -x)
+ */
+template <typename Tx, typename Ty = Tx>
+struct InverseFunctor {
+  HOSTDEVICE inline InverseFunctor() {}
+
+  HOSTDEVICE explicit inline InverseFunctor(int n) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx x) const {
+    return static_cast<Ty>(-x);
+  }
+};
+
+/**
+ * @brief Default unary square functor
+ */
+template <typename Tx, typename Ty = Tx>
+struct SquareFunctor {
+  HOSTDEVICE inline SquareFunctor() {}
+
+  HOSTDEVICE explicit inline SquareFunctor(int n) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx x) const {
+    return static_cast<Ty>(x) * static_cast<Ty>(x);
+  }
+};
+
+/****************************** Binary Functor ********************************/
+
+/**
+ * @brief Default binary min functor
+ */
+template <typename T>
+struct MinFunctor {
+  inline T initial() { return static_cast<T>(std::numeric_limits<T>::max()); }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return (b < a) ? b : a;
+  }
+};
+
+/**
+ * @brief Default binary max functor
+ */
+template <typename T>
+struct MaxFunctor {
+  inline T initial() {
+    return static_cast<T>(std::numeric_limits<T>::lowest());
+  }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return (b > a) ? b : a;
+  }
+};
+
+/**
+ * @brief Default binary add functor
+ */
+template <typename T>
+struct AddFunctor {
+  inline T initial() { return static_cast<T>(0.0f); }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return b + a;
+  }
+};
+
+/**
+ * @brief Default binary mul functor
+ */
+template <typename T>
+struct MulFunctor {
+  inline T initial() { return static_cast<T>(1.0f); }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return b * a;
+  }
+};
+
+/**
+ * @brief Default binary logic or functor
+ */
+template <typename T>
+struct LogicalOrFunctor {
+  inline T initial() { return static_cast<T>(false); }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return b || a;
+  }
+};
+
+/**
+ * @brief Default binary logic and functor
+ */
+template <typename T>
+struct LogicalAndFunctor {
+  inline T initial() { return static_cast<T>(true); }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return b && a;
+  }
+};
+
+/**
+ * @brief Default binary sub functor
+ */
+template <typename T>
+struct SubFunctor {
+  inline T initial() { return static_cast<T>(0.0f); }
+
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return a - b; }
+};
+
+/**
+ * @brief Default binary div functor
+ */
+template <typename T, typename Enable = void>
+struct DivFunctor {
+  inline T initial() { return static_cast<T>(1.0f); }
+
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
+};
+
+template <typename T>
+struct DivFunctor<T,
+                  typename std::enable_if<std::is_integral<T>::value>::type> {
+  inline T initial() { return static_cast<T>(1.0f); }
+
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
+    // For int32/int64, need to check whether the division is zero.
+    PADDLE_ENFORCE_NE(b,
+                      0,
+                      pten::errors::InvalidArgument(
+                          "Integer division by zero encountered "
+                          "in (floor) divide. 
Please check the input value.")); + return a / b; + } +}; + +/** + * @brief Default binary floor divide functor + */ +template +struct FloorDivFunctor { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T a, const T b) const { + PADDLE_ENFORCE_NE(b, + 0, + pten::errors::InvalidArgument( + "Integer division by zero encountered " + "in (floor) divide. Please check the input value.")); + return static_cast(std::trunc(a / b)); + } +}; + +} // namespace kps +} // namespace pten diff --git a/paddle/fluid/operators/kernel_primitives/helper_primitives.h b/paddle/pten/kernels/primitive/helper_primitives.h similarity index 73% rename from paddle/fluid/operators/kernel_primitives/helper_primitives.h rename to paddle/pten/kernels/primitive/helper_primitives.h index 48ac1509d1f6e8cd3c6ecf06ac0f3445dac39a51..26d431d46abae651e854820b0c7b43afadf148b6 100644 --- a/paddle/fluid/operators/kernel_primitives/helper_primitives.h +++ b/paddle/pten/kernels/primitive/helper_primitives.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,9 +14,8 @@ #pragma once -namespace paddle { -namespace operators { -namespace kernel_primitives { +namespace pten { +namespace kps { #ifdef PADDLE_WITH_XPU2 struct dim3 { @@ -43,8 +42,12 @@ struct DimConfig { int rem_y; int rem_z; - HOSTDEVICE explicit inline DimConfig(int split_x, int split_y, int split_z, - int size_x, int size_y, int size_z) { + HOSTDEVICE explicit inline DimConfig(int split_x, + int split_y, + int split_z, + int size_x, + int size_y, + int size_z) { split_num_x = split_x; split_num_y = split_y; split_num_z = split_z; @@ -60,6 +63,5 @@ struct DimConfig { } }; -} // namespace kernel_primitives -} // namespace operators -} // namespace paddle +} // namespace kps +} // namespace pten diff --git a/paddle/pten/kernels/primitive/kernel_primitives.h b/paddle/pten/kernels/primitive/kernel_primitives.h new file mode 100644 index 0000000000000000000000000000000000000000..6067fa59d57ba6f400500805bff7aea80f17926d --- /dev/null +++ b/paddle/pten/kernels/primitive/kernel_primitives.h @@ -0,0 +1,69 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
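The binary functors in functor_primitives.h above all follow one contract: initial() supplies the reduction's identity element and operator() folds two values. The sketch below illustrates that contract on the host side; since the real operator()s are __device__-qualified, HostAddFunctor, HostMaxFunctor, and SequentialReduce are standalone stand-ins written for this note, not part of the patch.

#include <iostream>
#include <limits>
#include <vector>

// Host-side stand-ins for kps::AddFunctor / kps::MaxFunctor: each pairs
// the reduction's identity (initial) with a binary fold (operator()).
template <typename T>
struct HostAddFunctor {
  T initial() const { return static_cast<T>(0); }
  T operator()(const T a, const T b) const { return b + a; }
};

template <typename T>
struct HostMaxFunctor {
  T initial() const { return std::numeric_limits<T>::lowest(); }
  T operator()(const T a, const T b) const { return (b > a) ? b : a; }
};

// Folds a buffer with any functor exposing the initial()/operator() pair.
template <typename T, typename Functor>
T SequentialReduce(const std::vector<T>& data, Functor reducer) {
  T acc = reducer.initial();
  for (const T& v : data) acc = reducer(acc, v);
  return acc;
}

int main() {
  const std::vector<float> xs = {1.5f, -2.0f, 3.25f};
  std::cout << SequentialReduce(xs, HostAddFunctor<float>()) << "\n";  // 2.75
  std::cout << SequentialReduce(xs, HostMaxFunctor<float>()) << "\n";  // 3.25
  return 0;
}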
+ +#pragma once +#include "paddle/pten/kernels/primitive/helper_primitives.h" +#ifdef PADDLE_WITH_XPU2 +#include "paddle/pten/backends/xpu/xpu_context.h" +#include "paddle/pten/kernels/primitive/compute_primitives_xpu2.h" +#include "paddle/pten/kernels/primitive/datamover_primitives_xpu2.h" +#include "paddle/pten/kernels/primitive/functor_primitives_xpu2.h" + +#define KPStream XPUStream +#define KPDevice pten::XPUContext +#define _ptr_ _global_ptr_ +#define __forceinline__ __inline__ +#define __restrict__ + +#define THREAD_ID_X core_id() +#define THREAD_ID_Y 0 +#define THREAD_ID_Z 0 + +#define BLOCK_NUM_X core_num() +#define BLOCK_NUM_Y 0 +#define BLOCK_NUM_Z 0 + +#define BLOCK_ID_X cluster_id() +#define BLOCK_ID_Y 0 +#define BLOCK_ID_Z 0 + +#define GRID_NUM_X cluster_num() +#define GRID_NUM_Y 0 +#define GRID_NUM_Z 0 +#else +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/kernels/primitive/compute_primitives.h" +#include "paddle/pten/kernels/primitive/datamover_primitives.h" +#include "paddle/pten/kernels/primitive/functor_primitives.h" + +#define KPStream gpuStream_t +#define KPDevice pten::GPUContext +#define _ptr_ + +#define THREAD_ID_X threadIdx.x +#define THREAD_ID_Y threadIdx.y +#define THREAD_ID_Z threadIdx.z + +#define BLOCK_NUM_X blockDim.x +#define BLOCK_NUM_Y blockDim.y +#define BLOCK_NUM_Z blockDim.z + +#define BLOCK_ID_X blockIdx.x +#define BLOCK_ID_Y blockIdx.y +#define BLOCK_ID_Z blockIdx.z + +#define GRID_NUM_X gridDim.x +#define GRID_NUM_Y gridDim.y +#define GRID_NUM_Z gridDim.z +#endif diff --git a/paddle/pten/kernels/reshape_kernel.cc b/paddle/pten/kernels/reshape_kernel.cc index 9bfad22374c9f0c840634a16bfff45849e8ef60a..4b706e9e685b47af20dd23ab0855db6116623c46 100644 --- a/paddle/pten/kernels/reshape_kernel.cc +++ b/paddle/pten/kernels/reshape_kernel.cc @@ -31,9 +31,8 @@ void ReshapeKernel(const Context& dev_ctx, out->ResizeAndAllocate(out_meta.dims); return; } - - out->Resize(x.dims()); - out->mutable_data(x.place()); + out->set_meta(out_meta); + out->mutable_data(dev_ctx.GetPlace()); pten::Copy(dev_ctx, x, false, out); out->Resize(out_meta.dims); out->ResetLoD(x.lod()); diff --git a/paddle/pten/kernels/xpu/scale_kernel.cc b/paddle/pten/kernels/xpu/scale_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..116cd63f876207b39bc9b523b9f9e70876cc1b98 --- /dev/null +++ b/paddle/pten/kernels/xpu/scale_kernel.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
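An aside on the macro layer that kernel_primitives.h above establishes: kernel bodies are written only against THREAD_ID_X, BLOCK_NUM_X, and friends, and the header maps that vocabulary onto CUDA builtins or XPU's core_id()/core_num(). The standalone host-side sketch below shows the resulting coding pattern; the macro values are stand-ins chosen for illustration, not the real definitions.

#include <cstdio>

// Stand-ins for the backend-mapped macros; on GPU these would expand to
// threadIdx.x / blockDim.x, on XPU2 to core_id() / core_num().
#define THREAD_ID_X 0
#define BLOCK_NUM_X 4

// A strided loop written purely against the macro vocabulary, which is
// what lets one kernel body serve both backends.
void StridedFill(float* dst, float value, int n) {
  for (int i = THREAD_ID_X; i < n; i += BLOCK_NUM_X) {
    dst[i] = value;
  }
}

int main() {
  float buf[8] = {0};
  StridedFill(buf, 1.0f, 8);  // "thread" 0 touches indices 0 and 4
  for (float v : buf) std::printf("%g ", v);  // 1 0 0 0 1 0 0 0
  std::printf("\n");
  return 0;
}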
+ +#include "paddle/pten/kernels/scale_kernel.h" + +#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/pten/backends/xpu/xpu_context.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/common/float16.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/kernel_registry.h" + +namespace pten { + +template +void ScaleKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + DenseTensor* out) { + out->mutable_data(dev_ctx.GetPlace()); + + PADDLE_ENFORCE_EQ(x.dims(), + out->dims(), + paddle::platform::errors::InvalidArgument( + "In and out should have the same dim," + " expected %s, but got %s.", + x.dims().to_str().c_str(), + out->dims().to_str().c_str())); + using XPUType = typename XPUTypeTrait::Type; + int r = xpu::scale(dev_ctx.x_context(), + reinterpret_cast(x.data()), + reinterpret_cast(out->data()), + x.numel(), + bias_after_scale, + scale.to(), + bias); + PADDLE_ENFORCE_EQ( + r, + XPU_SUCCESS, + paddle::platform::errors::External( + "XPU scale kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); +} + +} // namespace pten + +PT_REGISTER_KERNEL(scale, + XPU, + ALL_LAYOUT, + pten::ScaleKernel, + float, + pten::dtype::float16, + int64_t) {} diff --git a/paddle/pten/ops/CMakeLists.txt b/paddle/pten/ops/CMakeLists.txt index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..910b62766ebf6e3dac9d9218b60622b6352a5e44 100644 --- a/paddle/pten/ops/CMakeLists.txt +++ b/paddle/pten/ops/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(compat) diff --git a/paddle/pten/ops/compat/CMakeLists.txt b/paddle/pten/ops/compat/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd214087e1a68a624fc198a6d86596b4ff8e4ff3 --- /dev/null +++ b/paddle/pten/ops/compat/CMakeLists.txt @@ -0,0 +1,11 @@ +set(op_utils_header ${PADDLE_BINARY_DIR}/paddle/pten/ops/compat/signatures.h.tmp CACHE INTERNAL "op_args_fns.cc file") +set(op_utils_header_final ${PADDLE_BINARY_DIR}/paddle/pten/ops/compat/signatures.h) +file(WRITE ${op_utils_header} "// Generated by the paddle/pten/ops/compat/CMakeLists.txt. DO NOT EDIT!\n\n") +file(APPEND ${op_utils_header} "#include \"paddle/pten/core/compat/op_utils.h\"\n\n") + +# Automatically generate the registration code of all arg map functions +# and compile the corresponding target to avoid frequent code conflicts +# when writing to same file +register_op_utils(op_compat_infos DEPS op_utils) + +copy_if_different(${op_utils_header} ${op_utils_header_final}) diff --git a/paddle/pten/ops/compat/scale_args_fn.h b/paddle/pten/ops/compat/scale_sig.cc similarity index 72% rename from paddle/pten/ops/compat/scale_args_fn.h rename to paddle/pten/ops/compat/scale_sig.cc index 91f0db389d9d5094e6f6d3cf978c4c35590d1d2e..5ce159a5d84c9faba760cd7b8605f2bd0734c53f 100644 --- a/paddle/pten/ops/compat/scale_args_fn.h +++ b/paddle/pten/ops/compat/scale_sig.cc @@ -12,9 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
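For reference, the bias_after_scale flag that ScaleKernel above forwards to xpu::scale only selects where the bias enters the computation. A scalar host-side sketch of the same arithmetic (illustrative only; the kernel delegates the real work to xpu::scale):

#include <cassert>

// Scalar reference for ScaleKernel's two bias placements.
float ScaleRef(float x, float scale, float bias, bool bias_after_scale) {
  // bias_after_scale == true:  out = x * scale + bias
  // bias_after_scale == false: out = (x + bias) * scale
  return bias_after_scale ? x * scale + bias : (x + bias) * scale;
}

int main() {
  assert(ScaleRef(2.0f, 3.0f, 1.0f, true) == 7.0f);   // 2*3 + 1
  assert(ScaleRef(2.0f, 3.0f, 1.0f, false) == 9.0f);  // (2+1)*3
  return 0;
}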
*/ -#pragma once - -#include "paddle/pten/core/compat/arg_map_context.h" +#include "paddle/pten/core/compat/op_utils.h" namespace pten { @@ -22,15 +20,18 @@ KernelSignature ScaleOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorInput("X")) { std::string scale_attr; if (ctx.HasInput("ScaleTensor")) { - scale_attr = "ScaleTensor"; + return KernelSignature( + "scale", {"X"}, {"ScaleTensor", "bias", "bias_after_scale"}, {"Out"}); } else { - scale_attr = "scale"; + return KernelSignature( + "scale", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"}); } - return KernelSignature( - "scale", {"X"}, {scale_attr, "bias", "bias_after_scale"}, {"Out"}); } // TODO(chenweihang): support other cases after selected rows added return KernelSignature("scale.unregistered", {}, {}, {}); } } // namespace pten + +// op_type, api_name, arg_mapping_fn +PT_REGISTER_ARG_MAPPING_FN(scale, pten::ScaleOpArgumentMapping); diff --git a/paddle/pten/tests/core/CMakeLists.txt b/paddle/pten/tests/core/CMakeLists.txt index 43e1480e2c41e3e0a5cc2a57597a83d306e709ed..27a0173ef6f1fc8654fdbe4ef7b585f3ec3d7651 100644 --- a/paddle/pten/tests/core/CMakeLists.txt +++ b/paddle/pten/tests/core/CMakeLists.txt @@ -3,6 +3,7 @@ cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) cc_test(test_convert_utils SRCS test_convert_utils.cc DEPS convert_utils) cc_test(test_kernel_factory SRCS test_kernel_factory.cc DEPS kernel_factory scale_kernel) +cc_test(test_op_utils SRCS test_op_utils.cc DEPS op_compat_infos) cc_test(test_pten_device_context SRCS test_device_context.cc DEPS pten_context cpu_context) cc_test(test_ddim SRCS test_ddim.cc DEPS ddim) diff --git a/paddle/pten/tests/core/test_op_utils.cc b/paddle/pten/tests/core/test_op_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c4a418685775a01af0076d26b4878d3eb91462e --- /dev/null +++ b/paddle/pten/tests/core/test_op_utils.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
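PT_REGISTER_ARG_MAPPING_FN above records ScaleOpArgumentMapping under the op name "scale", and the new test fetches it back through pten::OpUtilsMap::Instance().GetArgumentMappingFn("scale"). The real registry lives in paddle/pten/core/compat/op_utils.h and is not shown in this patch; the standalone mimic below only illustrates the singleton name-to-function lookup pattern, and every name in it is hypothetical.

#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

// Hypothetical stand-in for what an arg mapping fn produces.
using ArgMappingFn = std::function<std::string()>;

// Singleton name -> function map, mimicking OpUtilsMap's lookup shape.
class MiniOpUtilsMap {
 public:
  static MiniOpUtilsMap& Instance() {
    static MiniOpUtilsMap instance;
    return instance;
  }
  void Insert(const std::string& op, ArgMappingFn fn) {
    map_[op] = std::move(fn);
  }
  ArgMappingFn Get(const std::string& op) const {
    auto it = map_.find(op);
    return it == map_.end() ? nullptr : it->second;
  }

 private:
  std::unordered_map<std::string, ArgMappingFn> map_;
};

int main() {
  // Registration, normally done by a PT_REGISTER_ARG_MAPPING_FN-style macro.
  MiniOpUtilsMap::Instance().Insert("scale", [] {
    return std::string("scale(X, scale, bias, bias_after_scale) -> Out");
  });
  // Lookup by op name, which is what the new test asserts is non-null.
  ArgMappingFn fn = MiniOpUtilsMap::Instance().Get("scale");
  std::cout << (fn ? fn() : "unregistered") << "\n";
  return 0;
}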
*/ + +#include + +#include "gtest/gtest.h" +#include "paddle/pten/core/compat/op_utils.h" +#include "paddle/pten/ops/compat/signatures.h" + +namespace pten { +namespace tests { + +TEST(OpUtilsMap, ArgMappingFnExists) { + std::cout << "enter ArgMappingFnExists"; + auto scale_arg_mapping_fn = + pten::OpUtilsMap::Instance().GetArgumentMappingFn("scale"); + EXPECT_NE(scale_arg_mapping_fn, nullptr); +} + +} // namespace tests +} // namespace pten diff --git a/paddle/pten/tests/core/test_selected_rows.cc b/paddle/pten/tests/core/test_selected_rows.cc index 81c7ff4a838a702a1c32df2fb4a7f082b1b39f3b..c6e52ff64eab90c71ffc698c04e6e0b58cd1f6d4 100644 --- a/paddle/pten/tests/core/test_selected_rows.cc +++ b/paddle/pten/tests/core/test_selected_rows.cc @@ -40,7 +40,7 @@ class SelectedRowsTester : public ::testing::Test { protected: pten::CPUPlace place_; - std::unique_ptr selected_rows_{nullptr}; + std::unique_ptr selected_rows_{nullptr}; }; TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); } diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b4803e35385403badc27f416b2c7112411fd8c7 --- /dev/null +++ b/paddle/utils/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(string) diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/utils/string/CMakeLists.txt similarity index 69% rename from paddle/fluid/string/CMakeLists.txt rename to paddle/utils/string/CMakeLists.txt index 9667e18bc6a1e34fee6e039a710dd1bd8b24481e..db3cb542ba3748e3bc936394a136e2ad8aaf327e 100644 --- a/paddle/fluid/string/CMakeLists.txt +++ b/paddle/utils/string/CMakeLists.txt @@ -1,8 +1,8 @@ cc_library(stringpiece SRCS piece.cc DEPS flags) cc_library(pretty_log SRCS pretty_log.cc DEPS flags) cc_library(string_helper SRCS string_helper.cc DEPS flags) -cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) -cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) +cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece gflags) +cc_test(stringprintf_test SRCS printf_test.cc DEPS gflags) cc_test(to_string_test SRCS to_string_test.cc) cc_test(split_test SRCS split_test.cc) cc_test(string_helper_test SRCS string_helper_test.cc DEPS string_helper) diff --git a/paddle/fluid/string/piece.cc b/paddle/utils/string/piece.cc similarity index 94% rename from paddle/fluid/string/piece.cc rename to paddle/utils/string/piece.cc index 971ee3ddb5ff0347e3f365c3eeb9fe9fea96e573..305ac85a5320ead7cfd784863927ad2d0913a07d 100644 --- a/paddle/fluid/string/piece.cc +++ b/paddle/utils/string/piece.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/string/piece.h" +#include "paddle/utils/string/piece.h" #include #include @@ -76,9 +76,11 @@ bool HasPrefix(Piece s, Piece x) { } bool HasSuffix(Piece s, Piece x) { - return !x.len() ? true : ((s.len() >= x.len()) && - (memcmp(s.data() + (s.len() - x.len()), x.data(), - x.len()) == 0)); + return !x.len() + ? true + : ((s.len() >= x.len()) && + (memcmp(s.data() + (s.len() - x.len()), x.data(), x.len()) == + 0)); } Piece SkipPrefix(Piece s, size_t n) { diff --git a/paddle/utils/string/piece.h b/paddle/utils/string/piece.h new file mode 100644 index 0000000000000000000000000000000000000000..8dda484eaac4d62b758e57ac5e81bfe68a5c60d4 --- /dev/null +++ b/paddle/utils/string/piece.h @@ -0,0 +1,105 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <ostream>
+#include <string>
+
+namespace paddle {
+namespace string {
+
+// Piece points into a std::string object but doesn't own the
+// string. It is for efficient access to strings. Like Go's string
+// type. Note that Piece doesn't mutate the underlying string,
+// so it is thread-safe given that the underlying string doesn't
+// change. Because Piece holds only a couple of data members and
+// doesn't own/manage the string, it is cheap to construct Pieces
+// and pass them around.
+class Piece {
+ public:
+  static const size_t npos = static_cast<size_t>(-1);
+
+  // We provide non-explicit singleton constructors so users can
+  // pass in a "const char*" or a "string" wherever a "Piece"
+  // is expected. These constructors ensure that if data_ is NULL,
+  // size_ is 0.
+  Piece();
+  Piece(const char* d, size_t n);
+  Piece(const char* d);         // NOLINT: accept C string into Piece.
+  Piece(const std::string& s);  // NOLINT: accept C++ string into Piece.
+
+  const char* data() const { return data_; }
+  size_t len() const { return size_; }
+
+  char operator[](size_t n) const;
+
+  // Piece doesn't own the string, so both iterator and const
+  // iterator are const char* indeed.
+  typedef const char* const_iterator;
+  typedef const char* iterator;
+  iterator begin() const { return data_; }
+  iterator end() const { return data_ + size_; }
+
+  // Return a string that contains the copy of the referenced data.
+  std::string ToString() const { return std::string(data_, size_); }
+
+ private:
+  const char* data_;
+  size_t size_;
+
+  // Intentionally copyable
+};
+
+int Compare(Piece a, Piece b);
+
+bool operator==(Piece x, Piece y);
+bool operator!=(Piece x, Piece y);
+bool operator<(Piece x, Piece y);
+bool operator>(Piece x, Piece y);
+bool operator<=(Piece x, Piece y);
+bool operator>=(Piece x, Piece y);
+
+bool HasPrefix(Piece s, Piece prefix);
+bool HasSuffix(Piece s, Piece suffix);
+
+Piece SkipPrefix(Piece s, size_t n);
+Piece SkipSuffix(Piece s, size_t n);
+
+// Skip the prefix (or suffix) if it matches with the string.
+Piece TrimPrefix(Piece s, Piece prefix);
+Piece TrimSuffix(Piece s, Piece suffix);
+
+// Returns whether s contains sub. Every s, except an empty s,
+// contains an empty sub.
+bool Contains(Piece s, Piece sub);
+
+// Return the first occurrence of sub in s, or npos. If both s and
+// sub are empty, it returns npos; otherwise, if only sub is empty, it
+// returns 0.
+size_t Index(Piece s, Piece sub);
+
+// Return the first occurrence of c in s[pos:end], or npos.
+size_t Find(Piece s, char c, size_t pos);
+
+// Search range is [0..pos] inclusive. If pos == npos, search everything.
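A brief usage sketch for the Piece API declared in this header, assuming the stringpiece library is linked; illustrative only, not part of the header.

#include <iostream>

#include "paddle/utils/string/piece.h"

int main() {
  // Construction is cheap: Piece only stores a pointer and a length.
  paddle::string::Piece path("paddle/utils/string");
  if (paddle::string::HasPrefix(path, "paddle/")) {
    paddle::string::Piece rest = paddle::string::TrimPrefix(path, "paddle/");
    std::cout << rest.ToString() << "\n";  // utils/string
  }
  return 0;
}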
+size_t RFind(Piece s, char c, size_t pos);
+
+Piece SubStr(Piece s, size_t pos, size_t n);
+
+// allow Piece to be logged
+std::ostream& operator<<(std::ostream& o, Piece piece);
+
+} // namespace string
+} // namespace paddle
diff --git a/paddle/fluid/string/piece_test.cc b/paddle/utils/string/piece_test.cc
similarity index 99%
rename from paddle/fluid/string/piece_test.cc
rename to paddle/utils/string/piece_test.cc
index 544b5985ed21432488200768a28a3bae69f00a7f..27b189e251f8ad368895419ff9cf854a6d929893 100644
--- a/paddle/fluid/string/piece_test.cc
+++ b/paddle/utils/string/piece_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/string/piece.h"
+#include "paddle/utils/string/piece.h"
 
 #include "gtest/gtest.h"
diff --git a/paddle/fluid/string/pretty_log.cc b/paddle/utils/string/pretty_log.cc
similarity index 94%
rename from paddle/fluid/string/pretty_log.cc
rename to paddle/utils/string/pretty_log.cc
index c0715e644fb3302bde53564be3bf63e4e3f4657c..b014c6de20d855c16432aa8b1e898b5d12ae3a3d 100644
--- a/paddle/fluid/string/pretty_log.cc
+++ b/paddle/utils/string/pretty_log.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/string/pretty_log.h"
+#include "paddle/utils/string/pretty_log.h"
 
 #include "gflags/gflags.h"
 
 DEFINE_bool(color, true, "Whether to turn on pretty log");
diff --git a/paddle/utils/string/pretty_log.h b/paddle/utils/string/pretty_log.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a8038f3a8bef073c53a1b59c6fc2ed565913c85
--- /dev/null
+++ b/paddle/utils/string/pretty_log.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <chrono>
+#include <iostream>
+#include <string>
+#include <utility>
+#include "gflags/gflags.h"
+
+#include "paddle/utils/string/printf.h"
+
+DECLARE_bool(color);
+
+namespace paddle {
+
+namespace string {
+
+inline std::string black() { return FLAGS_color ? "\e[30m" : ""; }
+inline std::string red() { return FLAGS_color ? "\e[31m" : ""; }
+inline std::string b_red() { return FLAGS_color ? "\e[41m" : ""; }
+inline std::string green() { return FLAGS_color ? "\e[32m" : ""; }
+inline std::string yellow() { return FLAGS_color ? "\e[33m" : ""; }
+inline std::string blue() { return FLAGS_color ? "\e[34m" : ""; }
+inline std::string purple() { return FLAGS_color ? "\e[35m" : ""; }
+inline std::string cyan() { return FLAGS_color ? "\e[36m" : ""; }
+inline std::string light_gray() { return FLAGS_color ? "\e[37m" : ""; }
+inline std::string white() { return FLAGS_color ? "\e[37m" : ""; }
+inline std::string light_red() { return FLAGS_color ? "\e[91m" : ""; }
+inline std::string dim() { return FLAGS_color ? "\e[2m" : ""; }
+inline std::string bold() { return FLAGS_color ? "\e[1m" : ""; }
+inline std::string underline() { return FLAGS_color ? "\e[4m" : ""; }
+inline std::string blink() { return FLAGS_color ? "\e[5m" : ""; }
+inline std::string reset() { return FLAGS_color ? "\e[0m" : ""; }
+
+using TextBlock = std::pair<std::string, std::string>;
+
+struct Style {
+  static std::string info() { return black(); }
+  static std::string warn() { return b_red(); }
+  static std::string suc() { return green(); }
+  static std::string H1() { return bold() + purple(); }
+  static std::string H2() { return green(); }
+  static std::string H3() { return green(); }
+  static std::string detail() { return light_gray(); }
+};
+
+template <typename... Args>
+static void PrettyLogEndl(const std::string &style,
+                          const char *fmt,
+                          const Args &... args) {
+  std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl;
+}
+template <typename... Args>
+static void PrettyLog(const std::string &style,
+                      const char *fmt,
+                      const Args &... args) {
+  std::cerr << style << Sprintf(fmt, args...) << reset();
+}
+
+template <typename... Args>
+static void PrettyLogInfo(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::info(), fmt, args...);
+}
+template <typename... Args>
+static void PrettyLogDetail(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::detail(), fmt, args...);
+}
+template <typename... Args>
+static void PrettyLogH1(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::H1(), fmt, args...);
+}
+template <typename... Args>
+static void PrettyLogH2(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::H2(), fmt, args...);
+}
+
+} // namespace string
+} // namespace paddle
diff --git a/paddle/utils/string/printf.h b/paddle/utils/string/printf.h
new file mode 100644
index 0000000000000000000000000000000000000000..f4576c6bc4aa543a25604638c2047eeaeb179a74
--- /dev/null
+++ b/paddle/utils/string/printf.h
@@ -0,0 +1,124 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Compared with std::stringstream, there are two primary purposes of
+// string::Printf:
+//
+// 1. Type-safe printing, with why and how explained in
+//    http://www.drdobbs.com/stringprintf-a-typesafe-printf-family-fo/184401999.
+//    Implementation includes
+//
+//    https://github.com/c42f/tinyformat
+//    boost::format
+//    std::stringstream
+//
+//    std::stringstream is not convenient enough in many cases. For example:
+//
+//       std::cout << std::setprecision(2) << std::fixed << 1.23456 << "\n";
+//
+//    boost::format is the most convenient one. We can have
+//
+//       std::cout << format("%2% %1%") % 36 % 77;
+//
+//    or
+//
+//       format fmter("%2% %1%");
+//       fmter % 36; fmter % 77;
+//       std::cout << fmter.c_str();
+//
+//    But the overloading of % might be overkill and it would be
+//    more efficient if it can write to std::cout directly.
+//
+//    tinyformat has an interface compatible with the C-printf style,
+//    and it can write to a stream or return a std::string:
+//
+//       std::cout << tfm::printf(
+//           "%s, %s %d, %.2d:%.2d\n",
+//           weekday, month, day, hour, min);
+//
+//    or
+//
+//       tfm::format(std::cout,
+//                   "%s, %s %d, %.2d:%.2d\n",
+//                   weekday, month, day, hour, min);
+//
+// 2. High-performance -- most printed strings are not too long and
+//    don't need dynamic memory allocation. Many StringPrintf
+//    implementations don't enforce type safety, but are
+//    high-performance, including
+//
+//    https://developers.google.com/optimization/reference/base/stringprintf/
+//    https://github.com/adobe/chromium/blob/master/base/stringprintf.h
+//    https://github.com/google/protobuf/blob/master/src/google/protobuf/stubs/stringprintf.h
+//
+// According to
+// https://github.com/c42f/tinyformat#compile-time-and-code-bloat,
+// boost::format runs too slow and results in large executable binary
+// files. So here we port tinyformat.
+
+#pragma once
+
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+#include "paddle/utils/string/tinyformat/tinyformat.h"  // https://github.com/c42f/tinyformat
+
+namespace paddle {
+namespace string {
+
+template <typename... Args>
+void Fprintf(std::ostream& out, const char* fmt, const Args&... args) {
+  tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...));
+}
+
+inline std::string Sprintf() { return ""; }
+
+template <typename... Args>
+std::string Sprintf(const Args&... args) {
+  std::ostringstream oss;
+  Fprintf(oss, "%s", args...);
+  return oss.str();
+}
+
+template <typename... Args>
+std::string Sprintf(const char* fmt, const Args&... args) {
+  std::ostringstream oss;
+  Fprintf(oss, fmt, args...);
+  return oss.str();
+}
+
+template <typename... Args>
+void Printf(const char* fmt, const Args&... args) {
+  Fprintf(std::cout, fmt, args...);
+}
+
+inline std::string HumanReadableSize(double f_size) {
+  size_t i = 0;
+  double orig = f_size;
+  const std::vector<std::string> units(
+      {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"});
+  while (f_size >= 1024) {
+    f_size /= 1024;
+    i++;
+  }
+  if (i >= units.size()) {
+    return Sprintf("%fB", orig);
+  }
+  return Sprintf("%f%s", f_size, units[i]);
+}
+
+} // namespace string
+} // namespace paddle
diff --git a/paddle/fluid/string/printf_test.cc b/paddle/utils/string/printf_test.cc
similarity index 84%
rename from paddle/fluid/string/printf_test.cc
rename to paddle/utils/string/printf_test.cc
index 544b12ef3a877a6e84c136433799301edaa4abdf..9da7bfedb72c6e3aca38921dd751824d205f7084 100644
--- a/paddle/fluid/string/printf_test.cc
+++ b/paddle/utils/string/printf_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/string/printf.h"
+#include "paddle/utils/string/printf.h"
 
 #include <string>
 
@@ -25,7 +25,7 @@ TEST(StringPrintf, StringPrintf) {
   int hour = 14;
   int min = 44;
   EXPECT_EQ(std::string("Wednesday, July 27, 14:44"),
-            paddle::string::Sprintf("%s, %s %d, %.2d:%.2d", weekday, month, day,
-                                    hour, min));
+            paddle::string::Sprintf(
+                "%s, %s %d, %.2d:%.2d", weekday, month, day, hour, min));
   EXPECT_EQ(std::string(""), paddle::string::Sprintf());
 }
diff --git a/paddle/utils/string/split.h b/paddle/utils/string/split.h
new file mode 100644
index 0000000000000000000000000000000000000000..ccb96b8a9cb68f03acbca592a2149ba5001f34d2
--- /dev/null
+++ b/paddle/utils/string/split.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +namespace paddle { +namespace string { + +static inline std::vector Split(std::string const& original, + char separator) { + std::vector results; + std::string token; + std::istringstream is(original); + while (std::getline(is, token, separator)) { + if (!token.empty()) { + results.push_back(token); + } + } + return results; +} + +} // namespace string +} // namespace paddle diff --git a/paddle/fluid/string/split_test.cc b/paddle/utils/string/split_test.cc similarity index 95% rename from paddle/fluid/string/split_test.cc rename to paddle/utils/string/split_test.cc index c85dc1eed40dbe25d922c0f4810a747d1bd2d60f..dcb69955c86580f6d21eea1783041768073a7c29 100644 --- a/paddle/fluid/string/split_test.cc +++ b/paddle/utils/string/split_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/string/split.h" +#include "paddle/utils/string/split.h" #include diff --git a/paddle/fluid/string/string_helper.cc b/paddle/utils/string/string_helper.cc similarity index 95% rename from paddle/fluid/string/string_helper.cc rename to paddle/utils/string/string_helper.cc index db9ee7592fc84237e760d94a3ebb3eff328a8309..37b9e9ce4e513cd160ddf9f67889247741663731 100644 --- a/paddle/fluid/string/string_helper.cc +++ b/paddle/utils/string/string_helper.cc @@ -12,15 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/string/string_helper.h" +#include "paddle/utils/string/string_helper.h" #include #include #include #include -#include "glog/logging.h" - namespace paddle { namespace string { @@ -75,7 +73,9 @@ char* LineFileReader::getdelim(FILE* f, char delim) { return _buffer; } else { _length = 0; - CHECK(feof(f)); + int code = feof(f); + (void)code; + assert(code); return NULL; } #else diff --git a/paddle/utils/string/string_helper.h b/paddle/utils/string/string_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..a02b313ef0eba61682188f65d3d6a03d432dc7fb --- /dev/null +++ b/paddle/utils/string/string_helper.h @@ -0,0 +1,236 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace paddle { +namespace string { + +inline size_t count_spaces(const char* s) { + size_t count = 0; + + while (*s != 0 && isspace(*s++)) { + count++; + } + + return count; +} + +inline size_t count_nonspaces(const char* s) { + size_t count = 0; + + while (*s != 0 && !isspace(*s++)) { + count++; + } + + return count; +} + +template +void format_string_append(std::string& str, // NOLINT + const char* fmt, // NOLINT + ARGS&&... 
args) {
+  int len = snprintf(NULL, 0, fmt, args...);
+  assert(len >= 0);
+  size_t oldlen = str.length();
+  str.resize(oldlen + len + 1);
+  int new_len =
+      snprintf(&str[oldlen], (size_t)len + 1, fmt, args...);  // NOLINT
+  (void)new_len;
+  assert(new_len == len);
+  str.resize(oldlen + len);
+}
+
+template <class... ARGS>
+void format_string_append(std::string& str,        // NOLINT
+                          const std::string& fmt,  // NOLINT
+                          ARGS&&... args) {
+  format_string_append(str, fmt.c_str(), args...);
+}
+
+template <class... ARGS>
+std::string format_string(const char* fmt, ARGS&&... args) {
+  std::string str;
+  format_string_append(str, fmt, args...);
+  return str;
+}
+
+template <class... ARGS>
+std::string format_string(const std::string& fmt, ARGS&&... args) {
+  return format_string(fmt.c_str(), args...);
+}
+
+// remove leading and trailing spaces
+std::string trim_spaces(const std::string& str);
+
+// erase all spaces in str
+std::string erase_spaces(const std::string& str);
+
+inline int str_to_float(const char* str, float* v) {
+  const char* head = str;
+  char* cursor = NULL;
+  int index = 0;
+  while (*(head += count_spaces(head)) != 0) {
+    v[index++] = std::strtof(head, &cursor);
+    if (head == cursor) {
+      break;
+    }
+    head = cursor;
+  }
+  return index;
+}
+
+// checks whether the test string is a suffix of the input string.
+bool ends_with(std::string const& input, std::string const& test);
+
+// split string by delim
+template <class T>
+std::vector<T> split_string(const std::string& str, const std::string& delim) {
+  size_t pre_pos = 0;
+  size_t pos = 0;
+  std::string tmp_str;
+  std::vector<T> res_list;
+  res_list.clear();
+  if (str.empty()) {
+    return res_list;
+  }
+  while ((pos = str.find(delim, pre_pos)) != std::string::npos) {
+    tmp_str.assign(str, pre_pos, pos - pre_pos);
+    res_list.push_back(tmp_str);
+    pre_pos = pos + 1;
+  }
+  tmp_str.assign(str, pre_pos, str.length() - pre_pos);
+  if (!tmp_str.empty()) {
+    res_list.push_back(tmp_str);
+  }
+  return res_list;
+}
+
+// split string by spaces. Leading and trailing spaces are ignored. Consecutive
+// spaces are treated as one delim.
+template <class T>
+std::vector<T> split_string(const std::string& str) {
+  std::vector<T> list;
+  const char* p;
+  int pre_pos = 0;
+  int pos = 0;
+  std::string tmp_str;
+  if (str.empty()) {
+    return list;
+  }
+  for (p = str.c_str(); *p != 0;) {
+    if (!isspace(*p)) {
+      pos = pre_pos;
+      p++;
+
+      while (*p != 0 && !isspace(*p)) {
+        pos++;
+        p++;
+      }
+      tmp_str.assign(str, pre_pos, pos - pre_pos + 1);
+      list.push_back(tmp_str);
+      pre_pos = pos + 1;
+    } else {
+      pre_pos++;
+      p++;
+    }
+  }
+  return list;
+}
+
+template <class Container>
+std::string join_strings(const Container& strs, char delim) {
+  std::string str;
+
+  size_t i = 0;
+  for (auto& elem : strs) {
+    if (i > 0) {
+      str += delim;
+    }
+
+    std::stringstream ss;
+    ss << elem;
+    str += ss.str();
+    ++i;
+  }
+
+  return str;
+}
+
+template <class Container>
+std::string join_strings(const Container& strs, const std::string& delim) {
+  std::string str;
+
+  size_t i = 0;
+  for (auto& elem : strs) {
+    if (i > 0) {
+      str += delim;
+    }
+
+    std::stringstream ss;
+    ss << elem;
+    str += ss.str();
+    ++i;
+  }
+
+  return str;
+}
+
+template <class Container, class DelimT, class ConvertFunc>
+std::string join_strings(const Container& strs,
+                         DelimT&& delim,
+                         ConvertFunc&& func) {
+  std::stringstream ss;
+  size_t i = 0;
+  for (const auto& elem : strs) {
+    if (i > 0) {
+      ss << delim;
+    }
+    ss << func(elem);
+    ++i;
+  }
+
+  return ss.str();
+}
+
+// A helper class for reading lines from file. A line buffer is maintained. It
+// doesn't need to know the maximum possible length of a line.
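format_string_append above relies on the classic two-pass snprintf idiom: the first call with a null buffer only measures the required length, the second call writes. The same idiom standalone, as a sketch that, like the helper, trusts the caller's format string:

#include <cassert>
#include <cstdio>
#include <string>

// Pass 1 measures, pass 2 writes; snprintf is allowed to place the
// trailing NUL in the std::string's guaranteed terminator slot.
std::string FormatTwoPass(const char* fmt, int value) {
  int len = std::snprintf(nullptr, 0, fmt, value);  // measure only
  assert(len >= 0);
  std::string out(static_cast<size_t>(len), '\0');
  std::snprintf(&out[0], static_cast<size_t>(len) + 1, fmt, value);
  return out;
}

int main() {
  assert(FormatTwoPass("id=%04d", 7) == "id=0007");
  return 0;
}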
+ +class LineFileReader { + public: + LineFileReader() {} + LineFileReader(LineFileReader&&) = delete; + LineFileReader(const LineFileReader&) = delete; + ~LineFileReader() { ::free(_buffer); } + char* getline(FILE* f) { return this->getdelim(f, '\n'); } + char* getdelim(FILE* f, char delim); + char* get() { return _buffer; } + size_t length() { return _length; } + + private: + char* _buffer = NULL; + size_t _buf_size = 0; + size_t _length = 0; +}; +} // end namespace string +} // end namespace paddle diff --git a/paddle/fluid/string/string_helper_test.cc b/paddle/utils/string/string_helper_test.cc similarity index 97% rename from paddle/fluid/string/string_helper_test.cc rename to paddle/utils/string/string_helper_test.cc index 67456e16a93b67f39d86c5751e35148be7020f61..e0789e9a545dd66abfde8799671bb6887db91287 100644 --- a/paddle/fluid/string/string_helper_test.cc +++ b/paddle/utils/string/string_helper_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/string/string_helper.h" +#include "paddle/utils/string/string_helper.h" #include diff --git a/paddle/fluid/string/tinyformat/tinyformat.h b/paddle/utils/string/tinyformat/tinyformat.h similarity index 91% rename from paddle/fluid/string/tinyformat/tinyformat.h rename to paddle/utils/string/tinyformat/tinyformat.h index 7498c6a46e38af98e8356f9f87a0cfb6b163bddf..28a444f87c48fdde7d41aa257fe0e91538c9b7a7 100644 --- a/paddle/fluid/string/tinyformat/tinyformat.h +++ b/paddle/utils/string/tinyformat/tinyformat.h @@ -170,7 +170,8 @@ struct is_convertible { // Format the value by casting to type fmtT. This default implementation // should never be called. -template ::value> struct formatValueAsType { static void invoke(std::ostream & /*out*/, const T & /*value*/) { assert(0); } @@ -240,8 +241,11 @@ TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(char) /// operator<< to format the type T, with special cases for the %c and %p /// conversions. template -inline void formatValue(std::ostream &out, const char * /*fmtBegin*/, - const char *fmtEnd, int ntrunc, const T &value) { +inline void formatValue(std::ostream &out, + const char * /*fmtBegin*/, + const char *fmtEnd, + int ntrunc, + const T &value) { // The mess here is to support the %c and %p conversions: if these // conversions are active we try to convert the type to a char or const // void* respectively and format that instead of the value itself. For the @@ -250,35 +254,39 @@ inline void formatValue(std::ostream &out, const char * /*fmtBegin*/, const bool canConvertToChar = detail::is_convertible::value; const bool canConvertToVoidPtr = detail::is_convertible::value; - if (canConvertToChar && *(fmtEnd - 1) == 'c') + if (canConvertToChar && *(fmtEnd - 1) == 'c') { detail::formatValueAsType::invoke(out, value); - else if (canConvertToVoidPtr && *(fmtEnd - 1) == 'p') + } else if (canConvertToVoidPtr && *(fmtEnd - 1) == 'p') { detail::formatValueAsType::invoke(out, value); - else if (ntrunc >= 0) { + } else if (ntrunc >= 0) { // Take care not to overread C strings in truncating conversions like // "%.4s" where at most 4 characters may be read. 
detail::formatTruncated(out, value, ntrunc); - } else + } else { out << value; + } } // Overloaded version for char types to support printing as an integer -#define TINYFORMAT_DEFINE_FORMATVALUE_CHAR(charType) \ - inline void formatValue(std::ostream &out, const char * /*fmtBegin*/, \ - const char *fmtEnd, int /**/, charType value) { \ - switch (*(fmtEnd - 1)) { \ - case 'u': \ - case 'd': \ - case 'i': \ - case 'o': \ - case 'X': \ - case 'x': \ - out << static_cast(value); \ - break; \ - default: \ - out << value; \ - break; \ - } \ +#define TINYFORMAT_DEFINE_FORMATVALUE_CHAR(charType) \ + inline void formatValue(std::ostream &out, \ + const char * /*fmtBegin*/, \ + const char *fmtEnd, \ + int /**/, \ + charType value) { \ + switch (*(fmtEnd - 1)) { \ + case 'u': \ + case 'd': \ + case 'i': \ + case 'o': \ + case 'X': \ + case 'x': \ + out << static_cast(value); \ + break; \ + default: \ + out << value; \ + break; \ + } \ } // per 3.9.1: char, signed char and unsigned char are all distinct types TINYFORMAT_DEFINE_FORMATVALUE_CHAR(char) @@ -466,7 +474,7 @@ cog.outl('#define TINYFORMAT_FOREACH_ARGNUM(m) \\\n ' + #define TINYFORMAT_FOREACH_ARGNUM(m) \ m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) m(10) m(11) m(12) m(13) m(14) \ m(15) m(16) -//[[[end]]] +// [[[end]]] namespace detail { @@ -476,15 +484,17 @@ namespace detail { // whereas a naive implementation based on inheritance does not. class FormatArg { public: - FormatArg() {} + FormatArg() {} // NOLINT template - FormatArg(const T &value) + FormatArg(const T &value) // NOLINT : m_value(static_cast(&value)), m_formatImpl(&formatImpl), m_toIntImpl(&toIntImpl) {} - void format(std::ostream &out, const char *fmtBegin, const char *fmtEnd, + void format(std::ostream &out, + const char *fmtBegin, + const char *fmtEnd, int ntrunc) const { m_formatImpl(out, fmtBegin, fmtEnd, ntrunc, m_value); } @@ -493,8 +503,11 @@ class FormatArg { private: template - static void formatImpl(std::ostream &out, const char *fmtBegin, - const char *fmtEnd, int ntrunc, const void *value) { + static void formatImpl(std::ostream &out, + const char *fmtBegin, + const char *fmtEnd, + int ntrunc, + const void *value) { formatValue(out, fmtBegin, fmtEnd, ntrunc, *static_cast(value)); } @@ -504,14 +517,17 @@ class FormatArg { } const void *m_value; - void (*m_formatImpl)(std::ostream &out, const char *fmtBegin, - const char *fmtEnd, int ntrunc, const void *value); + void (*m_formatImpl)(std::ostream &out, + const char *fmtBegin, + const char *fmtEnd, + int ntrunc, + const void *value); int (*m_toIntImpl)(const void *value); }; // Parse and return an integer from the string c, as atoi() // On return, c is set to one past the end of the integer. -inline int parseIntAndAdvance(const char *&c) { +inline int parseIntAndAdvance(const char *&c) { // NOLINT int i = 0; for (; *c >= '0' && *c <= '9'; ++c) i = 10 * i + (*c - '0'); return i; @@ -553,11 +569,13 @@ inline const char *printFormatStringLiteral(std::ostream &out, // and ntrunc (for truncating conversions). argIndex is incremented if // necessary to pull out variable width and precision . The function returns a // pointer to the character after the end of the current format spec. 
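As the comment above describes, streamStateFromFormat translates a printf conversion spec into iostream state. For intuition, the hand-written iostream equivalent of printf("%08.3f", ...) looks like this (a standalone illustration, not tinyformat code):

#include <iomanip>
#include <iostream>

int main() {
  // std::printf("%08.3f\n", 3.14159) prints 0003.142; the equivalent
  // stream state that streamStateFromFormat would install:
  std::cout << std::setw(8) << std::setprecision(3) << std::setfill('0')
            << std::internal << std::fixed << 3.14159 << "\n";
  return 0;
}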
-inline const char *streamStateFromFormat(std::ostream &out, - bool &spacePadPositive, int &ntrunc, +inline const char *streamStateFromFormat(std::ostream &out, // NOLINT + bool &spacePadPositive, // NOLINT + int &ntrunc, // NOLINT const char *fmtStart, const detail::FormatArg *formatters, - int &argIndex, int numFormatters) { + int &argIndex, // NOLINT + int numFormatters) { if (*fmtStart != '%') { TINYFORMAT_ERROR( "tinyformat: Not enough conversion specifiers in format string"); @@ -733,8 +751,10 @@ inline const char *streamStateFromFormat(std::ostream &out, } //------------------------------------------------------------------------------ -inline void formatImpl(std::ostream &out, const char *fmt, - const detail::FormatArg *formatters, int numFormatters) { +inline void formatImpl(std::ostream &out, + const char *fmt, + const detail::FormatArg *formatters, + int numFormatters) { // Saved stream state std::streamsize origWidth = out.width(); std::streamsize origPrecision = out.precision(); @@ -746,9 +766,13 @@ inline void formatImpl(std::ostream &out, const char *fmt, fmt = printFormatStringLiteral(out, fmt); bool spacePadPositive = false; int ntrunc = -1; - const char *fmtEnd = - streamStateFromFormat(out, spacePadPositive, ntrunc, fmt, formatters, - argIndex, numFormatters); + const char *fmtEnd = streamStateFromFormat(out, + spacePadPositive, + ntrunc, + fmt, + formatters, + argIndex, + numFormatters); if (argIndex >= numFormatters) { // Check args remain after reading any variable width/precision TINYFORMAT_ERROR("tinyformat: Not enough format arguments"); @@ -756,9 +780,9 @@ inline void formatImpl(std::ostream &out, const char *fmt, } const FormatArg &arg = formatters[argIndex]; // Format the arg into the stream. - if (!spacePadPositive) + if (!spacePadPositive) { arg.format(out, fmt, fmtEnd, ntrunc); - else { + } else { // The following is a special case with no direct correspondence // between stream formatting and the printf() behaviour. Simulate // it crudely by formatting into a temporary string stream and @@ -801,7 +825,8 @@ class FormatList { FormatList(detail::FormatArg *formatters, int N) : m_formatters(formatters), m_N(N) {} - friend void vformat(std::ostream &out, const char *fmt, + friend void vformat(std::ostream &out, + const char *fmt, const FormatList &list); private: @@ -819,7 +844,7 @@ template class FormatListN : public FormatList { public: template - FormatListN(const Args &... args) + FormatListN(const Args &... args) // NOLINT : FormatList(&m_formatterStore[0], N), m_formatterStore{FormatArg(args)...} { static_assert(sizeof...(args) == N, "Number of args must be N"); @@ -850,7 +875,7 @@ class FormatListN<0> : public FormatList { template detail::FormatListN makeFormatList(const Args &... args) { return detail::FormatListN(args...); -} +} // NOLINT /// Format list of arguments to the stream according to the given format string. /// diff --git a/paddle/utils/string/to_string.h b/paddle/utils/string/to_string.h new file mode 100644 index 0000000000000000000000000000000000000000..7b3332861e0fa3edbbb8915e3e3f068fed3b412f --- /dev/null +++ b/paddle/utils/string/to_string.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include + +namespace paddle { +namespace string { +inline std::ostream& operator<<(std::ostream& s, const std::type_index& t) { + s << t.name(); + return s; +} + +template ::value, int>::type = 0> +inline std::string to_string(T v) { + std::ostringstream sout; + sout << v; + return sout.str(); +} + +template ::value, int>::type = 0> +inline std::string to_string(T v) { + return std::to_string(static_cast(v)); +} + +template <> +inline std::string to_string(std::type_index t) { + return t.name(); +} + +// Faster std::string/const char* type +template <> +inline std::string to_string(std::string v) { + return v; +} + +template <> +inline std::string to_string(const char* v) { + return std::string(v); +} + +} // namespace string +} // namespace paddle diff --git a/paddle/fluid/string/to_string_test.cc b/paddle/utils/string/to_string_test.cc similarity index 96% rename from paddle/fluid/string/to_string_test.cc rename to paddle/utils/string/to_string_test.cc index 1d9c0e5e0c2b6e7f44c1622d2828b21b0a4380ee..778ba8bb113a29a5831ecd751bb294d52ac7e0eb 100644 --- a/paddle/fluid/string/to_string_test.cc +++ b/paddle/utils/string/to_string_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/string/to_string.h" +#include "paddle/utils/string/to_string.h" #include constexpr char kOutputString[] = "User Defined Output"; diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py index 2e5adfa5dfbb14749dd614340768bb064d6dbaf1..b9cceafebaac4d86f34362e051934efe244e52ab 100644 --- a/python/paddle/autograd/functional.py +++ b/python/paddle/autograd/functional.py @@ -14,6 +14,7 @@ import contextlib import paddle +from paddle.static import gradients from ..fluid import framework from ..fluid.dygraph import grad from ..tensor.creation import assign @@ -904,3 +905,122 @@ def vhp(func, inputs, v=None, create_graph=False, allow_unused=False): vhp = grad_fn(jac, xs, v) outputs, vhp = return_fn(outputs), return_fn(vhp) return outputs, vhp + + +class Jacobian(object): + r""" + Object that represents the Jacobian matrix of a muli-input multi-output + function. + + The Jacobian values are lazily evaluated if accessed through indices. + In contrast, slicing access would trigger evaluating the full matrix + if it's not already computed. + + Examples: + .. 
code-block:: python

+            import paddle
+            import numpy as np
+
+            def func(xs):
+                x, y = xs
+                return paddle.matmul(x, y)
+
+            main = paddle.static.Program()
+            startup = paddle.static.Program()
+            with paddle.static.program_guard(main, startup):
+                x = paddle.static.data(name='x', shape=[2, 2], dtype='float32')
+                JJ = paddle.autograd.functional.Jacobian(func, [x, x])
+                nrow, ncol = JJ.shape()
+                full_jacobian = JJ[:]
+            place = paddle.CUDAPlace(0)
+            exe = paddle.static.Executor(place)
+            exe.run(startup)
+
+            feeds = {'x': np.array([[2., 2.], [2., 1.]]).astype('float32')}
+            jacobian = exe.run(main, feed=feeds, fetch_list=[full_jacobian])[0]
+            print(jacobian)
+            # [[4. 2. 2. 0. 4. 2. 2. 0.]
+            #  [2. 3. 0. 2. 2. 3. 0. 2.]
+            #  [2. 0. 3. 2. 2. 0. 3. 2.]
+            #  [0. 2. 2. 2. 0. 2. 2. 2.]]
+    """
+
+    def __init__(self, func, inputs, batch=False):
+        r"""Construct a Jacobian matrix.
+
+        Parameters:
+            func (Callable): a Python function that takes as input a Tensor
+                or a Tensor list and outputs a Tensor or a Tensor list.
+            inputs (Tensor|list[Tensor]): a Tensor or a list of Tensors as
+                `func`'s input.
+            batch (bool): if True the 0'th axis is considered the batch
+                dimension, both on input and output.
+        """
+
+        def enable_grads(inputs):
+            if isinstance(inputs, (list, tuple)):
+                for x in inputs:
+                    x.stop_gradient = False
+            else:
+                assert isinstance(inputs, paddle.fluid.framework.Variable), (
+                    f"Expecting {inputs} to be paddle.fluid.framework.Variable,"
+                    f" however it's found to be a(n) {type(inputs)}.")
+                inputs.stop_gradient = False
+            return inputs
+
+        self.batch = batch
+        self.xs = enable_grads(inputs)
+        ys = func(inputs)
+        if not isinstance(ys, list):
+            ys = [ys]
+        self.y = self.flatten_all(ys)
+        self.ydim = self.y.shape[-1]
+        self.xdim = self.flatten_all(inputs).shape[-1]
+        self.bdim = self.y.shape[0]
+        self.jacobian = {}
+
+    def flatten(self, x):
+        to = [x.shape[0], -1] if self.batch else [-1]
+        return x.reshape(to)
+
+    def flatten_all(self, xs):
+        return paddle.concat([self.flatten(x) for x in xs], axis=-1)
+
+    def shape(self):
+        return (self.ydim, self.xdim)
+
+    def __getitem__(self, tup):
+        if hasattr(tup, '__iter__'):
+            i, j = tup
+        else:
+            i, j = tup, None
+
+        if isinstance(i, slice):
+            slicing = True
+        else:
+            slicing = False
+
+        if slicing:
+            if 'full' not in self.jacobian:
+                rows = [
+                    self.flatten_all(gradients(self.y[..., i], self.xs))
+                    for i in range(self.ydim)
+                ]
+                self.jacobian['full'] = paddle.stack(rows)
+            return self.jacobian['full'][i]
+
+        assert 0 <= i < self.ydim, f"Jacobian index i={i} is not valid."
+        assert (j is None) or (
+            0 <= j < self.xdim), f"Jacobian index j={j} is not valid."
+ if 'full' in self.jacobian: + JJ = self.jacobian['full'] + else: + JJ = self.jacobian + if i not in self.jacobian: + self.jacobian[i] = self.flatten_all( + gradients(self.y[..., i], self.xs)) + + if j is None: + return JJ[i] + else: + return JJ[i][..., j] diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py index 6e6d2a672fd18631c4f0ac7073eaada488b37967..da0f2ebcba89ef1ffddf1870eeba75ca07c4a6bb 100644 --- a/python/paddle/distributed/auto_parallel/reshard.py +++ b/python/paddle/distributed/auto_parallel/reshard.py @@ -279,7 +279,7 @@ def _is_overlapped(shape_x, shape_y): return overlapped -def _need_reshard(dist_tensor, dist_op): +def _need_reshard(dist_tensor, dist_op, op_input=True): """Judge the tensor whether needs to be resharded.""" is_reshard = False tensor_dist_attr = dist_tensor.dist_attr @@ -289,13 +289,31 @@ def _need_reshard(dist_tensor, dist_op): op_dist_attr = dist_op.dist_attr op_input_dims_mapping = op_dist_attr.get_input_dims_mapping(tensor_name) op_process_mesh = op_dist_attr.process_mesh - if all( - map(lambda x: x is not None, [ - tensor_dims_mapping, tensor_process_mesh, op_input_dims_mapping, - op_process_mesh - ])): - if tensor_dims_mapping != op_input_dims_mapping or tensor_process_mesh != op_process_mesh: - is_reshard = True + if op_input: + op_input_dims_mapping = op_dist_attr.get_input_dims_mapping(tensor_name) + op_process_mesh = op_dist_attr.process_mesh + if all( + map(lambda x: x is not None, [ + tensor_dims_mapping, tensor_process_mesh, + op_input_dims_mapping, op_process_mesh + ])): + if tensor_dims_mapping != op_input_dims_mapping or tensor_process_mesh != op_process_mesh: + is_reshard = True + else: + op_output_dims_mapping = op_dist_attr.get_output_dims_mapping( + tensor_name) + op_process_mesh = op_dist_attr.process_mesh + if all( + map(lambda x: x is not None, [ + tensor_dims_mapping, tensor_process_mesh, + op_output_dims_mapping, op_process_mesh + ])): + if tensor_process_mesh != op_process_mesh: + is_reshard = True + if tensor_dims_mapping != op_output_dims_mapping: + raise ValueError( + "It is not supported that tensor dims mapping is different from op output dims mapping." + ) return is_reshard @@ -948,12 +966,13 @@ def remove_no_need_in_startup(auto_parallel_main_prog, def reshard(auto_parallel_main_prog, auto_parallel_startup_prog, rank_id, dist_context): """ - Reshard tensor in the program according to its dist attr and corresponding op dist attr. + Reshard tensor in the program according to its distributed attribute and corresponding op distributed attribute. Args: auto_parallel_main_prog (Program): An auto parallel main program. auto_parallel_startup_prog (Program): An auto parallel startup program. rank_id (int): The process id. + dist_context (DistributedContext): The distributed context of this rank. 
""" assert isinstance(auto_parallel_main_prog, Program), "The type of auto_parallel_main_prog should be Program, " \ "but got {}.".format(type(auto_parallel_main_prog)) @@ -1001,6 +1020,34 @@ def reshard(auto_parallel_main_prog, auto_parallel_startup_prog, rank_id, else: idx += 1 + # insert send and recv op if output process mesh is different from tensor process mesh + idx = 0 + skip_ops = ["create_py_reader", "create_double_buffer_reader", "read"] + while idx < len(block.ops): + pre_op_count = len(block.ops) + op = block.ops[idx] + dist_op = dist_context.get_dist_op_for_program(op) + if dist_op is not None and op.type not in skip_ops: + for var_name in op.output_arg_names: + var = block.vars[var_name] + dist_tensor = dist_context.get_dist_tensor_for_program(var) + if dist_tensor is not None and _need_reshard(dist_tensor, + dist_op, False): + for index, item in enumerate( + dist_op.dist_attr.process_mesh.processes): + recv_rank = dist_tensor.dist_attr.process_mesh.processes[ + index] + if rank_id == item: + _insert_send_op(block, idx + 1, var, recv_rank) + if rank_id == recv_rank: + _insert_recv_op(block, idx + 1, var, item) + cur_op_count = len(block.ops) + idx_offset = idx_offset + cur_op_count - pre_op_count + pre_op_count = cur_op_count + idx = idx + idx_offset + 1 + else: + idx += 1 + # remove no need vars and ops in the main program remove_no_need_in_main(auto_parallel_main_prog, dist_context, rank_id) diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index b08753066ca3614c02252aff1b72fbdcfceb5698..f0cf6573139d90620436ba943b1af5be5bd5cb15 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -224,6 +224,14 @@ class Gloo(object): self._worker_comm = gloo # TODO (sandyhouse): initialize gloo for server and all + # the closing of kv server may cause gloo init failure + # since it depend on the full mesh connection + # e.g. 
0 connected with 1,2,3 while 2-3 not connected yet + # TODO(kuizhiqing) + if start_http_server: + http_server_d["running"] = False + http_server.join() + def _get_rank_nodes(self, role): nodes = 0 rank = -1 diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index e7108b3f4f3432df04556b4cf78726a63cc8b076..50bf8a2f9c7c58b3390d2881cb5d6e8510e78ae8 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -49,8 +49,6 @@ class HybridParallelClipGrad: @imperative_base.no_grad def _dygraph_clip(self, params_grads): - params_and_grads = [] - sum_square_dist_fp16 = [] sum_square_dist_fp32 = [] sum_square_not_dist_fp16 = [] @@ -153,15 +151,14 @@ class HybridParallelClipGrad: if g is None: continue if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) continue if p.dtype == paddle.float16: - new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16) + g.scale_(clip_var_fp16) else: - new_grad = layers.elementwise_mul(x=g, y=clip_var) - params_and_grads.append((p, new_grad)) + g.scale_(clip_var) + p._reset_grad_inplace_version(True) - return params_and_grads + return params_grads def __getattr__(self, item): return getattr(self._clip, item) @@ -201,6 +198,12 @@ class HybridParallelOptimizer: else: self._inner_opt._grad_clip = HybridParallelClipGrad( self._inner_opt._grad_clip, hcg) + if self._inner_opt._parameter_list and isinstance( + self._inner_opt._parameter_list[0], dict): + for item in self._inner_opt._param_groups: + if "grad_clip" in item.keys(): + item["grad_clip"] = HybridParallelClipGrad( + self._inner_opt._grad_clip, hcg) @imperative_base.no_grad @framework.dygraph_only diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index a2797adff251aea3535f86e5c423463d748c37b3..fc5b93c6e25499a0ae50c19cacae4a9395520fe9 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -109,6 +109,13 @@ class ShardingOptimizerStage2(Optimizer): self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip, paddle.get_device(), self.group) + if self._optim._parameter_list and isinstance( + self._optim._parameter_list[0], dict): + for item in self._optim._param_groups: + if "grad_clip" in item.keys(): + item["grad_clip"] = ShardingClipGrad( + self._optim._grad_clip, + paddle.get_device(), self.group) if offload: assert self._pfp16, "Only support offload strategy while using \'Adam\', \'AdamW\' and \'Momentum\' optimizer with AMP/Pure FP16" diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py index 41c6f92230ab3e0e8de9aec0abdf920fad1ef232..9d7bd937411882541d9cb1311c241d3e84316c90 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -393,6 +393,7 @@ class ShardingStage3(nn.Layer): else: param.bw_storage.scale_(scale=self._world_size_scaling) param.fw_storage = 
_VarBaseWrapper(param) + assert param.fw_storage.grad is None param.fw_storage._copy_gradient_from(param.bw_storage) update_list.append(param) return update_list @@ -495,10 +496,9 @@ class ShardingStage3(nn.Layer): def _redefine_opt_step(self): params_slice_func = self._update_params_slice opt_step = self._optim.step - update_scaler = self._optim.update_scaler def _opt_step(self): - if not update_scaler: + if not self.update_scaler: params_slice_func() if self.offload: with device_guard(device="cpu"): diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index 5f696195c1abcd4921b4358b8971fdbc982609da..9c30ff5a45075ae423d6a46ef328e3b6523fbd5b 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -57,8 +57,6 @@ class ShardingClipGrad: @imperative_base.no_grad def _dygraph_clip(self, params_grads): - params_and_grads = [] - sum_square_fp16 = [] sum_square_fp32 = [] @@ -114,15 +112,14 @@ class ShardingClipGrad: if g is None: continue if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) continue if p.dtype == paddle.float16: - new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16) + g.scale_(clip_var_fp16) else: - new_grad = layers.elementwise_mul(x=g, y=clip_var) - params_and_grads.append((p, new_grad)) + g.scale_(clip_var) + p._reset_grad_inplace_version(True) - return params_and_grads + return params_grads def __getattr__(self, item): return getattr(self._clip, item) diff --git a/python/paddle/distributed/passes/cpp_pass.py b/python/paddle/distributed/passes/cpp_pass.py index fe6ef74bd85c8fdbbb02d9c15f0392acc5606786..6b8ea30f3ba17711935f6c58fc44a9dd05b7b4ea 100644 --- a/python/paddle/distributed/passes/cpp_pass.py +++ b/python/paddle/distributed/passes/cpp_pass.py @@ -26,3 +26,16 @@ class FuseElementwiseAddActPass(CPPPassWrapper): def _type(self): return PassType.FUSION_OPT + + +@register_pass("fuse_bn_act") +class FuseBatchNormActPass(CPPPassWrapper): + def __init__(self): + super(FuseBatchNormActPass, self).__init__() + + @property + def cpp_name(self): + return "fuse_bn_act_pass" + + def _type(self): + return PassType.FUSION_OPT diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 5ee7b04248e4527357060839a769f7a2c726d744..84d3f5547feb4b5c4fa6d3d9b88a57b9a1e52344 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1763,7 +1763,10 @@ class Variable(object): Examples: .. 
code-block:: python + import paddle import paddle.fluid as fluid + + paddle.enable_static() cur_program = fluid.Program() cur_block = cur_program.current_block() new_variable = cur_block.create_var(name="X", @@ -1773,7 +1776,8 @@ class Variable(object): """ if self.type == core.VarDesc.VarType.SELECTED_ROWS: raise Exception("SelectedRows DO NOT supprt lod") - + if self.type == core.VarDesc.VarType.STRINGS: + return None return self.desc.lod_level() @property diff --git a/python/paddle/fluid/tests/custom_op/attr_test_op.cc b/python/paddle/fluid/tests/custom_op/attr_test_op.cc index 1c79d9a26aee3409fc2a32b755abcd45f4ca06c3..14cb0aa7c716d8449c672231f5399027275f8c5d 100644 --- a/python/paddle/fluid/tests/custom_op/attr_test_op.cc +++ b/python/paddle/fluid/tests/custom_op/attr_test_op.cc @@ -137,9 +137,7 @@ std::vector AttrTestForward( PD_DISPATCH_FLOATING_TYPES( x.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( - x.data(), - out.mutable_data(paddle::PlaceType::kCPU), - x.size()); + x.data(), out.mutable_data(), x.size()); })); // Check attrs value @@ -177,13 +175,12 @@ std::vector AttrTestBackward( const std::vector& str_vec_attr) { auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, grad_out.shape()); - PD_DISPATCH_FLOATING_TYPES( - grad_out.type(), "assign_cpu_kernel", ([&] { - assign_cpu_kernel( - grad_out.data(), - grad_x.mutable_data(paddle::PlaceType::kCPU), - grad_out.size()); - })); + PD_DISPATCH_FLOATING_TYPES(grad_out.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + grad_out.data(), + grad_x.mutable_data(), + grad_out.size()); + })); CheckAllBackwardAttrs(int_attr, float_vec_attr, str_vec_attr); @@ -206,9 +203,7 @@ std::vector ConstAttrTestForward( PD_DISPATCH_FLOATING_TYPES( x.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( - x.data(), - out.mutable_data(paddle::PlaceType::kCPU), - x.size()); + x.data(), out.mutable_data(), x.size()); })); // Check attrs value @@ -246,13 +241,12 @@ std::vector ConstAttrTestBackward( const std::vector& str_vec_attr) { auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, grad_out.shape()); - PD_DISPATCH_FLOATING_TYPES( - grad_out.type(), "assign_cpu_kernel", ([&] { - assign_cpu_kernel( - grad_out.data(), - grad_x.mutable_data(paddle::PlaceType::kCPU), - grad_out.size()); - })); + PD_DISPATCH_FLOATING_TYPES(grad_out.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + grad_out.data(), + grad_x.mutable_data(), + grad_out.size()); + })); CheckAllBackwardAttrs(int_attr, float_vec_attr, str_vec_attr); diff --git a/python/paddle/fluid/tests/custom_op/concat_and_split.h b/python/paddle/fluid/tests/custom_op/concat_and_split.h index cbec4653a207d9b92da48d0cad79288159329a6a..9f24cc43699773fc531ccd68d4219ebcdfdab8eb 100644 --- a/python/paddle/fluid/tests/custom_op/concat_and_split.h +++ b/python/paddle/fluid/tests/custom_op/concat_and_split.h @@ -47,7 +47,7 @@ void ConcatCpuKernel(const std::vector& ins, int64_t out_cols = 0; auto ins_cols = GetCols(ins, out_rows, &out_cols); - auto* out_data = out->mutable_data(paddle::PlaceType::kCPU); + auto* out_data = out->mutable_data(); int64_t col_idx = 0; for (size_t i = 0; i < num; ++i) { int64_t col_len = ins_cols[i]; @@ -76,9 +76,7 @@ void SplitCpuKernel(const paddle::Tensor& in, int64_t col_idx = 0; for (size_t j = 0; j < num; ++j) { int64_t col_len = out_cols[j]; - auto* out_data = - outs->at(j).mutable_data(paddle::PlaceType::kCPU) + - i * col_len; + auto* out_data = outs->at(j).mutable_data() + i * col_len; std::memcpy(out_data, in_data + col_idx, sizeof(data_t) * col_len); 
col_idx += col_len; } diff --git a/python/paddle/fluid/tests/custom_op/custom_conj_op.cc b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc index ae60799d239467ff8637f2e494315c2ac8c08744..b9c10f479e0a39eb8e33ffceb30e8eb9cc8efa9e 100644 --- a/python/paddle/fluid/tests/custom_op/custom_conj_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc @@ -76,9 +76,7 @@ std::vector ConjFunction(const paddle::Tensor& x) { PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES( x.type(), "ConjCPUKernel", ([&] { ConjCPUKernel( - x.data(), - x.size(), - out.mutable_data(paddle::PlaceType::kCPU)); + x.data(), x.size(), out.mutable_data()); })); return {out}; diff --git a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc index d5f161fc5b775d92627bfcd0b0f4b0fa347d02be..0f7d323b5451efba5a503d9039a03531e1773efb 100644 --- a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc +++ b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc @@ -32,9 +32,7 @@ std::vector DispatchTestInterger(const paddle::Tensor& x) { PD_DISPATCH_INTEGRAL_TYPES( x.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( - x.data(), - out.mutable_data(paddle::PlaceType::kCPU), - x.size()); + x.data(), out.mutable_data(), x.size()); })); return {out}; @@ -52,9 +50,7 @@ std::vector DispatchTestFloatAndInteger( PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES( x.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( - x.data(), - out.mutable_data(paddle::PlaceType::kCPU), - x.size()); + x.data(), out.mutable_data(), x.size()); })); return {out}; @@ -71,9 +67,7 @@ std::vector DispatchTestComplex(const paddle::Tensor& x) { PD_DISPATCH_COMPLEX_TYPES( x.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( - x.data(), - out.mutable_data(paddle::PlaceType::kCPU), - x.size()); + x.data(), out.mutable_data(), x.size()); })); return {out}; @@ -91,9 +85,7 @@ std::vector DispatchTestFloatAndComplex( PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES( x.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( - x.data(), - out.mutable_data(paddle::PlaceType::kCPU), - x.size()); + x.data(), out.mutable_data(), x.size()); })); return {out}; @@ -111,9 +103,7 @@ std::vector DispatchTestFloatAndIntegerAndComplex( PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES( x.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( - x.data(), - out.mutable_data(paddle::PlaceType::kCPU), - x.size()); + x.data(), out.mutable_data(), x.size()); })); return {out}; @@ -130,9 +120,7 @@ std::vector DispatchTestFloatAndHalf(const paddle::Tensor& x) { PD_DISPATCH_FLOATING_AND_HALF_TYPES( x.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( - x.data(), - out.mutable_data(paddle::PlaceType::kCPU), - x.size()); + x.data(), out.mutable_data(), x.size()); })); return {out}; diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 2ac5e9404c1ba52adcd4aaa86485c11a5ec881b4..2e35277d70cd62ab0a7a931cb32e2d9ead99ed73 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -156,7 +156,8 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_origin_scheduler) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_mapper) LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_task_node) - LIST(REMOVE_ITEM TEST_OPS test_dist_model_tensor) + LIST(REMOVE_ITEM TEST_OPS test_fleet_exe_dist_model_run) + LIST(REMOVE_ITEM TEST_OPS test_fleet_exe_dist_model_tensor) endif() # Temporally 
disable test_deprecated_decorator diff --git a/python/paddle/fluid/tests/unittests/autograd/test_jacobian_static.py b/python/paddle/fluid/tests/unittests/autograd/test_jacobian_static.py new file mode 100644 index 0000000000000000000000000000000000000000..28fc6932b07310c4b2fced4d6e6122260e37fb2d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/test_jacobian_static.py @@ -0,0 +1,346 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +from utils import _compute_numerical_jacobian, _compute_numerical_batch_jacobian + + +def approx_jacobian(f, xs, dtype, eps=1e-5, batch=False): + r"""Computes an approximate Jacobian matrix of a multi-valued function + using finite differences. + + The function input is required to be an np array or a list of list of np + arrays. + """ + + def flatten(x): + if len(x.shape) > 0: + to = [x.shape[0], -1] if batch else [-1] + return x.reshape(to) + else: + return x + + def flatten_all(xs): + if isinstance(xs, list): + flattened = np.concatenate([flatten(x) for x in xs], axis=-1) + else: + flattened = flatten(xs) + return flattened + + def x_like(x, orig_x): + return x.reshape(orig_x.shape) + + def _f(x): + if multi_inps: + _xs = np.split(x, splits, axis=-1) + _xs = [x_like(_x, _o) for _x, _o in zip(_xs, xs)] + outs = f(_xs) + else: + outs = f(x) + return flatten_all(outs) + + multi_inps = False if isinstance(xs, np.ndarray) else True + x = flatten_all(xs) + xdim = x.shape[-1] + splits = [] + + if multi_inps: + split = 0 + for inp in xs: + split += flatten(inp).shape[-1] + splits.append(split) + + ds = eps * np.eye(xdim, dtype=dtype) + + fprimes_by_x = [(0.5 / eps) * (_f(x + d) - _f(x - d)) for d in ds] + fprimes_by_y = np.stack(fprimes_by_x, axis=-1) + return np.transpose(fprimes_by_y, [1, 0, 2]) if batch else fprimes_by_y + + +class TestJacobianFloat32(unittest.TestCase): + @classmethod + def setUpClass(self): + paddle.enable_static() + if fluid.core.is_compiled_with_cuda(): + self.place = fluid.CUDAPlace(0) + else: + self.place = fluid.CPUPlace() + self.np_dtype = np.float32 + self.A = np.array([[1., 2.]]).astype('float32') + self.B = np.array([[1., 2.], [2., 1.]]).astype('float32') + self.C = np.array([[2., 2.], [2., 1.]]).astype('float32') + self.D = np.array( + [[[2., 2.], [2., 1.]], [[1., 2.], [2., 1.]]]).astype('float32') + self.E = np.array( + [[[3., 4.], [2., 3.]], [[2., 1.], [1., 3.]]]).astype('float32') + self.eps = 1e-4 + self.rtol = 1e-2 + self.atol = 1e-2 + + def run_test(self, pd_f, np_f, inps, dtype, batch=False): + def make_tensors(inps): + if isinstance(inps, list): + xs = [ + paddle.static.data( + f'x{i}', inp.shape, dtype=inp.dtype) + for i, inp in enumerate(inps) + ] + else: + xs = paddle.static.data( + name='x', shape=inps.shape, dtype=inps.dtype) + return xs + + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + xs = make_tensors(inps) + JJ 
= paddle.autograd.functional.Jacobian(pd_f, xs, batch=batch) + nrow, ncol = JJ.shape() + full_jacobian = JJ[:] + exe = fluid.Executor(self.place) + exe.run(startup) + if isinstance(inps, list): + feeds = {f'x{i}': x for i, x in enumerate(inps)} + else: + feeds = {'x': inps} + pd_jacobians = exe.run(main, feed=feeds, fetch_list=[full_jacobian])[0] + np_jacobians = approx_jacobian(np_f, inps, dtype, self.eps, batch=batch) + self.assertTrue( + np.allclose(pd_jacobians, np_jacobians, self.rtol, self.atol)) + + def test_square(self): + def pd_f(x): + return paddle.multiply(x, x) + + def np_f(x): + return np.multiply(x, x) + + self.run_test(pd_f, np_f, self.A, np.dtype('float32')) + + def test_mul(self): + def pd_f(xs): + x, y = xs + return paddle.multiply(x, y) + + def np_f(xs): + x, y = xs + return np.multiply(x, y) + + self.run_test(pd_f, np_f, [self.B, self.C], np.dtype('float32')) + + def test_matmul(self): + def pd_f(xs): + x, y = xs + return paddle.matmul(x, y) + + def np_f(xs): + x, y = xs + return np.matmul(x, y) + + self.run_test(pd_f, np_f, [self.B, self.C], np.dtype('float32')) + + def test_batch_matmul(self): + def pd_f(xs): + x, y = xs + return paddle.matmul(x, y) + + def np_f(xs): + x, y = xs + return np.matmul(x, y) + + self.run_test( + pd_f, np_f, [self.D, self.E], np.dtype('float32'), batch=True) + + +class TestJacobianFloat64(unittest.TestCase): + @classmethod + def setUpClass(self): + paddle.enable_static() + if fluid.core.is_compiled_with_cuda(): + self.place = fluid.CUDAPlace(0) + else: + self.place = fluid.CPUPlace() + self.np_dtype = np.float32 + self.A = np.array([[1., 2.]]).astype('float64') + self.B = np.array([[1., 2.], [2., 1.]]).astype('float64') + self.C = np.array([[2., 2.], [2., 1.]]).astype('float64') + self.D = np.array( + [[[2., 2.], [2., 1.]], [[1., 2.], [2., 1.]]]).astype('float64') + self.E = np.array( + [[[3., 4.], [2., 3.]], [[2., 1.], [1., 3.]]]).astype('float64') + self.eps = 1e-7 + self.rtol = 1e-6 + self.atol = 1e-6 + + def run_test_by_fullmatrix(self, pd_f, np_f, inps, dtype, batch=False): + def make_tensors(inps): + if isinstance(inps, list): + xs = [ + paddle.static.data( + f'x{i}', inp.shape, dtype=inp.dtype) + for i, inp in enumerate(inps) + ] + else: + xs = paddle.static.data( + name='x', shape=inps.shape, dtype=inps.dtype) + return xs + + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + xs = make_tensors(inps) + JJ = paddle.autograd.functional.Jacobian(pd_f, xs, batch=batch) + nrow, ncol = JJ.shape() + full_jacobian = JJ[:] + exe = fluid.Executor(self.place) + exe.run(startup) + if isinstance(inps, list): + feeds = {f'x{i}': x for i, x in enumerate(inps)} + else: + feeds = {'x': inps} + pd_jacobians = exe.run(main, feed=feeds, fetch_list=[full_jacobian])[0] + np_jacobians = approx_jacobian(np_f, inps, dtype, self.eps, batch=batch) + self.assertTrue( + np.allclose(pd_jacobians, np_jacobians, self.rtol, self.atol)) + + def run_test_by_rows(self, pd_f, np_f, inps, dtype, batch=False): + def make_tensors(inps): + if isinstance(inps, list): + xs = [ + paddle.static.data( + f'x{i}', inp.shape, dtype=inp.dtype) + for i, inp in enumerate(inps) + ] + else: + xs = paddle.static.data( + name='x', shape=inps.shape, dtype=inps.dtype) + return xs + + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + xs = make_tensors(inps) + JJ = paddle.autograd.functional.Jacobian(pd_f, xs, batch=batch) + nrow, ncol = JJ.shape() + rows = [JJ[i] for i in range(nrow)] + exe = 
fluid.Executor(self.place) + exe.run(startup) + if isinstance(inps, list): + feeds = {f'x{i}': x for i, x in enumerate(inps)} + else: + feeds = {'x': inps} + pd_jac = exe.run(main, feed=feeds, fetch_list=[rows]) + np_jac = approx_jacobian(np_f, inps, dtype, self.eps, batch=batch) + for i in range(nrow): + self.assertTrue( + np.allclose(pd_jac[i], np_jac[i], self.rtol, self.atol)) + + def run_test_by_entries(self, pd_f, np_f, inps, dtype, batch=False): + def make_tensors(inps): + if isinstance(inps, list): + xs = [ + paddle.static.data( + f'x{i}', inp.shape, dtype=inp.dtype) + for i, inp in enumerate(inps) + ] + else: + xs = paddle.static.data( + name='x', shape=inps.shape, dtype=inps.dtype) + return xs + + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + xs = make_tensors(inps) + JJ = paddle.autograd.functional.Jacobian(pd_f, xs, batch=batch) + nrow, ncol = JJ.shape() + entries = [JJ[i, j] for i in range(nrow) for j in range(ncol)] + exe = fluid.Executor(self.place) + exe.run(startup) + if isinstance(inps, list): + feeds = {f'x{i}': x for i, x in enumerate(inps)} + else: + feeds = {'x': inps} + pd_entries = exe.run(main, feed=feeds, fetch_list=[entries]) + np_jac = approx_jacobian(np_f, inps, dtype, self.eps, batch=batch) + np_entries = [ + np_jac[i, ..., j] for i in range(nrow) for j in range(ncol) + ] + for pd_entry, np_entry in zip(pd_entries, np_entries): + self.assertTrue( + np.allclose(pd_entry, np_entry, self.rtol, self.atol)) + + def test_square(self): + def pd_f(x): + return paddle.multiply(x, x) + + def np_f(x): + return np.multiply(x, x) + + self.run_test_by_fullmatrix(pd_f, np_f, self.A, np.dtype('float64')) + self.run_test_by_rows(pd_f, np_f, self.A, np.dtype('float64')) + self.run_test_by_entries(pd_f, np_f, self.A, np.dtype('float64')) + + def test_mul(self): + def pd_f(xs): + x, y = xs + return paddle.multiply(x, y) + + def np_f(xs): + x, y = xs + return np.multiply(x, y) + + self.run_test_by_fullmatrix(pd_f, np_f, [self.B, self.C], + np.dtype('float64')) + self.run_test_by_rows(pd_f, np_f, [self.B, self.C], np.dtype('float64')) + self.run_test_by_entries(pd_f, np_f, [self.B, self.C], + np.dtype('float64')) + + def test_matmul(self): + def pd_f(xs): + x, y = xs + return paddle.matmul(x, y) + + def np_f(xs): + x, y = xs + return np.matmul(x, y) + + self.run_test_by_fullmatrix(pd_f, np_f, [self.B, self.C], + np.dtype('float64')) + self.run_test_by_rows(pd_f, np_f, [self.B, self.C], np.dtype('float64')) + self.run_test_by_entries(pd_f, np_f, [self.B, self.C], + np.dtype('float64')) + + def test_batch_matmul(self): + def pd_f(xs): + x, y = xs + return paddle.matmul(x, y) + + def np_f(xs): + x, y = xs + return np.matmul(x, y) + + self.run_test_by_fullmatrix( + pd_f, np_f, [self.D, self.E], np.dtype('float64'), batch=True) + self.run_test_by_rows( + pd_f, np_f, [self.D, self.E], np.dtype('float64'), batch=True) + self.run_test_by_entries( + pd_f, np_f, [self.D, self.E], np.dtype('float64'), batch=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt index 48a9f7204aa8d26090d1b4e9a059cad7f382612f..b3ba7c80b32265912c746db4fed76773e127255f 100755 --- a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt @@ -4,6 +4,6 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP 
${TEST_OPS})
   py_test_modules(${TEST_OP} MODULES ${TEST_OP})
   list(APPEND DIST_TEST_OPS ${TEST_OP})
-  set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 90)
+  set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 120)
   set_tests_properties(${TEST_OP} PROPERTIES LABELS "RUN_TYPE=DIST")
 endforeach(TEST_OP)
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7147724fbc5c35d27a6172576539fb24d41ca5a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import paddle
+import paddle.distributed.fleet as fleet
+import numpy as np
+import paddle.nn as nn
+from paddle.distributed.passes import new_pass, PassManager
+import unittest
+from dist_pass_test_base import DistPassTestBase
+
+
+class BatchNormActNet(nn.Layer):
+    def __init__(self):
+        super(BatchNormActNet, self).__init__()
+
+        self.conv1 = nn.Conv2D(3, 8, (3, 3), data_format="NHWC")
+        self.bn1 = nn.BatchNorm2D(8, data_format="NHWC")
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+        out = paddle.flatten(out, 1)
+        return out
+
+
+class TestFuseBatchNormActPass(DistPassTestBase):
+    def init(self):
+        self.atol = 1e-4
+        self.rtol = 1e-4
+
+    def get_model(self, place, batch_size=32, image_shape=[224, 224, 3]):
+        image = paddle.static.data(
+            shape=[batch_size] + image_shape, dtype='float32', name='image')
+
+        model = BatchNormActNet()
+        pred_out = model(image)
+        loss = paddle.mean(pred_out)
+        optimizer = paddle.optimizer.Adam(learning_rate=1e-3)
+
+        dist_strategy = fleet.DistributedStrategy()
+        dist_strategy.fuse_all_reduce_ops = False
+        dist_strategy.without_graph_optimization = True
+        dist_strategy.amp = True
+        dist_strategy.amp_configs = {
+            "init_loss_scaling": 32768,
+            "use_dynamic_loss_scaling": True,
+        }
+        fleet.init(is_collective=True, strategy=dist_strategy)
+        optimizer = fleet.distributed_optimizer(optimizer)
+        optimizer.minimize(loss)
+
+        rank = paddle.distributed.get_rank()
+
+        def reader():
+            seed = int(os.environ.get("SEED", 0))
+            np.random.seed(seed + rank)
+            for _ in range(10):
+                image_np = np.random.random(size=image.shape).astype('float32')
+                yield image_np,
+
+        main_program = paddle.static.default_main_program()
+        startup_program = paddle.static.default_startup_program()
+        return main_program, startup_program, [image], [loss], reader
+
+    def apply_passes(self, main_prog, startup_prog):
+        pass_manager = PassManager([new_pass("fuse_bn_act")])
+        pass_manager.apply([main_prog], [startup_prog])
+        print(pass_manager.names)
+
+        op_type = []
+        for op in main_prog.global_block().ops:
+            op_type.append(op.type)
+        self.assertTrue("fused_batch_norm_act" in op_type)
+        self.assertTrue("fused_batch_norm_act_grad" in op_type)
+
+    def test_fuse_bn_act(self):
+        self.check_main()
+
+
+if __name__ == "__main__":
+    unittest.main()
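The `fuse_bn_act` pass reaches Python through the `CPPPassWrapper` shim added to `cpp_pass.py` earlier in this change, so exposing a further C++ fusion pass is a three-member subclass. A sketch of that pattern; `fuse_relu_depthwise_conv_pass` is only an illustrative pass name, and the `register_pass`/`PassType` import paths are assumed to match what `cpp_pass.py` itself uses:

```python
# Sketch: exposing another existing C++ IR pass through the same wrapper.
# The pass name is an illustrative stand-in; import paths for
# register_pass/PassType are assumed to mirror cpp_pass.py.
from paddle.distributed.passes.pass_base import PassType, register_pass
from paddle.distributed.passes.cpp_pass import CPPPassWrapper


@register_pass("fuse_relu_depthwise_conv")
class FuseReluDepthwiseConvPass(CPPPassWrapper):
    def __init__(self):
        super(FuseReluDepthwiseConvPass, self).__init__()

    @property
    def cpp_name(self):
        # Name of the C++ pass to run, as registered on the C++ side.
        return "fuse_relu_depthwise_conv_pass"

    def _type(self):
        return PassType.FUSION_OPT
```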
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
index 9206d744990008496e7af43d67e000f9d00f6dab..80acf7217e76fb996e6b76aa519307c44952636e 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
@@ -159,10 +159,13 @@ def test_dp_stage2():
     mlp2 = MLP()
     mlp3 = MLP()
     mlp4 = MLP()
+    mlp5 = MLP()
     mlp1.set_state_dict(state_dict)
     mlp2.set_state_dict(state_dict)
     mlp3.set_state_dict(state_dict)
     mlp4.set_state_dict(state_dict)
+    mlp5.set_state_dict(state_dict)
+
     dp_params = train_mlp(
         mlp1, sharding_stage="dp", use_pure_fp16=False, opt_group=False)
     stage2_params = train_mlp(
@@ -181,6 +184,11 @@ def test_dp_stage2():
         rtol=1e-5,
         atol=1e-5)
 
+    stage2_params = train_mlp(
+        mlp5, sharding_stage=2, use_pure_fp16=False, opt_group=True)
+    for i in range(len(dp_params)):
+        np.testing.assert_allclose(
+            dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6)
     return
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py
index f7e426377382bb089d9a4c4f968759f38c40e647..84ffe9094d8126ac75f864022659cbf2e101ad65 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py
@@ -49,7 +49,7 @@ def train_mlp(model, offload=False):
     optimizer = ShardingOptimizerStage2(
         params=model.parameters(), optim=optimizer, offload=offload)
     model = ShardingStage2(
-        model, optimizer, buffer_max_size=2**21, accumulate_grads=True)
+        model, optimizer, buffer_max_size=2**21, accumulate_grads=False)
 
     train_reader = paddle.batch(
         reader_decorator(linear_size), batch_size=batch_size, drop_last=True)
@@ -98,12 +98,11 @@ def test_sharding_stage2_offload():
     mlp_offload_params = train_mlp(mlp_offload, offload=True)
 
     for i in range(len(mlp_params)):
-        for j in range(len(mlp_offload_params)):
-            if mlp_params[i].name == mlp_offload_params[j].name:
-                np.testing.assert_allclose(
-                    mlp_params[i].numpy(),
-                    mlp_offload_params[j].numpy(),
-                    rtol=1e-6)
+        np.testing.assert_allclose(
+            mlp_params[i].numpy(),
+            mlp_offload_params[i].numpy(),
+            rtol=5e-3,
+            atol=5e-3)
     return
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py
index de980f3c3f787e4e55a9ac06b92609d0cbbfb9c6..430c6e0884822dc9d38f593b4cee26f96ed18b3b 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py
@@ -31,5 +31,19 @@ class TestPPClipGrad(TestDistPPTraning):
         return scheduler, optimizer
 
 
+class TestPPClipGradParamGroup(TestDistPPTraning):
+    def build_optimizer(self, model):
+        grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
+        scheduler = paddle.optimizer.lr.PiecewiseDecay(
+            boundaries=[2], values=[0.001, 0.002], verbose=True)
+        optimizer = paddle.optimizer.Momentum(
+            learning_rate=scheduler,
+            grad_clip=grad_clip,
+            parameters=[{
+                "params": model.parameters()
+            }])
+        return scheduler, optimizer
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/ernie_training.py
b/python/paddle/fluid/tests/unittests/ipu/ernie_training.py index bedf0a38549b8ebd23563389e05dff6f26967933..ddda666db2c0cb043624ff7249d0ea08c455c0a4 100644 --- a/python/paddle/fluid/tests/unittests/ipu/ernie_training.py +++ b/python/paddle/fluid/tests/unittests/ipu/ernie_training.py @@ -856,16 +856,15 @@ if __name__ == "__main__": paddle.static.load(main_prog, "model/ernie") if args.run_on_ipu: - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.num_ipus = args.num_ipus - ipu_strategy.enable_manual_shard = args.num_ipus > 1 - ipu_strategy.enable_pipelining = args.enable_pipelining - if args.enable_pipelining: - if args.is_training: - ipu_strategy.batches_per_step = args.num_ipus + 1 - else: - ipu_strategy.batches_per_step = args.num_ipus - ipu_strategy.is_training = args.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig( + num_ipus=args.num_ipus, + is_training=args.is_training, + enable_manual_shard=args.num_ipus > 1) + ipu_strategy.SetPipeliningConfig( + enable_pipelining=args.enable_pipelining, + batches_per_step=args.num_ipus + 1) + ipu_compiler = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy) program = ipu_compiler.compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py index 0f726accfa83c66784ef51bddd660c20e7968ffd..58a88c113fc0b6b82c1c58d50a1b0824cb530632 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py @@ -72,8 +72,8 @@ class TestRelu(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IpuCompiler( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py index c4d8b3ee89f439091e065dbfe5d3277c3e16b64b..a23cacf47636b434b3211ca377e8e6a5e79fa64b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py @@ -81,10 +81,12 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training - # enable avg shard pass - ipu_strategy.need_avg_shard = True + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig( + num_ipus=2, + is_training=self.is_training, + enable_manual_shard=True, + need_avg_shard=True) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py index ee81354c44620e3807e3c191be7fa62c0a9473c4..87f783dbd1c1aef2f5bcc40b407dff9f4bbe0916 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py @@ -79,8 +79,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) 
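The IPU hunks above and below all repeat one mechanical migration: the free-form attributes on `compiler.get_ipu_strategy()` become grouped setters on `paddle.static.IpuStrategy`. Collected in one place, a sketch of the new surface as these tests use it; an IPU-enabled build is assumed, and `batches_per_step=3` is just an illustrative value:

```python
# Sketch of the new grouped-setter API, assuming an IPU-enabled build.
import paddle

paddle.enable_static()

ipu_strategy = paddle.static.IpuStrategy()
ipu_strategy.SetGraphConfig(
    num_ipus=2,                 # was: ipu_strategy.num_ipus = 2
    is_training=False,          # was: ipu_strategy.is_training = False
    enable_manual_shard=True,   # was: ipu_strategy.enable_manual_shard = True
    need_avg_shard=True)        # was: ipu_strategy.need_avg_shard = True
ipu_strategy.SetPipeliningConfig(
    enable_pipelining=True,     # was: ipu_strategy.enable_pipelining = True
    batches_per_step=3)         # was: ipu_strategy.batches_per_step = 3
ipu_strategy.SetHalfConfig(
    enable_fp16=True)           # was: ipu_strategy.enable_fp16 = True
```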
program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py index 19026f5e05989c97b655da020a94a15f7b4fe0fe..6e58f809046000bb7a41b9875f9ebf945b86fd07 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py @@ -81,8 +81,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py index 2b59f2bb729e4f5e12eec0c1f72dec2ff292536f..094b19ce99da9c73c188f000dc7080504bc9ff3e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py @@ -83,8 +83,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py index fb237c30d49cdc7345275558de3350c69fcd480b..f28733de6b1a12a8aac362e30c8478a145520506 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py @@ -81,8 +81,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py index 9d49b054370b7a9bad1f85bd2a57dfcf840d8b25..3987c6cd5b386ae22a2fcac1a985e5e915a3e5ae 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py @@ -93,8 +93,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py index 1cd2037f2283e012918303b051396b9b0abb1977..8b1560edfd81de65f495a1bd0609bfa09c5f7810 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py @@ -83,8 +83,8 @@ 
class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py index 4aa202ace2fc278c9c0835d7910448e9d0871cce..07b06d77c90ffb41d3a221c94d52bdc2c7d1a3a5 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py @@ -75,8 +75,8 @@ class TestMul(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py index b2548a17634d721b882509ea5bd27bc459120f38..c319894bfae250789bf9931343ac29106629a148 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py @@ -84,8 +84,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py index 018c9a7876f886093c9e8345f073ceaa08eb237a..5b7ea61568ecd5772d574dd3cc63fac74535c903 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py @@ -76,8 +76,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) @@ -142,8 +142,8 @@ class TestCase1(TestBase): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py index 6a43fb46eea496ca42f2863cdbf4b20ace1677fe..c62e0c08f9c79c2ee1217b2b74df84c26cf30f1f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py @@ -78,8 +78,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy 
= compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py index af220a34ddb02652edf690c700309830b5914733..d5be8ae0cf77526a6aefa1fd060a510689c43cc0 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py @@ -83,8 +83,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py index 6c9d3f29adf3c0674045e653572e892141b699c7..ca8c0935d782cc275838871e2c73a3eba9454e5b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py @@ -81,8 +81,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py index 1afc0cb9ed330d95de68e29743980e9920922beb..eb644c2c6670f5beedbb3ad0b1868bc8a0f9434e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py @@ -101,8 +101,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py index b50ed7bdbab52fd7d49b1bd202406930da109ced..ee9cd875cf29884da58dc7f0488e7f9bfc50d4e5 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py @@ -97,8 +97,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py index 
6b549b306f0d379d46ed3597c116fbcbfabccbb4..9b485d7794db2cbb538317b0d664d6ece9799a83 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py
@@ -62,9 +62,14 @@ class TestFunc(unittest.TestCase):
         if run_ipu:
             feed_list = [image.name]
             fetch_list = [out.name]
-            ipu_strategy = compiler.get_ipu_strategy()
-            ipu_strategy.is_training = False
-            ipu_strategy.batches_per_step = bps
+            ipu_strategy = paddle.static.IpuStrategy()
+            ipu_strategy.SetGraphConfig(
+                num_ipus=2,
+                is_training=False,
+                enable_manual_shard=True,
+                need_avg_shard=True)
+            ipu_strategy.SetPipeliningConfig(
+                enable_pipelining=True, batches_per_step=bps)
             program = compiler.IPUCompiledProgram(
                 main_prog,
                 ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py
index d135e5a586e7dffd667adc88179a8df830deb245..aa6c05dc59a87f844c19912be484a4b007f0adfc 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py
@@ -83,9 +83,9 @@ class TestBase(IPUOpTest):
         feed = self.feed_ipu if run_ipu else self.feed_cpu
         if run_ipu:
             feed_list = self.feed_list
-            ipu_strategy = compiler.get_ipu_strategy()
-            ipu_strategy.is_training = False
-            ipu_strategy.enable_fp16 = True
+            ipu_strategy = paddle.static.IpuStrategy()
+            ipu_strategy.SetGraphConfig(is_training=False)
+            ipu_strategy.SetHalfConfig(enable_fp16=True)
             program = compiler.IPUCompiledProgram(
                 main_prog,
                 ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py
index f8ab3f81e9d3d63c2fe460dd359e3a2a54e02b7d..0a331d804545d49eeaffdf0c8054db89041f2c29 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py
@@ -94,8 +94,9 @@ class TestBase(IPUOpTest):
         exe = paddle.static.Executor(place)
         exe.run(startup_prog)
 
-        ipu_strategy = compiler.get_ipu_strategy()
-        ipu_strategy.is_training = self.attrs['is_training']
+        ipu_strategy = paddle.static.IpuStrategy()
+        ipu_strategy.SetGraphConfig(
+            is_training=self.attrs['is_training'])
         program = compiler.IPUCompiledProgram(
             main_prog, ipu_strategy=ipu_strategy).compile(
                 self.feed_list, fetch_list)
@@ -123,8 +124,8 @@ class TestBase(IPUOpTest):
             if run_ipu:
                 feed_list = feed_target_names
                 fetch_list = [fetch_targets[0].name]
-                ipu_strategy = compiler.get_ipu_strategy()
-                ipu_strategy.is_training = False
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.SetGraphConfig(is_training=False)
                 program = compiler.IPUCompiledProgram(
                     inference_program,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py
index 00fc0dd6633aed5b3dbc08a5170f476ba6d160ef..e1ed7603ed6272ba91cf485d91f512f07b72a258 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py
@@ -59,11 +59,9 @@ class TestCastNet(unittest.TestCase):
         if run_ipu:
             feed_list = [image.name]
             fetch_list = [loss.name]
-            ipu_strategy = compiler.get_ipu_strategy()
-            ipu_strategy.num_ipus
= 2 - ipu_strategy.is_training = False - ipu_strategy.enable_manual_shard = True - ipu_strategy.enable_pipelining = False + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig( + num_ipus=2, is_training=False, enable_manual_shard=True) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py index 741ca8784bb602c5d8ab855b11d2478b3606c130..afeec9ee1b6fa75961aa76bd2f2c2f6701e200b5 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py @@ -29,7 +29,7 @@ SEED = 2021 "core is not compiled with IPU") class TestConvNet(unittest.TestCase): def test_training(self): - ipu_strategy = compiler.get_ipu_strategy() + ipu_strategy = paddle.static.IpuStrategy() assert ipu_strategy.num_ipus == 1, "Default num_ipus must be 1" assert ipu_strategy.is_training == True, "Default is_training is True" @@ -38,17 +38,16 @@ class TestConvNet(unittest.TestCase): assert ipu_strategy.enable_manual_shard == False, \ "Default enable_manual_shard is False" - ipu_strategy.num_ipus = 2 + ipu_strategy.SetGraphConfig( + num_ipus=2, is_training=False, enable_manual_shard=True) + ipu_strategy.SetPipeliningConfig(enable_pipelining=True) assert ipu_strategy.num_ipus == 2, "Set num_ipus Failed" - ipu_strategy.is_training = False assert ipu_strategy.is_training == False, "Set is_training Failed" - ipu_strategy.enable_pipelining = True assert ipu_strategy.enable_pipelining == True, \ "Set enable_pipelining Failed" - ipu_strategy.enable_manual_shard = True assert ipu_strategy.enable_manual_shard == True, \ "Set enable_manual_shard Failed" diff --git a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py index 043bc8ad36296228624c4b8c1e9f34b8e872f962..196f94b68f94a08f1b08871c805a9d7ceee14ffa 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py @@ -104,8 +104,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py index 6f85c4f381e5d74a0e09acb00587fe8c275f09c5..dc3cab6ac5e114f7083937687c1dfedf0ebd1c44 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py @@ -79,8 +79,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py index 
2443541c79991d23f20f93b91eb6773b7e9419df..31b0c99603c3f707328eef5a3713bcd9b882b948 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py @@ -96,8 +96,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py index 0aac2344b3cf2811c990ae90288c966158e53673..38b91785aeec8c061a5d4f6203363645569c2824 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py @@ -70,8 +70,8 @@ class TestConvNet(unittest.TestCase): if run_ipu: feed_list = [image.name] fetch_list = [loss.name] - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = True + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=True) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py index 7133a76c607cbfaa34882106b459e71b9df48edd..c6702b92ab969ec7b1b97c44bc9245c6df41d83a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py @@ -87,8 +87,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py index 87c67fb72c3df8a1b5f171465f30146f1f8fbab8..f04d712755deadd44158431569430909f19a093e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py @@ -76,8 +76,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py index 678276eba317862b3e9385badddf04c9e57c2e61..78a2589d9aca59ec72d22aa6fa35f9f934f94caf 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py @@ -86,8 +86,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = 
compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py index cb9967831feee363b75a3a1fccc7ec37754dffdd..e81591ad68368033bc9b5223753203ae3126c005 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py @@ -87,8 +87,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py index fffac1218576bf747548be6016e2bdb340e82b30..a7c45c6686f10e3c13462d49ab8e34fe72c4f03a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py @@ -87,8 +87,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py index 8ede44d7f92297745bd4c621adcf0d3846afff59..5059de7ba77b1baf65c511da11735a73a87aa1e8 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py @@ -78,8 +78,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) @@ -141,8 +141,8 @@ class TestCase1(TestBase): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py index f8c96684afe8f3c8222fcc4ec6f9c179a73158fd..ac8ad08e8b28c00555c5c78f5b8a834b0024acc7 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py @@ -71,8 +71,8 @@ class TestMean(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git 
a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py index a6e0552691b857c6f53bf8dc327c0e874bab8708..f312b7b69ad79b721fd768f6762a48ca68793d6d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py @@ -82,8 +82,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py index 2cc0d1770be13cea265d521c20dc4dabe186dbd5..5163838bc0cd633f69e2e446294250686e4fe04f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py @@ -76,8 +76,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_save_load.py b/python/paddle/fluid/tests/unittests/ipu/test_save_load.py index 5d9ff4eb886c02a551bdc6ef9ffc2c2e653b999a..24bb8e111842cb93ed35cdc796868c5a911ee36f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_save_load.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_save_load.py @@ -93,8 +93,9 @@ class TestBase(IPUOpTest): if not save_otherwise_load: paddle.static.load(main_prog, "model/model") - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.attrs['is_training'] + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig( + is_training=self.attrs['is_training']) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile( self.feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py index 94758155b35a82d3971f6f25192eb5516276e7ed..6ad2a89a738b7090ce082a637c3d531cf4566f9a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py @@ -82,8 +82,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) @@ -175,8 +175,8 @@ class TestCase4(TestBase): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git 
a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py index 335e90b4607b2fe84b2c37b2a1bae75fca2a0044..93945b98ef0a26b35b28e1cf9d2bb5e88d21f308 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py @@ -81,10 +81,9 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training - # set batch size - ipu_strategy.batch_size = 2 + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig( + batch_size=2, is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py b/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py index ee469c5fc1de992dc24626116c1d9ed06410690f..df0e2a040bd3e55aac20e383c7aa2ff50c567de4 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py @@ -59,8 +59,8 @@ class TestSGD(unittest.TestCase): if run_ipu: feed_list = [image.name] fetch_list = [loss.name] - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = True + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=True) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py index 7261f26c0ec108a50514e0361d28f5de274f1176..3bdfeabce6592cd25067adfdabe8b7c74a6848c7 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py @@ -80,8 +80,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) @@ -159,8 +159,8 @@ class TestCase2(TestBase): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py index 36cb529c231a0fe9c875fa7f2d4827c63d5ca447..a4a4b83baf35e558ff1f8b0982ffc44287919dfc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py @@ -77,8 +77,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git 
a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py index 672e6ede0ede10c490edbb7c41fafb05fdae235d..ccd2796590838faa8980ef7d214f73a944c9220d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py @@ -77,11 +77,11 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog diff --git a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py index 360e81f2862cddce6d54a4d2fb6f5cda631d7ae7..3d5de11b5e213e3fe68561d60c1c0dbcb6fbcbf1 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py @@ -88,8 +88,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py index 6bba02942a713daf5be6faa0344e355e2f211f8d..003350cd7a01e284d28ff8904a38ab3755e07642 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py @@ -83,8 +83,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) @@ -149,11 +149,11 @@ class TestCase1(TestBase): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog diff --git a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py index 4cbbc9e478d2e8b3719c01562e57780cede1049b..9915a7a1fd89f91f71814ebbe577abcf9327cb37 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py @@ -102,8 +102,8 @@ class TestTopKOp(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog,
ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py index 715f81b684e35b1aad955883f1198696e42d4c04..77d2f4131014965ef6cfba9cdf5efffa34c2dbc6 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py @@ -78,8 +78,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py index 9d8cab6a697081d86635fd25eeb4c1c0e9f82fce..75ed5a07315c775e4ea3e105a1fa4d6731666c70 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py @@ -76,8 +76,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py index 801794804da0a5e3df210decbfd20bbca6f450c7..fabad936decb975214f0416000d77c76a2f7ddfb 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py @@ -91,8 +91,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index 1c5b640fe4b0bd3a2e401541ee945e64c131dd32..505060e31a0a27be64e7755cbb25844ebefe66df 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -36,7 +36,7 @@ class TrtConvertElementwiseTest_one_input(TrtLayerAutoScanTest): for shape in [[32], [batch, 32], [batch, 32, 32], [batch, 32, 16, 32]]: for op_type in ["elementwise_add", "elementwise_mul"]: - for axis in [len(shape) - 1, -1]: + for axis in [-1 if len(shape) == 1 else 1]: self.dims = len(shape) dics = [{"axis": axis}] ops_config = [{ @@ -129,33 +129,7 @@ class TrtConvertElementwiseTest_one_input(TrtLayerAutoScanTest): True), 1e-5 def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if self.dims == 2 and len(self.dynamic_shape.max_input_shape) == 0: - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "The output shape are not equal between 
gpu and tensorrt when input dim is 2." ) - - def teller2(program_config, predictor_config): - if self.dims == 3: - return True - return False - - self.add_skip_case( - teller2, SkipReasons.TRT_NOT_IMPLEMENTED, - "The output has diff between gpu and tensorrt when input dim is 3.") - - def teller3(program_config, predictor_config): - if self.dims == 4: - return True - return False - - self.add_skip_case( - teller3, SkipReasons.TRT_NOT_IMPLEMENTED, - "The output has diff between gpu and tensorrt when input dim is 4.") + pass def test(self): self.add_skip_trt_case() @@ -287,15 +261,7 @@ class TrtConvertElementwiseTest_two_input_without_broadcast( yield self.create_inference_config(), (1, 3), 1e-5 def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if self.dims == 2: - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "The output shape are not equal between gpu and tensorrt when input dim is 2." - ) + pass def test(self): self.add_skip_trt_case() @@ -418,15 +384,7 @@ class TrtConvertElementwiseTest_two_input_with_broadcast(TrtLayerAutoScanTest): yield self.create_inference_config(), (1, 3), 1e-5 def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if len(self.shape1) == 2: - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "The output shape are not equal between gpu and tensorrt when input dim is 2." - ) + pass def test(self): self.add_skip_trt_case() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py index ba648042dabf755326e709a46218caa9d857b506..2e1e04870b926b05e7191b335aa6403a6380a68d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py @@ -36,26 +36,32 @@ class TrtConvertReduceMeanTest(TrtLayerAutoScanTest): return False if len(attrs[0]["dim"]) == 0: return False - ## skip not use - if attrs[0]["out_dtype"] != -1: - return False + + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000: + if attrs[0]['out_dtype'] == 2: + return False return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]]): - return np.random.random([1, 3, 64, 64]).astype(np.float32) + def generate_input1(dtype, attrs: List[Dict[str, Any]]): + if dtype == -1 or dtype == 5: + return np.random.random([1, 3, 64, 64]).astype(np.float32) + elif dtype == 2: + return np.random.random([1, 3, 64, 64]).astype(np.int32) - for keep_dim in [False, True]: + for keep_dim in [True, False]: for dim in [[], [1], [0], [0, 1], [1, 2, 3], [-2, 0, 3], [-3], [-4, 1], [3, 4, 5]]: - for reduce_all in [False, True]: - for out_dtype in [-1, 0, 1]: + for reduce_all in [True, False]: + for out_dtype in [-1, 2, 5]: dics = [{ "keep_dim": keep_dim, "dim": dim, "reduce_all": reduce_all, - "out_dtype": out_dtype + "out_dtype": out_dtype, + "in_dtype": out_dtype, }, {}] ops_config = [{ @@ -75,7 +81,7 @@ class TrtConvertReduceMeanTest(TrtLayerAutoScanTest): weights={}, inputs={ "input_data": TensorConfig(data_gen=partial( - generate_input1, dics)) + generate_input1, out_dtype, dics)) }, outputs=["reduce_output_data"]) @@ -134,16 +140,6 @@ class TrtConvertReduceMeanTest(TrtLayerAutoScanTest): pass def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - 
if program_config.ops[0].attrs['out_dtype'] != -1: - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "NOT Implemented: we will add out_dtype not equal to -1 in the future" - ) - pass def test(self): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py index ba0f61a2768988505856a0cdf2d481e182384110..2a7e673d4203a9f73b543e32a15b3a2479f5b14e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py @@ -37,26 +37,27 @@ class TrtConvertReduceSumTest(TrtLayerAutoScanTest): return False if len(attrs[0]["dim"]) == 0: return False - ## skip not use - if attrs[0]["out_dtype"] != -1: - return False return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]]): - return np.random.random([1, 3, 64, 64]).astype(np.float32) + def generate_input1(dtype, attrs: List[Dict[str, Any]]): + if dtype == -1 or dtype == 5: + return np.random.random([1, 3, 64, 64]).astype(np.float32) + elif dtype == 2: + return np.random.random([1, 3, 64, 64]).astype(np.int32) - for keep_dim in [False, True]: + for keep_dim in [True, False]: for dim in [[], [1], [0], [0, 1], [1, 2, 3], [-2, 0, 3], [-3], [-4, 1], [3, 4, 5]]: - for reduce_all in [False, True]: - for out_dtype in [-1, 0, 1]: + for reduce_all in [True, False]: + for out_dtype in [-1, 2, 5]: dics = [{ "keep_dim": keep_dim, "dim": dim, "reduce_all": reduce_all, - "out_dtype": out_dtype + "out_dtype": out_dtype, + "in_dtype": out_dtype, }, {}] ops_config = [{ @@ -76,7 +77,7 @@ class TrtConvertReduceSumTest(TrtLayerAutoScanTest): weights={}, inputs={ "input_data": TensorConfig(data_gen=partial( - generate_input1, dics)) + generate_input1, out_dtype, dics)) }, outputs=["reduce_output_data"]) @@ -134,16 +135,6 @@ class TrtConvertReduceSumTest(TrtLayerAutoScanTest): pass def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if program_config.ops[0].attrs['out_dtype'] != -1: - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "NOT Implemented: we will add out_dtype not equal to -1 in the future" - ) - pass def test(self): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py new file mode 100644 index 0000000000000000000000000000000000000000..2150e06381fac37e527d9d593f8752eb38ba1596 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py @@ -0,0 +1,702 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
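The reference helpers in the new MLU test below reduce batch_norm inference to elementwise NumPy arithmetic. As a reader aid, a minimal, self-contained sketch of that formula for NHWC layout (function and variable names here are illustrative, not part of the patch):

import numpy as np

def batch_norm_infer_nhwc(x, scale, offset, mean, var, epsilon=1e-5):
    # Normalize with the running statistics, then apply the affine transform.
    # NumPy broadcasting over the trailing channel axis handles NHWC directly.
    normalized = (x - mean) / np.sqrt(var + epsilon)
    return normalized * scale + offset

c = 3
x = np.random.random_sample((2, 4, 4, c)).astype(np.float32) - 0.5
y = batch_norm_infer_nhwc(x, np.ones(c, np.float32), np.zeros(c, np.float32),
                          np.zeros(c, np.float32), np.ones(c, np.float32))
assert y.shape == x.shape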
+ +from __future__ import print_function + +import os +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +import sys +sys.path.append('..') +from op_test import OpTest, _set_use_system_allocator +from paddle.fluid.framework import grad_var_name +from paddle.fluid import Program, program_guard + +_set_use_system_allocator(True) + + +def _reference_testing(x, scale, offset, mean, var, epsilon, data_format): + x_shape = x.shape + if len(x_shape) == 2: + if data_format == "NCHW": + x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1)) + else: + x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1])) + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + + if data_format == "NCHW": + n, c, h, w = x.shape + mean_tile = np.reshape(mean, (1, c, 1, 1)) + mean_tile = np.tile(mean_tile, (n, 1, h, w)) + var_tile = np.reshape(var, (1, c, 1, 1)) + var_tile = np.tile(var_tile, (n, 1, h, w)) + normalized = (x - mean_tile) / np.sqrt(var_tile + epsilon) + scale_tile = np.reshape(scale, (1, c, 1, 1)) + scale_tile = np.tile(scale_tile, (n, 1, h, w)) + offset_tile = np.reshape(offset, (1, c, 1, 1)) + offset_tile = np.tile(offset_tile, (n, 1, h, w)) + y = normalized * scale_tile + offset_tile + elif data_format == "NHWC": + normalized = (x - mean) / np.sqrt(var + epsilon) + y = normalized * scale + offset + else: + raise ValueError("Unknown data order.") + + if len(x_shape) == 2 or len(x_shape) == 3: + y = np.reshape(y, x_shape) + return y + + +def _cal_mean_variance(x, epsilon, data_format): + assert data_format in ['NCHW', 'NHWC'] + x_shape = x.shape + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + x_square = x * x + axis = (0, 2, 3) if data_format == 'NCHW' else (0, 1, 2) + C = x.shape[1] if data_format == 'NCHW' else x.shape[-1] + x_square_sum = np.sum(x_square, axis) + x_sum = np.sum(x, axis=axis) + element_count = np.size(x) / C + mean = x_sum / element_count + var = x_square_sum / element_count - mean * mean + return mean, var + + +def _reference_training(x, scale, offset, epsilon, data_format): + x_shape = x.shape + + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + + if data_format == "NCHW": + n, c, h, w = x.shape + x_square = x * x + x_square_sum = np.sum(x_square, (0, 2, 3)) + x_sum = np.sum(x, axis=(0, 2, 3)) + element_count = np.size(x) / int(np.shape(x)[1]) + mean = x_sum / element_count + var = x_square_sum / element_count - mean * mean + mean_tile = np.reshape(mean, (1, c, 1, 1)) + mean_tile = np.tile(mean_tile, (n, 1, h, w)) + var_tile = np.reshape(var, (1, c, 1, 1)) + var_tile = np.tile(var_tile, (n, 1, h, w)) + normalized = (x - mean_tile) / np.sqrt(var_tile + epsilon) + scale_tile = np.reshape(scale, (1, c, 1, 1)) + scale_tile = np.tile(scale_tile, (n, 1, h, w)) + offset_tile = np.reshape(offset, (1, c, 1, 1)) + offset_tile = np.tile(offset_tile, (n, 1, h, w)) + y = normalized * scale_tile + offset_tile + elif data_format == "NHWC": + x_square = x * x + x_square_sum = np.sum(x_square, (0, 1,
2)) + x_sum = np.sum(x, axis=(0, 1, 2)) + element_count = np.size(x) / int(np.shape(x)[-1]) + mean = x_sum / element_count + var = x_square_sum / element_count - mean * mean + normalized = (x - mean) / np.sqrt(var + epsilon) + y = normalized * scale + offset + else: + raise ValueError("Unknown data order.") + + if len(x_shape) == 3: + y = np.reshape(y, x_shape) + return y, mean, var + + +def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): + # Use the following formulas to calculate gradients: + # grad_scale = + # sum(grad_y * (x - mean)) * rsqrt(var + epsilon) + # + # grad_offset = sum(grad_y) + # + # x_grad = + # 1/N * scale * rsqrt(var + epsilon) * (N * grad_y - sum(grad_y) - + # (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon)) + + # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation + if data_format != "NCHW" and data_format != "NHWC": + raise ValueError("Unknown data order.") + + x_shape = x.shape + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + y_grad = np.reshape(y_grad, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + y_grad = np.reshape(y_grad, (x_shape[0], x_shape[1], 1, x_shape[2])) + + if data_format == "NCHW": + x = np.transpose(x, (0, 2, 3, 1)) + y_grad = np.transpose(y_grad, (0, 2, 3, 1)) + + x_grad = scale * (y_grad - np.mean( + y_grad, axis=(0, 1, 2)) - (x - mean) * np.mean( + y_grad * (x - mean), axis=(0, 1, 2)) / + (var + epsilon)) / np.sqrt(var + epsilon) + grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon), + axis=(0, 1, 2)) + grad_offset = np.sum(y_grad, axis=(0, 1, 2)) + + # transfer back to N, C, H, W + if data_format == "NCHW": + x_grad = np.transpose(x_grad, (0, 3, 1, 2)) + x = np.transpose(x, (0, 3, 1, 2)) + y_grad = np.transpose(y_grad, (0, 3, 1, 2)) + + if len(x_shape) == 3: + x_grad = np.reshape(x_grad, x_shape) + + return x_grad, grad_scale, grad_offset + + +def create_or_get_tensor(scope, var_name, var, place): + tensor = scope.var(var_name).get_tensor() + if var is not None: + assert isinstance(var, np.ndarray) + tensor.set_recursive_sequence_lengths([]) + tensor.set(var, place) + return tensor + + +def set_output_grad(scope, outputs, place, feed_dict=None): + def __set_tensor__(name, data=None): + out_tensor = scope.find_var(name).get_tensor() + grad_tensor = scope.var(grad_var_name(name)).get_tensor() + out_dtype = out_tensor.dtype() + if data is None: + if out_dtype == core.VarDesc.VarType.FP64: + data = np.ones(out_tensor.shape(), dtype=np.float64) + elif out_dtype == core.VarDesc.VarType.FP32: + data = np.ones(out_tensor.shape(), dtype=np.float32) + else: + raise ValueError("Not supported data type " + str(out_dtype)) + grad_tensor.set(data, place) + + for output in outputs: + data = None + if feed_dict is not None and output in feed_dict: + data = feed_dict[output] + __set_tensor__(output, data) + + +class TestBatchNormOpInference(unittest.TestCase): + def setUp(self): + self.dtype = np.float32 + self.fuse_with_relu = False + self.init_kernel_type() + + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) + + def check_with_place(self, place, data_layout, dtype, shape): + epsilon = 0.00001 + if len(shape) == 2: + x_shape = shape + c = x_shape[1] + else: + n, h, w, c = shape[0], shape[1], shape[2], shape[3] + if data_layout == "NHWC": + x_shape = [n, h, w, c] + elif data_layout ==
"NCHW": + x_shape = [n, c, h, w] + else: + raise ValueError("Unknown data layout.") + scale_shape = [c] + + x_val = np.random.random_sample(x_shape).astype(dtype) + # generate some negative values to test case with relu fused + x_val = x_val - 0.5 + scale_val = np.random.random_sample(scale_shape).astype(np.float32) + bias_val = np.random.random_sample(scale_shape).astype(np.float32) + + mean = np.zeros(scale_shape).astype(np.float32) + variance = np.ones(scale_shape).astype(np.float32) + + y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance, + epsilon, data_layout).astype(dtype) + if self.fuse_with_relu: + y_out = np.maximum(y_out, 0) + + scope = core.Scope() + + # create input + x_tensor = create_or_get_tensor(scope, "x_val", + OpTest.np_dtype_to_fluid_dtype(x_val), + place) + scale_tensor = create_or_get_tensor( + scope, "scale_val", + OpTest.np_dtype_to_fluid_dtype(scale_val), place) + bias_tensor = create_or_get_tensor( + scope, "bias_val", OpTest.np_dtype_to_fluid_dtype(bias_val), place) + mean_tensor = create_or_get_tensor(scope, "mean", + OpTest.np_dtype_to_fluid_dtype(mean), + place) + variance_tensor = create_or_get_tensor( + scope, "variance", OpTest.np_dtype_to_fluid_dtype(variance), place) + + # create output + y_tensor = create_or_get_tensor(scope, "y_out", None, place) + saved_mean_tensor = create_or_get_tensor(scope, "saved_mean", None, + place) + saved_variance_tensor = create_or_get_tensor(scope, "saved_variance", + None, place) + mean_out_tensor = mean_tensor + variance_out_tensor = variance_tensor + + batch_norm_op = Operator( + "batch_norm", + # inputs + X="x_val", + Scale="scale_val", + Bias="bias_val", + Mean="mean", + Variance="variance", + # outputs + Y="y_out", + MeanOut="mean", + VarianceOut="variance", + SavedMean="saved_mean", + SavedVariance="saved_variance", + # attrs + is_test=True, + data_layout=data_layout, + use_mkldnn=False, + fuse_with_relu=self.fuse_with_relu, + epsilon=epsilon) + + batch_norm_op.run(scope, place) + + # check inference result + self.__assert_close( + y_tensor, + y_out, + "inference output are different at " + str(place) + ", " + + data_layout + ", " + str(np.dtype(dtype)) + + str(np.array(y_tensor)) + str(y_out), + atol=1e-3) + + def test_check_output(self): + places = [core.CPUPlace()] + if core.is_compiled_with_mlu(): + places.append(core.MLUPlace(0)) + + for place in places: + for data_format in ["NCHW", "NHWC"]: + self.check_with_place(place, data_format, self.dtype, + [2, 3, 4, 5]) + self.check_with_place(place, data_format, self.dtype, [2, 3]) + + def init_kernel_type(self): + pass + + +class TestFP16BatchNormOpInference(TestBatchNormOpInference): + def setUp(self): + self.dtype = np.float16 + self.fuse_with_relu = False + self.init_kernel_type() + + def test_check_output(self): + places = [] + if core.is_compiled_with_mlu(): + places.append(core.MLUPlace(0)) + + for place in places: + for data_format in ["NCHW", "NHWC"]: + self.check_with_place(place, data_format, self.dtype, + [2, 3, 4, 5]) + self.check_with_place(place, data_format, self.dtype, [2, 3]) + + +class TestBatchNormOpTraining(unittest.TestCase): + def setUp(self): + self.fuse_with_relu = False + self.data_formats = ["NCHW", "NHWC"] + self.momentum = 0.9 + self.use_momentum_variable = False + self.epsilon = 0.00001 + self.init_kernel_type() + self.init_test_case() + + def init_test_case(self): + self.use_global_stats = False + self.no_grad_set = set() + self.fetch_list = [ + 'y', 'mean', 'variance', 'saved_mean', 'saved_variance', 'x@GRAD', + 
'scale@GRAD', 'bias@GRAD' + ] + + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) + + def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, + epsilon, momentum, shape, data_layout): + # run forward + y, saved_mean, var_ref = _reference_training(x, scale, bias, epsilon, + data_layout) + mean_out = saved_mean * (1. - momentum) + momentum * mean + variance_out = var_ref * (1. - momentum) + momentum * variance + saved_variance = 1. / np.sqrt(var_ref + epsilon) + # run backward + x_grad, scale_grad, bias_grad = _reference_grad( + x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout) + + return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad + + def set_mean_variance(self, scale_shape, x, data_layout): + mean, variance = _cal_mean_variance(x, self.epsilon, data_layout) + mean_pre = np.zeros(scale_shape).astype(np.float32) + variance_pre = np.ones(scale_shape).astype(np.float32) + # computing global mean/variance for one step + if self.use_global_stats: + mom = self.momentum + mean = mean * (1. - mom) + mom * mean_pre + variance = variance * (1. - mom) + mom * variance_pre + return mean, variance + + def test_forward_backward(self): + def test_with_place(place, data_layout, shape): + # attr + epsilon = self.epsilon + momentum = self.momentum + if data_layout == "NCHW": + n, c, h, w = shape[0], shape[1], shape[2], shape[3] + else: + n, h, w, c = shape[0], shape[1], shape[2], shape[3] + scale_shape = [c] + + np.random.seed(123) + x = np.random.random_sample(shape).astype(np.float32) + scale = np.random.random_sample(scale_shape).astype(np.float32) + bias = np.random.random_sample(scale_shape).astype(np.float32) + mean, variance = self.set_mean_variance(scale_shape, x, data_layout) + y_grad = np.random.random_sample(shape).astype(np.float32) + momentum_var = np.array([momentum]).astype(np.float32) + + y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward( + x, y_grad, scale, bias, mean, variance, epsilon, momentum, + shape, data_layout) + + var_dict = locals() + var_dict['y@GRAD'] = y_grad + var_dict['x@GRAD'] = x_grad + var_dict['scale@GRAD'] = scale_grad + var_dict['bias@GRAD'] = bias_grad + + var_names = [ + 'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean', + 'saved_variance', 'momentum_var' + ] + ground_truth = {name: var_dict[name] for name in var_names} + + program = fluid.Program() + with fluid.program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, + dtype='float32', + shape=ground_truth[name].shape) + inputs = { + "X": block.var('x'), + "Scale": block.var('scale'), + "Bias": block.var('bias'), + "Mean": block.var('mean'), + "Variance": block.var('variance') + } + attrs = { + "epsilon": epsilon, + "is_test": False, + "data_layout": data_layout, + "use_mkldnn": False, + "fuse_with_relu": self.fuse_with_relu, + "use_global_stats": self.use_global_stats + } + if self.use_momentum_variable: + inputs['MomentumTensor'] = block.var('momentum_var') + else: + attrs['momentum'] = momentum + + outputs = { + "Y": block.var('y'), + "MeanOut": block.var('mean'), # share memory + "VarianceOut": block.var('variance'), # share memory + "SavedMean": block.var('saved_mean'), + "SavedVariance": block.var('saved_variance') + } + block.create_var(name="reserve_space", dtype='float32') + outputs["ReserveSpace"] = block.var('reserve_space') + bn_op = 
block.append_op( + type="batch_norm", + inputs=inputs, + outputs=outputs, + attrs=attrs) + block.create_var(name='y@GRAD', dtype='float32', shape=y.shape) + + # generate backward op_desc + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + bn_op.desc, self.no_grad_set, []) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + for var_name in grad_op_desc.output_arg_names(): + block.desc.var(var_name.encode("ascii")) + grad_op_desc.infer_var_type(block.desc) + grad_op_desc.infer_shape(block.desc) + for arg in grad_op_desc.output_arg_names(): + grad_var = block.desc.find_var(arg.encode("ascii")) + grad_var.set_dtype(core.VarDesc.VarType.FP32) + + program._sync_with_cpp() + + exe = fluid.Executor(place) + out = exe.run(program, + feed={ + name: var_dict[name] + for name in [ + 'x', 'scale', 'bias', 'mean', 'variance', + 'y@GRAD', 'momentum_var' + ] + }, + fetch_list=self.fetch_list) + + for id, name in enumerate(self.fetch_list): + if name == 'variance': + self.__assert_close( + var_dict[name], out[id], name, atol=1e-3) + continue + self.__assert_close(var_dict[name], out[id], name) + print("op test forward passed: ", str(place), data_layout) + + places = [core.CPUPlace()] + + if core.is_compiled_with_mlu(): + places.append(core.MLUPlace(0)) + + for place in places: + for data_format in self.data_formats: + test_with_place(place, data_format, [2, 3, 4, 5]) + + def init_kernel_type(self): + pass + + +class TestBatchNormOpTrainingCase1(TestBatchNormOpTraining): + def init_test_case(self): + self.use_global_stats = False + self.no_grad_set = set(['scale@GRAD', 'bias@GRAD']) + self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD'] + + +class TestBatchNormOpTrainingCase2(TestBatchNormOpTraining): + def init_test_case(self): + self.use_global_stats = False + self.no_grad_set = set() + self.fetch_list = [ + 'y', 'mean', 'variance', 'saved_mean', 'saved_variance', 'x@GRAD', + 'scale@GRAD', 'bias@GRAD' + ] + os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = "1" + + +class TestBatchNormOpTrainingCase3(TestBatchNormOpTraining): + def init_test_case(self): + self.use_global_stats = False + self.no_grad_set = set(['x@GRAD']) + self.fetch_list = ['y', 'mean', 'variance', 'scale@GRAD', 'bias@GRAD'] + + +class TestBatchNormOpTrainingMomentumVariable(TestBatchNormOpTraining): + def init_test_case(self): + self.use_momentum_variable = True + self.use_global_stats = False + self.no_grad_set = set() + self.fetch_list = [ + 'y', 'mean', 'variance', 'saved_mean', 'saved_variance', 'x@GRAD', + 'scale@GRAD', 'bias@GRAD' + ] + + +class TestBatchNormOpFreezeStatsTraining(TestBatchNormOpTraining): + def init_test_case(self): + self.use_global_stats = True + self.no_grad_set = set() + self.fetch_list = [ + 'y', 'mean', 'variance', 'x@GRAD', 'scale@GRAD', 'bias@GRAD' + ] + + def reference_grad(self, x, y_grad, scale, mean, var, epsilon, data_format): + if data_format == "NCHW": + x = np.transpose(x, (0, 2, 3, 1)) + y_grad = np.transpose(y_grad, (0, 2, 3, 1)) + + x_grad = scale * y_grad / np.sqrt(var + epsilon) + grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon), + axis=(0, 1, 2)) + grad_offset = np.sum(y_grad, axis=(0, 1, 2)) + + # transfer back to N, C, H, W + if data_format == "NCHW": + x_grad = np.transpose(x_grad, (0, 3, 1, 2)) + x = np.transpose(x, (0, 3, 1, 2)) + y_grad = np.transpose(y_grad, (0, 3, 1, 2)) + + return x_grad, grad_scale, grad_offset + + def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, 
+ epsilon, momentum, shape, data_layout): + if data_layout != "NCHW" and data_layout != "NHWC": + raise ValueError("Unknown data order.") + + if data_layout == "NCHW": + x = np.transpose(x, (0, 2, 3, 1)) + + # run normalization + normalized = (x - mean) / np.sqrt(variance + epsilon) + y = normalized * scale + bias + + # transfer back to N, C, H, W + if data_layout == "NCHW": + x = np.transpose(x, (0, 3, 1, 2)) + y = np.transpose(y, (0, 3, 1, 2)) + + mean_out = mean + variance_out = variance + saved_variance = 1. / np.sqrt(variance + epsilon) + # run backward + x_grad, scale_grad, bias_grad = self.reference_grad( + x, y_grad, scale, mean, variance, epsilon, data_layout) + + return y, mean_out, variance_out, mean, saved_variance, x_grad, scale_grad, bias_grad + + +class TestBatchNormOpFreezeStatsAndScaleBiasTraining( + TestBatchNormOpFreezeStatsTraining): + def init_test_case(self): + self.use_global_stats = True + self.no_grad_set = set(['scale@GRAD', 'bias@GRAD']) + self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD'] + + +class TestBatchNormOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # the input of batch_norm must be Variable. + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + self.assertRaises(TypeError, fluid.layers.batch_norm, x1) + + # the input dtype of batch_norm must be float16 or float32 or float64 + # float16 only can be set on GPU place + x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="int32") + self.assertRaises(TypeError, fluid.layers.batch_norm, x2) + + +class TestDygraphBatchNormAPIError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + batch_norm = fluid.dygraph.BatchNorm(10) + # the input of BatchNorm must be Variable. 
+ x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + self.assertRaises(TypeError, batch_norm, x1) + + # the input dtype of BatchNorm must be float16 or float32 or float64 + # float16 only can be set on GPU place + x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="int32") + self.assertRaises(TypeError, batch_norm, x2) + + +class TestDygraphBatchNormTrainableStats(unittest.TestCase): + def test_dygraph(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_mlu(): + places.append(fluid.MLUPlace(0)) + for p in places: + shape = [4, 10, 4, 4] + + def compute(x, is_test, trainable_statistics): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + trainable_statistics=trainable_statistics) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + x = np.random.randn(*shape).astype("float32") + y1 = compute(x, False, False) + y2 = compute(x, True, True) + self.assertTrue(np.allclose(y1, y2)) + + def test_static(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_mlu(): + places.append(fluid.MLUPlace(0)) + for p in places: + exe = fluid.Executor(p) + shape = [4, 10, 16, 16] + + def compute(x_np, is_test, trainable_statistics): + with program_guard(Program(), Program()): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + trainable_statistics=trainable_statistics) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = bn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + x = np.random.randn(*shape).astype("float32") + y1 = compute(x, False, False) + y2 = compute(x, True, True) + self.assertTrue(np.allclose(y1, y2)) + + +class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): + def test_reservespace(self): + with program_guard(Program(), Program()): + paddle.enable_static() + x = np.random.random(size=(3, 10, 3, 7)).astype('float32') + x = fluid.data(name='x', shape=x.shape, dtype=x.dtype) + # Set this FLAG, the BatchNorm API will pass "reserve_space" argument into batch_norm op. + os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1' + batch_norm = fluid.dygraph.BatchNorm(7, data_layout="NHWC") + hidden1 = batch_norm(x) + os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '0' + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..f608344f6e0363864a76a23f8d8c10dace130149 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py @@ -0,0 +1,295 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
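The v2 test file below exercises the paddle.nn.BatchNorm1D/2D/3D layers, largely by checking that a channel-last layer matches a channel-first layer applied to transposed input. A hedged sketch of that equivalence, runnable on CPU with a recent Paddle build (shapes and tolerance here are illustrative):

import numpy as np
import paddle

x = paddle.randn([2, 6, 6, 4])                       # NHWC input
net1 = paddle.nn.BatchNorm2D(4, data_format="NHWC")
net2 = paddle.nn.BatchNorm2D(4)                      # default NCHW
net2.weight = net1.weight
net2.bias = net1.bias

y1 = net1(x)
# Transpose to NCHW, normalize, then transpose the result back to NHWC.
y2 = paddle.transpose(net2(paddle.transpose(x, [0, 3, 1, 2])), [0, 2, 3, 1])
assert np.allclose(y1.numpy(), y2.numpy(), atol=1e-5)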
+ +import os +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +import sys +sys.path.append("..") +from op_test import OpTest, _set_use_system_allocator +from paddle.fluid.framework import grad_var_name +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle + + +class TestBatchNorm(unittest.TestCase): + def test_name(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_mlu(): + places.append(fluid.MLUPlace(0)) + for p in places: + with fluid.dygraph.guard(p): + batch_norm1d = paddle.nn.BatchNorm1D(1, name="test") + + def test_error(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_mlu(): + places.append(fluid.MLUPlace(0)) + for p in places: + #paddle.disable_static() + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + + def error1d_dataformat(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm1d = paddle.nn.BatchNorm1D(1, data_format='NCDHW') + batch_norm1d(fluid.dygraph.to_variable(x_data_4)) + + def error2d_dataformat(): + x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + batch_norm2d = paddle.nn.BatchNorm2D(1, data_format='NCDHW') + batch_norm2d(fluid.dygraph.to_variable(x_data_3)) + + def error3d_dataformat(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm3d = paddle.nn.BatchNorm3D(1, data_format='NCL') + batch_norm3d(fluid.dygraph.to_variable(x_data_4)) + + def error1d(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm1d = paddle.nn.BatchNorm1D(1) + batch_norm1d(fluid.dygraph.to_variable(x_data_4)) + + def error2d(): + x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + batch_norm2d = paddle.nn.BatchNorm2D(1) + batch_norm2d(fluid.dygraph.to_variable(x_data_3)) + + def error3d(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm3d = paddle.nn.BatchNorm3D(1) + batch_norm3d(fluid.dygraph.to_variable(x_data_4)) + + with fluid.dygraph.guard(p): + self.assertRaises(ValueError, error1d) + self.assertRaises(ValueError, error2d) + self.assertRaises(ValueError, error3d) + self.assertRaises(ValueError, error1d_dataformat) + self.assertRaises(ValueError, error2d_dataformat) + self.assertRaises(ValueError, error3d_dataformat) + + def test_dygraph(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_mlu(): + places.append(fluid.MLUPlace(0)) + for p in places: + shape = [4, 10, 4, 4] + + def compute_v1(x, is_test, trainable_statistics): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + trainable_statistics=trainable_statistics) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v2(x): + with fluid.dygraph.guard(p): + bn = paddle.nn.BatchNorm2D(shape[1]) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v3(x, is_test, trainable_statistics): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(1.0), + trainable=False), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(0.0), + trainable=False), + trainable_statistics=trainable_statistics) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v4(x): + with fluid.dygraph.guard(p): + bn = paddle.nn.BatchNorm2D( + shape[1], 
weight_attr=False, bias_attr=False) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x, False, False) + y2 = compute_v2(x) + y3 = compute_v3(x, False, False) + y4 = compute_v4(x) + self.assertTrue(np.allclose(y1, y2)) + self.assertTrue(np.allclose(y3, y4)) + + def test_static(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_mlu(): + places.append(fluid.MLUPlace(0)) + for p in places: + exe = fluid.Executor(p) + shape = [4, 10, 16, 16] + + def compute_v1(x_np, is_test, trainable_statistics): + with program_guard(Program(), Program()): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + trainable_statistics=trainable_statistics) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = bn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + def compute_v2(x_np): + with program_guard(Program(), Program()): + bn = paddle.nn.BatchNorm2D(shape[1]) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = bn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x, False, False) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + +class TestBatchNormChannelLast(unittest.TestCase): + def setUp(self): + self.original_dtype = paddle.get_default_dtype() + paddle.set_default_dtype("float32") + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_mlu(): + self.places.append(fluid.MLUPlace(0)) + + def tearDown(self): + paddle.set_default_dtype(self.original_dtype) + + def test_1d(self): + for p in self.places: + with fluid.dygraph.guard(p): + x = paddle.randn([2, 6, 4]) + net1 = paddle.nn.BatchNorm1D(4, data_format="NLC") + net2 = paddle.nn.BatchNorm1D(4) + net2.weight = net1.weight + net2.bias = net1.bias + y1 = net1(x) + channel_first_x = paddle.transpose(x, [0, 2, 1]) + y2 = net2(channel_first_x) + y2 = paddle.transpose(y2, [0, 2, 1]) + self.assertEqual( + np.allclose( + y1.numpy(), y2.numpy(), atol=1e-07), True) + + def test_2d(self): + for p in self.places: + with fluid.dygraph.guard(p): + x = paddle.randn([2, 6, 6, 4]) + net1 = paddle.nn.BatchNorm2D(4, data_format="NHWC") + net2 = paddle.nn.BatchNorm2D(4) + net2.weight = net1.weight + net2.bias = net1.bias + y1 = net1(x) + channel_first_x = paddle.transpose(x, [0, 3, 1, 2]) + y2 = net2(channel_first_x) + y2 = paddle.transpose(y2, [0, 2, 3, 1]) + self.assertEqual( + np.allclose( + y1.numpy(), y2.numpy(), atol=1e-07), True) + + def test_3d(self): + for p in self.places: + with fluid.dygraph.guard(p): + x = paddle.randn([2, 6, 6, 6, 4]) + net1 = paddle.nn.BatchNorm3D(4, data_format="NDHWC") + net2 = paddle.nn.BatchNorm3D(4) + net2.weight = net1.weight + net2.bias = net1.bias + y1 = net1(x) + channel_first_x = paddle.transpose(x, [0, 4, 1, 2, 3]) + y2 = net2(channel_first_x) + y2 = paddle.transpose(y2, [0, 2, 3, 4, 1]) + self.assertEqual( + np.allclose( + y1.numpy(), y2.numpy(), atol=1e-07), True) + + +class TestBatchNormUseGlobalStats(unittest.TestCase): + def setUp(self): + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_mlu(): + self.places.append(fluid.MLUPlace(0))
+ self.init_test() + + ### train mode + def init_test(self): + self.use_global_stats = True + self.trainable_statistics = False + + def test_global_stats(self): + for p in self.places: + with fluid.dygraph.guard(p): + x = paddle.randn([2, 6, 6, 4]) + net1 = paddle.fluid.dygraph.BatchNorm( + 6, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(1.0)), + use_global_stats=self.use_global_stats, + trainable_statistics=self.trainable_statistics) + net2 = paddle.nn.BatchNorm2D( + 6, use_global_stats=self.use_global_stats) + net2.weight = net1.weight + net2.bias = net1.bias + if self.trainable_statistics == True: + net1.training = False + net2.training = False + y1 = net1(x) + y2 = net2(x) + self.assertEqual(np.allclose(y1.numpy(), y2.numpy()), True) + + +class TestBatchNormUseGlobalStatsCase1(TestBatchNormUseGlobalStats): + ### test mode + def init_test(self): + self.use_global_stats = False + self.trainable_statistics = True + + +class TestBatchNormUseGlobalStatsCase2(TestBatchNormUseGlobalStats): + ### train mode + def init_test(self): + self.use_global_stats = False + self.trainable_statistics = False + + +class TestBatchNormUseGlobalStatsCase3(TestBatchNormUseGlobalStats): + ### test mode + def init_test(self): + self.use_global_stats = True + self.trainable_statistics = True + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_concat_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_concat_op_mlu.py new file mode 100644 index 0000000000000000000000000000000000000000..3bfa96b70011238b48f55d628ad17794b84ff5de --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_concat_op_mlu.py @@ -0,0 +1,223 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
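The concat tests below build their expected outputs with np.concatenate after normalizing a possibly negative axis. A small NumPy sketch of that normalization (function and variable names are illustrative):

import numpy as np

def concat_ref(inputs, axis):
    # Map a negative axis onto its non-negative equivalent (clamped at 0),
    # exactly as the tests' setUp does before calling np.concatenate.
    actual_axis = axis + len(inputs[0].shape) if axis < 0 else axis
    actual_axis = actual_axis if actual_axis > 0 else 0
    return np.concatenate(inputs, axis=actual_axis)

xs = [np.random.random((5, i + 1, 4)).astype(np.float32) for i in range(3)]
assert concat_ref(xs, -2).shape == (5, 6, 4)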
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +class TestConcatOp(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "concat" + self.place = paddle.device.MLUPlace(0) + self.init_dtype() + self.init_test_data() + + self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} + self.attrs = {'axis': self.axis} + if self.axis < 0: + self.actual_axis = self.axis + len(self.x0.shape) + self.actual_axis = self.actual_axis if self.actual_axis > 0 else 0 + else: + self.actual_axis = self.axis + + self.outputs = { + 'Out': np.concatenate( + (self.x0, self.x1, self.x2), axis=self.actual_axis) + } + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['x0', 'x2'], 'Out') + self.check_grad_with_place(self.place, ['x1'], 'Out') + self.check_grad_with_place(self.place, ['x2'], 'Out') + + def init_test_data(self): + self.x0 = np.random.random((1, 4, 50)).astype(self.dtype) + self.x1 = np.random.random((2, 4, 50)).astype(self.dtype) + self.x2 = np.random.random((3, 4, 50)).astype(self.dtype) + self.axis = 0 + + +class TestConcatOp2(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.x2 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.axis = 1 + + +@skip_check_grad_ci( + reason="The function 'check_grad' for large inputs is too slow.") +class TestConcatOp3(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((1, 256, 170, 256)).astype(self.dtype) + self.x1 = np.random.random((1, 128, 170, 256)).astype(self.dtype) + self.x2 = np.random.random((1, 128, 170, 256)).astype(self.dtype) + self.axis = 1 + + def test_check_grad(self): + pass + + +@skip_check_grad_ci( + reason="This test will meet fetch error when there is a null grad. The detailed information is in PR#17015." 
+)
+class TestConcatOp4(TestConcatOp):
+    def init_test_data(self):
+        self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
+        self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
+        self.x2 = np.random.random((0, 3, 4, 5)).astype(self.dtype)
+        self.axis = 0
+
+    def test_check_grad(self):
+        pass
+
+
+class TestConcatOp5(TestConcatOp):
+    def init_test_data(self):
+        self.x0 = np.random.random((5, 1, 4, 5)).astype(self.dtype)
+        self.x1 = np.random.random((5, 2, 4, 5)).astype(self.dtype)
+        self.x2 = np.random.random((5, 3, 4, 5)).astype(self.dtype)
+        self.axis = -3
+
+
+#----------------Concat Fp16----------------
+def create_test_fp16(parent):
+    class TestConcatFp16(parent):
+        def init_dtype(self):
+            self.dtype = np.float16
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
+    TestConcatFp16.__name__ = cls_name
+    globals()[cls_name] = TestConcatFp16
+
+
+create_test_fp16(TestConcatOp)
+create_test_fp16(TestConcatOp2)
+create_test_fp16(TestConcatOp3)
+create_test_fp16(TestConcatOp4)
+create_test_fp16(TestConcatOp5)
+
+
+#----------------Concat Int64----------------
+def create_test_int64(parent):
+    class TestConcatInt64(parent):
+        def init_dtype(self):
+            self.dtype = np.int64
+
+        def test_check_grad(self):
+            pass
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Int64")
+    TestConcatInt64.__name__ = cls_name
+    globals()[cls_name] = TestConcatInt64
+
+
+create_test_int64(TestConcatOp)
+create_test_int64(TestConcatOp2)
+create_test_int64(TestConcatOp3)
+create_test_int64(TestConcatOp4)
+create_test_int64(TestConcatOp5)
+
+
+#----------------Concat Int32----------------
+def create_test_int32(parent):
+    class TestConcatInt32(parent):
+        def init_dtype(self):
+            self.dtype = np.int32
+
+        def test_check_grad(self):
+            pass
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Int32")
+    TestConcatInt32.__name__ = cls_name
+    globals()[cls_name] = TestConcatInt32
+
+
+create_test_int32(TestConcatOp)
+create_test_int32(TestConcatOp2)
+create_test_int32(TestConcatOp3)
+create_test_int32(TestConcatOp4)
+create_test_int32(TestConcatOp5)
+
+
+#----------------Concat AxisTensor----------------
+def create_test_AxisTensor(parent):
+    class TestConcatAxisTensor(parent):
+        def setUp(self):
+            self.op_type = "concat"
+            # note: init_dtype() sets self.dtype in place and returns None,
+            # so its result must not be assigned back to self.dtype
+            self.init_dtype()
+            self.init_test_data()
+
+            self.inputs = {
+                'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)],
+                'AxisTensor': np.array([self.axis]).astype("int32")
+            }
+            self.attrs = {}
+
+            if self.axis < 0:
+                self.actual_axis = self.axis + len(self.x0.shape)
+                self.actual_axis = self.actual_axis if self.actual_axis > 0 else 0
+            else:
+                self.actual_axis = self.axis
+
+            self.outputs = {
+                'Out': np.concatenate(
+                    (self.x0, self.x1, self.x2), axis=self.actual_axis)
+            }
+
+            self.place = paddle.device.MLUPlace(0)
+            self.__class__.use_mlu = True
+
+        def init_test_data(self):
+            self.x0 = np.random.random((1, 4, 50)).astype(self.dtype)
+            self.x1 = np.random.random((2, 4, 50)).astype(self.dtype)
+            self.x2 = np.random.random((3, 4, 50)).astype(self.dtype)
+            self.axis = 0
+
+        def init_dtype(self):
+            self.dtype = np.float32
+
+    cls_name = "{0}_{1}".format(parent.__name__, "AxisTensor")
+    TestConcatAxisTensor.__name__ = cls_name
+    globals()[cls_name] = TestConcatAxisTensor
+
+
+create_test_AxisTensor(TestConcatOp)
+create_test_AxisTensor(TestConcatOp2)
+create_test_AxisTensor(TestConcatOp3)
+create_test_AxisTensor(TestConcatOp4)
+create_test_AxisTensor(TestConcatOp5)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_conv2d_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_conv2d_op_mlu.py
new file mode 100644
index 0000000000000000000000000000000000000000..b09d892554bab6dc2951a72d72773935e5f60ddb
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_conv2d_op_mlu.py
@@ -0,0 +1,555 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from op_test import OpTest
+
+from test_conv2d_op import conv2d_forward_naive
+
+paddle.enable_static()
+
+
+def create_test_channel_last_class(parent):
+    class TestChannelLastCase(parent):
+        def init_data_format(self):
+            self.data_format = "NHWC"
+
+        def init_test_case_2(self):
+            N, C, H, W = self.input_size
+            self.input_size = [N, H, W, C]
+
+    cls_name = "{0}_{1}".format(parent.__name__, "ChannelLast")
+    TestChannelLastCase.__name__ = cls_name
+    globals()[cls_name] = TestChannelLastCase
+
+
+def create_test_padding_SAME_class(parent):
+    class TestPaddingSAMECase(parent):
+        def init_paddings(self):
+            self.pad = [0, 0]
+            self.padding_algorithm = "SAME"
+
+    cls_name = "{0}_{1}".format(parent.__name__, "PaddingSAMEOp")
+    TestPaddingSAMECase.__name__ = cls_name
+    globals()[cls_name] = TestPaddingSAMECase
+
+
+def create_test_padding_VALID_class(parent):
+    class TestPaddingVALIDCase(parent):
+        def init_paddings(self):
+            self.pad = [1, 1]
+            self.padding_algorithm = "VALID"
+
+    cls_name = "{0}_{1}".format(parent.__name__, "PaddingVALIDOp")
+    TestPaddingVALIDCase.__name__ = cls_name
+    globals()[cls_name] = TestPaddingVALIDCase
+
+
+def create_test_fp16_class(parent):
+    class TestFp16Case(parent):
+        def init_dtype(self):
+            self.dtype = np.float16
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
+    TestFp16Case.__name__ = cls_name
+    globals()[cls_name] = TestFp16Case
+
+
+class TestConv2DOp(OpTest):
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+        self.place = paddle.device.MLUPlace(0)
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def init_data_format(self):
+        self.data_format = "NCHW"
+
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "conv2d"
+        self.init_data_format()
+        self.init_dtype()
+        self.init_group()
+        self.init_dilation()
+        self.init_test_case()
+
+        conv2d_param = {
+            'stride': self.stride,
+            'pad': self.pad,
+            'dilation': self.dilations
+        }
+
+        input = np.random.random(self.input_size).astype(self.dtype)
+        filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype)
+
+        output, _, _, _, _ = conv2d_forward_naive(
+            input,
+            filter,
+            self.groups,
+            conv2d_param,
+            data_format=self.data_format)
+        output = output.astype(self.dtype)
+
+        self.inputs = {
+            'Input': OpTest.np_dtype_to_fluid_dtype(input),
+            'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
+        }
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            'groups': self.groups,
+            'dilations': self.dilations,
+            'data_format': self.data_format,
+        }
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, atol=1e-2)
+
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad_with_place(
+            self.place, {'Input', 'Filter'},
+            'Output',
+            max_relative_error=0.03,
+            numeric_place=paddle.CPUPlace())
+
+    def test_check_grad_no_filter(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad_with_place(
+            self.place, ['Input'],
+            'Output',
+            max_relative_error=0.03,
+            no_grad_set=set(['Filter']),
+            numeric_place=paddle.CPUPlace())
+
+    def test_check_grad_no_input(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad_with_place(
+            self.place, ['Filter'],
+            'Output',
+            max_relative_error=0.03,
+            no_grad_set=set(['Input']),
+            numeric_place=paddle.CPUPlace())
+
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+    def init_dilation(self):
+        self.dilations = [1, 1]
+
+    def init_group(self):
+        self.groups = 1
+
+
+class TestWithPad(TestConv2DOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+
+class TestWithStride(TestConv2DOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 6, 6]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+
+class TestWithGroup(TestConv2DOp):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [18, f_c, 3, 3]
+
+
+class TestWith1x1(TestConv2DOp):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [120, f_c, 1, 1]
+
+    def init_group(self):
+        # FIXME: Supporting group = 3 in this case.
+        # NOTE(wangran16): There is an unknown error (acl error code is : 507015)
+        # when group = 3, which needs to be fixed.
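+        # (editorial note: the acl error code in the NOTE above appears to be
+        # inherited from the NPU port of this test; a failing CNNL call on
+        # MLU would surface as a CNNL status instead.)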
+        self.groups = 1
+
+
+class TestWithDepthWise5x5(TestConv2DOp):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 4, 10, 10]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [8, f_c, 5, 5]
+
+    def init_group(self):
+        self.groups = 4
+
+
+class TestWithDepthWise7x7(TestConv2DOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 8, 10, 10]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [16, f_c, 7, 7]
+
+    def init_group(self):
+        self.groups = 8
+
+
+class TestWithDilation(TestConv2DOp):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 10, 10]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [12, f_c, 3, 3]
+
+    def init_dilation(self):
+        self.dilations = [2, 2]
+
+    # TODO(MLU): Depthwise operation does not support dilation yet;
+    # it will throw an error of CNNL_STATUS_NOT_SUPPORTED.
+    # def init_group(self):
+    #     self.groups = 3
+
+
+class TestWithInput1x1Filter1x1(TestConv2DOp):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [100, 1, 1, 1]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [120, f_c, 1, 1]
+
+    def init_group(self):
+        self.groups = 1
+
+
+class TestConv2DOp_v2(OpTest):
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+        self.place = paddle.device.MLUPlace(0)
+
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "conv2d"
+        self.dtype = np.float32
+        self.init_kernel_type()
+        self.init_group()
+        self.init_dilation()
+        self.init_data_format()
+        self.init_test_case()
+        self.init_paddings()
+        self.init_test_case_2()
+
+        conv2d_param = {
+            'stride': self.stride,
+            'pad': self.pad,
+            'dilation': self.dilations
+        }
+
+        input = np.random.random(self.input_size).astype(self.dtype)
+        filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype)
+        output, _, _, _, _ = conv2d_forward_naive(
+            input, filter, self.groups, conv2d_param, self.padding_algorithm,
+            self.data_format)
+        output = output.astype(self.dtype)
+
+        self.inputs = {
+            'Input': OpTest.np_dtype_to_fluid_dtype(input),
+            'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
+        }
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            'padding_algorithm': self.padding_algorithm,
+            'groups': self.groups,
+            'dilations': self.dilations,
+            'data_format': self.data_format,
+        }
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, atol=1e-2)
+
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad_with_place(
+            self.place, {'Input', 'Filter'},
+            'Output',
+            max_relative_error=0.02,
+            numeric_place=paddle.CPUPlace())
+
+    def test_check_grad_no_filter(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad_with_place(
+            self.place, ['Input'],
+            'Output',
+            max_relative_error=0.02,
+            no_grad_set=set(['Filter']),
+            numeric_place=paddle.CPUPlace())
+
+    def test_check_grad_no_input(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad_with_place(
+            self.place, ['Filter'],
+            'Output',
+            no_grad_set=set(['Input']),
+            numeric_place=paddle.CPUPlace())
+
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 4, 3]
+
+    def init_dilation(self):
+        self.dilations = [1, 1]
+
+    def init_group(self):
+        self.groups = 1
+
+    def init_kernel_type(self):
+        pass
+
+    def init_paddings(self):
+        self.pad = [0, 0]
+        self.padding_algorithm = "EXPLICIT"
+
+    def init_data_format(self):
+        self.data_format = "NCHW"
+
+    def init_test_case_2(self):
+        pass
+
+
+class TestConv2DOp_AsyPadding(TestConv2DOp_v2):
+    def init_paddings(self):
+        self.pad = [0, 0, 1, 2]
+        self.padding_algorithm = "EXPLICIT"
+
+
+class TestWithPad_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+    def init_paddings(self):
+        self.pad = [2, 1, 3, 2]
+        self.padding_algorithm = "EXPLICIT"
+
+
+class TestWithStride_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 6, 6]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+    def init_paddings(self):
+        self.pad = [2, 1, 3, 2]
+        self.padding_algorithm = "EXPLICIT"
+
+
+class TestWithGroup_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [24, f_c, 4, 3]
+
+
+class TestWith1x1_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [120, f_c, 1, 1]
+
+    def init_group(self):
+        self.groups = 1
+
+    def init_paddings(self):
+        self.pad = [2, 2, 4, 0]
+        self.padding_algorithm = "EXPLICIT"
+
+
+class TestWithDepthWise3x3_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.stride = [1, 1]
+        self.input_size = [3, 4, 10, 10]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [16, f_c, 3, 3]
+
+    # TODO(MLU): Depthwise operation does not support dilation yet;
+    # it will throw an error of CNNL_STATUS_NOT_SUPPORTED.
+    # def init_dilation(self):
+    #     self.dilations = [2, 2]
+
+    def init_group(self):
+        self.groups = 4
+
+    def init_paddings(self):
+        self.pad = [1, 3, 2, 1]
+        self.padding_algorithm = "EXPLICIT"
+
+
+class TestWithDepthWise5x5_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.stride = [1, 1]
+        self.input_size = [2, 4, 10, 10]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [8, f_c, 5, 5]
+
+    def init_group(self):
+        self.groups = 4
+
+    def init_paddings(self):
+        self.pad = [0, 1, 1, 0]
+        self.padding_algorithm = "EXPLICIT"
+
+
+class TestWithDepthWise7x7_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.stride = [2, 2]
+        self.input_size = [2, 8, 10, 10]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [16, f_c, 7, 7]
+
+    def init_group(self):
+        self.groups = 8
+
+    def init_paddings(self):
+        self.pad = [1, 3, 4, 1]
+        self.padding_algorithm = "EXPLICIT"
+
+
+class TestWithDilation_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 10, 10]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [24, f_c, 3, 3]
+
+    def init_dilation(self):
+        self.dilations = [2, 2]
+
+    # TODO(MLU): Depthwise operation does not support dilation yet;
+    # it will throw an error of CNNL_STATUS_NOT_SUPPORTED.
+    # def init_group(self):
+    #     self.groups = 3
+
+    def init_paddings(self):
+        self.pad = [0, 1, 3, 0]
+        self.padding_algorithm = "EXPLICIT"
+
+
+class TestWithInput1x1Filter1x1_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.stride = [1, 1]
+        self.input_size = [100, 1, 1, 1]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [120, f_c, 1, 1]
+
+    def init_group(self):
+        self.groups = 1
+
+    def init_paddings(self):
+        self.pad = [0, 3, 4, 0]
+        self.padding_algorithm = "EXPLICIT"
+
+
+create_test_padding_SAME_class(TestConv2DOp_AsyPadding)
+create_test_padding_SAME_class(TestWithPad_AsyPadding)
+create_test_padding_SAME_class(TestWithStride_AsyPadding)
+create_test_padding_SAME_class(TestWithGroup_AsyPadding)
+create_test_padding_SAME_class(TestWithInput1x1Filter1x1_AsyPadding)
+
+create_test_padding_VALID_class(TestConv2DOp_AsyPadding)
+create_test_padding_VALID_class(TestWithPad_AsyPadding)
+create_test_padding_VALID_class(TestWithStride_AsyPadding)
+create_test_padding_VALID_class(TestWithGroup_AsyPadding)
+create_test_padding_VALID_class(TestWithInput1x1Filter1x1_AsyPadding)
+
+create_test_channel_last_class(TestConv2DOp_AsyPadding)
+create_test_channel_last_class(TestWithPad_AsyPadding)
+create_test_channel_last_class(TestWithGroup_AsyPadding)
+create_test_channel_last_class(TestWith1x1_AsyPadding)
+create_test_channel_last_class(TestWithInput1x1Filter1x1_AsyPadding)
+
+create_test_fp16_class(TestConv2DOp_AsyPadding)
+create_test_fp16_class(TestWithPad_AsyPadding)
+create_test_fp16_class(TestWithStride_AsyPadding)
+create_test_fp16_class(TestWithGroup_AsyPadding)
+create_test_fp16_class(TestWithInput1x1Filter1x1_AsyPadding)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_op_mlu.py
new file mode 100644
index 0000000000000000000000000000000000000000..6610127d382bd3a715b64ad359c500fefc595936
--- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_op_mlu.py @@ -0,0 +1,453 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append('..') +from op_test import OpTest, convert_float_to_uint16 + +import paddle +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +import numpy as np +from paddle.fluid import compiler, Program, program_guard + + +# Situation 1: Attr(shape) is a list(without tensor) +class TestFillConstantOp1(OpTest): + def setUp(self): + '''Test fill_constant op with specified value + ''' + self.op_type = "fill_constant" + + self.inputs = {} + self.attrs = {'shape': [123, 92], 'value': 3.8} + self.outputs = {'Out': np.full((123, 92), 3.8)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillConstantOp2(OpTest): + def setUp(self): + '''Test fill_constant op with default value + ''' + self.op_type = "fill_constant" + + self.inputs = {} + self.attrs = {'shape': [123, 92]} + self.outputs = {'Out': np.full((123, 92), 0.0)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillConstantOp3(OpTest): + def setUp(self): + '''Test fill_constant op with specified int64 value + ''' + self.op_type = "fill_constant" + + self.inputs = {} + self.attrs = {'shape': [123, 92], 'value': 10000000000} + self.outputs = {'Out': np.full((123, 92), 10000000000)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillConstantOp4(OpTest): + def setUp(self): + '''Test fill_constant op with specified int value + ''' + self.op_type = "fill_constant" + + self.inputs = {} + self.attrs = {'shape': [123, 92], 'value': 3} + self.outputs = {'Out': np.full((123, 92), 3)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillConstantOpWithSelectedRows(unittest.TestCase): + def check_with_place(self, place): + scope = core.Scope() + # create Out Variable + out = scope.var('Out').get_selected_rows() + + # create and run fill_constant_op operator + fill_constant_op = Operator( + "fill_constant", shape=[123, 92], value=3.8, Out='Out') + fill_constant_op.run(scope, place) + + # get result from Out + result_array = np.array(out.get_tensor()) + full_array = np.full((123, 92), 3.8, 'float32') + + self.assertTrue(np.array_equal(result_array, full_array)) + + def test_fill_constant_with_selected_rows(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for place in places: + 
self.check_with_place(place) + + +# Situation 2: Attr(shape) is a list(with tensor) +class TestFillConstantOp1_ShapeTensorList(OpTest): + def setUp(self): + '''Test fill_constant op with specified value + ''' + self.op_type = "fill_constant" + self.init_data() + shape_tensor_list = [] + for index, ele in enumerate(self.shape): + shape_tensor_list.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = {"ShapeTensorList": shape_tensor_list} + self.attrs = {'shape': self.infer_shape, 'value': self.value} + self.outputs = {'Out': np.full(self.shape, self.value)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [-1, 92] + self.value = 3.8 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillConstantOp2_ShapeTensorList(OpTest): + def setUp(self): + '''Test fill_constant op with default value + ''' + self.op_type = "fill_constant" + self.init_data() + shape_tensor_list = [] + for index, ele in enumerate(self.shape): + shape_tensor_list.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = {"ShapeTensorList": shape_tensor_list} + self.attrs = {'shape': self.infer_shape} + self.outputs = {'Out': np.full(self.shape, 0.0)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [-1, -1] + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillConstantOp3_ShapeTensorList(TestFillConstantOp1_ShapeTensorList): + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [123, -1] + self.value = 10000000000 + + +class TestFillConstantOp4_ShapeTensorList(TestFillConstantOp1_ShapeTensorList): + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [123, -1] + self.value = 3 + + +# Situation 3: shape is a tensor +class TestFillConstantOp1_ShapeTensor(OpTest): + def setUp(self): + '''Test fill_constant op with specified value + ''' + self.op_type = "fill_constant" + self.init_data() + + self.inputs = {"ShapeTensor": np.array(self.shape).astype("int32")} + self.attrs = {'value': self.value} + self.outputs = {'Out': np.full(self.shape, self.value)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def init_data(self): + self.shape = [123, 92] + self.value = 3.8 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +# Situation 4: value is a tensor +class TestFillConstantOp1_ValueTensor(OpTest): + def setUp(self): + '''Test fill_constant op with specified value + ''' + self.op_type = "fill_constant" + self.init_data() + + self.inputs = { + "ShapeTensor": np.array(self.shape).astype("int32"), + 'ValueTensor': np.array([self.value]).astype("float32") + } + self.attrs = {'value': self.value + 1.0} + self.outputs = {'Out': np.full(self.shape, self.value)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def init_data(self): + #self.shape = [123, 92] + self.shape = [2, 2] + self.value = 3.8 + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +# Situation 5: value is a tensor +class TestFillConstantOp2_ValueTensor(OpTest): + def setUp(self): + '''Test fill_constant op with specified value + ''' + self.op_type = "fill_constant" + self.init_data() + + self.inputs = { + "ShapeTensor": np.array(self.shape).astype("int32"), + 'ValueTensor': 
np.array([self.value]).astype("int32") + } + self.attrs = {'value': self.value, 'dtype': 2} + self.outputs = {'Out': np.full(self.shape, self.value)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def init_data(self): + self.shape = [123, 92] + self.value = 3 + self.dtype = np.int32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +# Test python API +class TestFillConstantAPI(unittest.TestCase): + def test_api(self): + + positive_2_int32 = fluid.layers.fill_constant([1], "int32", 2) + positive_2_int64 = fluid.layers.fill_constant([1], "int64", 2) + + shape_tensor_int32 = fluid.data( + name="shape_tensor_int32", shape=[2], dtype="int32") + shape_tensor_int64 = fluid.data( + name="shape_tensor_int64", shape=[2], dtype="int64") + + out_1 = fluid.layers.fill_constant( + shape=[1, 2], dtype="float32", value=1.1) + + out_2 = fluid.layers.fill_constant( + shape=[1, positive_2_int32], dtype="float32", value=1.1) + + out_3 = fluid.layers.fill_constant( + shape=[1, positive_2_int64], dtype="float32", value=1.1) + + out_4 = fluid.layers.fill_constant( + shape=shape_tensor_int32, dtype="float32", value=1.1) + + out_5 = fluid.layers.fill_constant( + shape=shape_tensor_int64, dtype="float32", value=1.1) + + out_6 = fluid.layers.fill_constant( + shape=shape_tensor_int64, dtype=np.float32, value=1.1) + + val1 = fluid.layers.fill_constant( + shape=[1], dtype=np.float32, value=1.1) + val2 = fluid.layers.fill_constant( + shape=[1], dtype=np.float64, value=1.1) + out_7 = fluid.layers.fill_constant( + shape=shape_tensor_int64, dtype=np.float32, value=val1) + + out_8 = fluid.layers.fill_constant( + shape=shape_tensor_int64, dtype=np.float32, value=val2) + + exe = fluid.Executor(place=fluid.CPUPlace()) + res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8 = exe.run( + fluid.default_main_program(), + feed={ + "shape_tensor_int32": np.array([1, 2]).astype("int32"), + "shape_tensor_int64": np.array([1, 2]).astype("int64"), + }, + fetch_list=[ + out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8 + ]) + + assert np.array_equal(res_1, np.full([1, 2], 1.1, dtype="float32")) + assert np.array_equal(res_2, np.full([1, 2], 1.1, dtype="float32")) + assert np.array_equal(res_3, np.full([1, 2], 1.1, dtype="float32")) + assert np.array_equal(res_4, np.full([1, 2], 1.1, dtype="float32")) + assert np.array_equal(res_5, np.full([1, 2], 1.1, dtype="float32")) + assert np.array_equal(res_6, np.full([1, 2], 1.1, dtype="float32")) + assert np.array_equal(res_7, np.full([1, 2], 1.1, dtype="float32")) + assert np.array_equal(res_8, np.full([1, 2], 1.1, dtype="float32")) + + +class TestFillConstantImperative(unittest.TestCase): + def test_api(self): + with fluid.dygraph.guard(): + data1 = np.array([1, 2]).astype('int32') + data2 = np.array([1.1]).astype('float32') + data3 = np.array([88]).astype('int32') + shape = fluid.dygraph.to_variable(data1) + val = fluid.dygraph.to_variable(data2) + value = fluid.dygraph.to_variable(data3) + res1 = fluid.layers.fill_constant( + shape=[1, 2], dtype='float32', value=1.1) + res2 = fluid.layers.fill_constant( + shape=shape, dtype='float32', value=1.1) + res3 = fluid.layers.fill_constant( + shape=shape, dtype='float32', value=val) + res4 = fluid.layers.fill_constant( + shape=shape, dtype='int32', value=value) + assert np.array_equal( + res1.numpy(), np.full( + [1, 2], 1.1, dtype="float32")) + assert np.array_equal( + res2.numpy(), np.full( + [1, 2], 1.1, dtype="float32")) + assert np.array_equal( + res3.numpy(), np.full( + [1, 2], 
1.1, dtype="float32")) + assert np.array_equal( + res4.numpy(), np.full( + [1, 2], 88, dtype="int32")) + + def test_nan(self): + with fluid.dygraph.guard(): + res = fluid.layers.fill_constant([1], 'float32', np.nan) + self.assertTrue(np.isnan(res.numpy().item(0))) + + def test_inf(self): + with fluid.dygraph.guard(): + res = fluid.layers.fill_constant([1], 'float32', np.inf) + self.assertTrue(np.isinf(res.numpy().item(0))) + + def test_ninf(self): + with fluid.dygraph.guard(): + res = fluid.layers.fill_constant([1], 'float32', np.NINF) + self.assertTrue(np.isinf(res.numpy().item(0))) + self.assertEqual(np.NINF, res.numpy().item(0)) + + +class TestFillConstantOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + #for ci coverage + x1 = fluid.layers.data(name='x1', shape=[1], dtype="int16") + self.assertRaises( + TypeError, + fluid.layers.fill_constant, + shape=[1], + value=5, + dtype='uint4') + + self.assertRaises( + TypeError, + fluid.layers.fill_constant, + shape=[1.1], + value=5, + dtype='float32', + out=x1) + + # The argument dtype of fill_constant_op must be one of bool, float16, + #float32, float64, uint8, int16, int32 or int64 + x2 = fluid.layers.data(name='x2', shape=[1], dtype="int32") + + self.assertRaises( + TypeError, + fluid.layers.fill_constant, + shape=[1], + value=5, + dtype='float64', + out=x2) + + x3 = np.random.randn(100, 100).astype('int32') + self.assertRaises( + TypeError, + fluid.layers.fill_constant, + shape=[100, 100], + value=5, + dtype='float64', + out=x3) + + # The argument shape's type of fill_constant_op must be list, tuple or Variable. + def test_shape_type(): + fluid.layers.fill_constant(shape=1, dtype="float32", value=1) + + self.assertRaises(TypeError, test_shape_type) + + # The argument shape's size of fill_constant_op must not be 0. + def test_shape_size(): + fluid.layers.fill_constant(shape=[], dtype="float32", value=1) + + self.assertRaises(AssertionError, test_shape_size) + + # The shape dtype of fill_constant_op must be int32 or int64. + def test_shape_tensor_dtype(): + shape = fluid.data( + name="shape_tensor", shape=[2], dtype="float32") + fluid.layers.fill_constant( + shape=shape, dtype="float32", value=1) + + self.assertRaises(TypeError, test_shape_tensor_dtype) + + def test_shape_tensor_list_dtype(): + shape = fluid.data( + name="shape_tensor_list", shape=[1], dtype="bool") + fluid.layers.fill_constant( + shape=[shape, 2], dtype="float32", value=1) + + self.assertRaises(TypeError, test_shape_tensor_list_dtype) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py new file mode 100644 index 0000000000000000000000000000000000000000..b8363545d228892c4c7209499caf13aec4b4805b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py @@ -0,0 +1,234 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core + +paddle.enable_static() +SEED = 2021 + + +class TestCase1(OpTest): + def setUp(self): + self.set_mlu() + self.set_example() + self.op_type = "split" + self.place = paddle.device.MLUPlace(0) + ipt = self.x.astype(self.dtype) + axis = self.axis if isinstance(self.axis, int) else int(self.axis[0]) + tmp_outs = np.split( + ipt, axis=axis, indices_or_sections=self.num_or_sections) + tmp_outs = [o.astype(self.dtype) for o in tmp_outs] + self.outputs = {'Out': []} + self.outs = [] + for i, o in enumerate(tmp_outs): + self.outputs["Out"].append((str(i), o)) + self.outs.append(str(i)) + + self.attrs = {"axis": self.axis, "num": self.num_or_sections} + self.inputs = {} + self.inputs.update({'X': ipt.astype(self.dtype)}) + + def set_mlu(self): + self.__class__.use_mlu = True + self.__class__.op_type = "split" + + def test_check_output(self): + self.check_output_with_place(self.place) + + def set_example(self): + self.dtype = "float32" + self.x = np.random.random((2, 4, 6)) + self.axis = 1 + self.num_or_sections = 2 + + +class TestCase2(TestCase1): + def set_example(self): + self.dtype = "float32" + self.x = np.random.random((20, 4, 50)) + self.axis = 0 + self.num_or_sections = 4 + + +class TestCase4(TestCase1): + def set_example(self): + self.dtype = "float16" + self.x = np.random.random((4, 50, 20)) + self.axis = 2 + self.num_or_sections = 4 + + +# Test Sections +class TestCase5(TestCase1): + def set_example(self): + super().set_example() + self.x = np.random.random((2, 10, 4)) + self.axis = 1 + self.num_or_sections = [2, 4, 8] + + def setUp(self): + super().setUp() + self.attrs.update({"sections": [2, 2, 4, 2], "num": 0}) + + +class API_TestSplit(unittest.TestCase): + def test_out(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + data = fluid.layers.data('data', shape=[-1, 10], dtype='float32') + x0, x1 = paddle.split(data, num_or_sections=(3, 7), axis=1) + place = fluid.MLUPlace(0) + exe = fluid.Executor(place) + input1 = np.random.random([1, 10]).astype('float32') + r0, r1 = exe.run(feed={"data": input1}, fetch_list=[x0, x1]) + ex_x0, ex_x1 = np.split(input1, (3, ), axis=1) + self.assertTrue(np.allclose(ex_x0, r0)) + self.assertTrue(np.allclose(ex_x1, r1)) + + +class API_TestSplit2(unittest.TestCase): + def test_out(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + data = fluid.layers.data('data', shape=[-1, 10], dtype='float32') + x0, x1 = paddle.split(data, num_or_sections=2, axis=1) + place = fluid.MLUPlace(0) + exe = fluid.Executor(place) + input1 = np.random.random([1, 10]).astype('float32') + r0, r1 = exe.run(feed={"data": input1}, fetch_list=[x0, x1]) + ex_x0, ex_x1 = np.split(input1, 2, axis=1) + self.assertTrue(np.allclose(ex_x0, r0)) + self.assertTrue(np.allclose(ex_x1, r1)) + + +class API_TestDygraphSplit(unittest.TestCase): + def test_out1(self): + with fluid.dygraph.guard(paddle.MLUPlace(0)): + input_1 = np.random.random([4, 6, 6]).astype("int32") + # input is a variable which shape is [4, 6, 6] + input = fluid.dygraph.to_variable(input_1) + x0, x1, x2 = paddle.split(input, num_or_sections=3, axis=1) + x0_out = x0.numpy() + x1_out = x1.numpy() + x2_out = x2.numpy() + ex_x0, ex_x1, ex_x2 = np.split(input_1, 3, axis=1) + 
self.assertTrue(np.allclose(ex_x0, x0_out)) + self.assertTrue(np.allclose(ex_x1, x1_out)) + self.assertTrue(np.allclose(ex_x2, x2_out)) + + def test_out2(self): + with fluid.dygraph.guard(paddle.MLUPlace(0)): + input_1 = np.random.random([4, 6, 6]).astype("int32") + # input is a variable which shape is [4, 6, 6] + input = fluid.dygraph.to_variable(input_1) + x0, x1, x2 = paddle.split(input, num_or_sections=[1, 2, 3], axis=1) + x0_out = x0.numpy() + x1_out = x1.numpy() + x2_out = x2.numpy() + ex_x0, ex_x1, ex_x2 = np.split(input_1, (1, 3), axis=1) + self.assertTrue(np.allclose(ex_x0, x0_out)) + self.assertTrue(np.allclose(ex_x1, x1_out)) + self.assertTrue(np.allclose(ex_x2, x2_out)) + + +# attr(axis) is Tensor +class TestSplitOp_AxisTensor(OpTest): + def setUp(self): + self._set_op_type() + self.dtype = self.get_dtype() + self.init_data() + self.inputs = { + 'X': self.x, + 'AxisTensor': np.array([self.axis]).astype("int32") + } + self.attrs = {'sections': self.sections, 'num': self.num} + + out = np.split(self.x, self.indices_or_sections, self.axis) + self.outputs = {'Out': [('out%d' % i, out[i]) \ + for i in range(len(out))]} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def init_data(self): + self.x = np.random.random((4, 5, 6)).astype(self.dtype) + self.axis = 2 + self.sections = [] + self.num = 3 + self.indices_or_sections = 3 + + def get_dtype(self): + return "float" + + def _set_op_type(self): + self.op_type = "split" + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestSplitOp_SectionsTensor(OpTest): + def setUp(self): + self._set_op_type() + self.dtype = self.get_dtype() + self.init_data() + self.inputs = {'X': self.x} + + sections_tensor = [] + for index, ele in enumerate(self.sections): + sections_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs['SectionsTensorList'] = sections_tensor + + self.attrs = { + 'axis': self.axis, + 'sections': self.sections_infer, + 'num': self.num + } + + out = np.split(self.x, self.indices_or_sections, self.axis) + self.outputs = {'Out': [('out%d' % i, out[i]) \ + for i in range(len(out))]} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def init_data(self): + self.x = np.random.random((4, 5, 6)).astype(self.dtype) + self.axis = 1 + self.sections = [2, 1, 2] + self.sections_infer = [-1, -1, -1] + self.num = 0 + self.indices_or_sections = [2, 3] + + def get_dtype(self): + return "float" + + def _set_op_type(self): + self.op_type = "split" + + def test_check_output(self): + self.check_output_with_place(self.place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_sum_op_mlu.py new file mode 100755 index 0000000000000000000000000000000000000000..e9db14de46ab58ebc300cc282f150244d02e0b48 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_sum_op_mlu.py @@ -0,0 +1,116 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core + +paddle.enable_static() +SEED = 2021 + + +class TestSum1(OpTest): + def setUp(self): + self.set_mlu() + self.init_dtype() + self.op_type = "sum" + self.place = paddle.MLUPlace(0) + + x0 = np.random.random((3, 40)).astype(self.dtype) + x1 = np.random.random((3, 40)).astype(self.dtype) + x2 = np.random.random((3, 40)).astype(self.dtype) + self.inputs = {'X': [("x0", x0), ("x1", x1), ("x2", x2)]} + y = x0 + x1 + x2 + self.outputs = {'Out': y} + + self.attrs = {'use_mkldnn': False} + + def init_dtype(self): + self.dtype = np.float32 + + def set_mlu(self): + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestSum2(OpTest): + def setUp(self): + self.set_mlu() + self.init_dtype() + self.op_type = "sum" + self.place = paddle.MLUPlace(0) + + x0 = np.random.random((3, 3)).astype(self.dtype) + x1 = np.random.random((3, 3)).astype(self.dtype) + x2 = np.random.random((3, 3)).astype(self.dtype) + x3 = np.random.random((3, 3)).astype(self.dtype) + self.inputs = {'X': [("x0", x0), ("x1", x1), ("x2", x2), ("x3", x3)]} + # There will be a problem if just using `y=x0+x1+x2+x3` to calculate the + # summation result as the reference standard result. The reason is that + # numpy's fp16 data has precision loss when doing `add` operation. + # For example, the results of `x0+x1+x2+x3` is different from that of + # `x3+x2+x1+x0` if the dtype is fp16. + # Therefore, converting the input to fp32 for calculation. 
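+        # (editorial illustration, values chosen by the editor: at magnitude
+        # 1024 an fp16 ulp is 1.0, so
+        #   np.float16(1024) + np.float16(0.4) + np.float16(0.4) -> 1024.0
+        #   np.float16(0.4) + np.float16(0.4) + np.float16(1024) -> 1025.0
+        # which is why the reference sum is accumulated in fp32 below.)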
+ y = (x0.astype(np.float32) + x1.astype(np.float32) + + x2.astype(np.float32) + x3.astype(np.float32)).astype(self.dtype) + self.outputs = {'Out': y} + + self.attrs = {'use_mkldnn': False} + + def init_dtype(self): + self.dtype = np.float16 + + def set_mlu(self): + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestSum3(OpTest): + def setUp(self): + self.set_mlu() + self.init_dtype() + self.op_type = "sum" + self.place = paddle.MLUPlace(0) + + x0 = np.random.random((3, 3)).astype(self.dtype) + + self.inputs = {'X': [("x0", x0)]} + y = x0 + self.outputs = {'Out': y} + + self.attrs = {'use_mkldnn': False} + + def init_dtype(self): + self.dtype = np.float16 + + def set_mlu(self): + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py index b234e25823f4b370b9a4150ee3f8b7d635468952..a93abd3c1277681234209c27f54f0d019bf4e9df 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -27,7 +27,7 @@ from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed import fleet from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.reshard import reshard +from paddle.distributed.auto_parallel.reshard import reshard, HAS_SENT, HAS_RECV, HAS_ALLGATHER from paddle.distributed.auto_parallel.process_group import _g_process_group_map from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr @@ -143,7 +143,11 @@ def mlp_forward(train_program, start_program): return loss, train_program, start_program -def get_dist_prog(train_program, startup_program, dist_context, rank_id): +def get_dist_prog(train_program, + startup_program, + dist_context, + rank_id, + change_process_mesh=False): loss, train_program, startup_program = mlp_forward(train_program, startup_program) @@ -157,6 +161,12 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): complete_train_program = completer.complete_forward_annotation( train_program) + if change_process_mesh: + global PP_MESH_1 + dist_context.get_tensor_dist_attr_for_program( + train_program.global_block().vars[ + "gelu_0.tmp_0"]).process_mesh = PP_MESH_1 + params_grads = parallelizer._generate_backward( complete_train_program, startup_program, @@ -308,6 +318,25 @@ class TestMLPReshard(unittest.TestCase): # parameter initialization of every rank should be different in the pipeline scene self.assertTrue(check_initialization(dist_startup_prog, rank_id)) + def test_mlp_pp_diff_process_mesh(self): + HAS_SENT.clear() + HAS_RECV.clear() + HAS_ALLGATHER.clear() + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 1 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id, True) + for key in list(_g_process_group_map.keys()): + del _g_process_group_map[key] + reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) + print_program_with_dist_attr(dist_main_prog, dist_context) + + # check send and recv result + 
self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) + self.assertTrue(check_initialization(dist_startup_prog, rank_id)) + def test_mlp_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" diff --git a/python/paddle/fluid/tests/unittests/test_eager_trace_op.py b/python/paddle/fluid/tests/unittests/test_eager_trace_op.py index 79de17fdb66412b32164574acb6e3d9446e0f29b..b67dbd0ba622d9f5dda96fd448d08ea71eb999fa 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_trace_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_trace_op.py @@ -41,9 +41,6 @@ class TestEagerTraceOp(unittest.TestCase): paddle.fluid.framework._dygraph_tracer().trace_op( 'instance_norm', {'Scale': [scale], 'X': [x]}, {'Y': [x]}, {}) - paddle.fluid.framework._dygraph_tracer().trace_op( - 'coalesce_tensor', {'Input': [x]}, {'Output': [x]}, - {'dtype': int(core.VarDesc.VarType.FP32)}) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py b/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py new file mode 100644 index 0000000000000000000000000000000000000000..544fe4dd43e6b93a2a34215744fe0bdf506ad655 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py @@ -0,0 +1,86 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
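+
+# (editorial note: the test below pushes the same saved program through two
+# paths, core.DistModel fed with core.DistModelTensor inputs and the plain
+# paddle.static.load_inference_model executor path, and accepts the run only
+# if the two fetched losses agree under np.allclose.)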
+ +import unittest +import paddle +import numpy as np +import os +from paddle.fluid import core + +paddle.enable_static() + + +class TestDistModelRun(unittest.TestCase): + def test_dist_model_run(self): + # step 0: declare folder to save the model and params + folder = './dist_model_run_test/' + file = 'inf' + path_prefix = folder + file + + # step 1: saving the inference model and params + x = paddle.static.data(name='x', shape=[28, 28], dtype='float32') + y = paddle.static.data(name='y', shape=[28, 1], dtype='int64') + predict = paddle.static.nn.fc(x, 10, activation='softmax') + loss = paddle.nn.functional.cross_entropy(predict, y) + avg_loss = paddle.tensor.stat.mean(loss) + exe = paddle.static.Executor(paddle.CUDAPlace(0)) + exe.run(paddle.static.default_startup_program()) + x_data = np.random.randn(28, 28).astype('float32') + y_data = np.random.randint(0, 9, size=[28, 1]).astype('int64') + exe.run(paddle.static.default_main_program(), + feed={'x': x_data, + 'y': y_data}, + fetch_list=[avg_loss]) + paddle.static.save_inference_model(path_prefix, [x, y], [avg_loss], exe) + print('save model to', path_prefix) + + # step 2: prepare fake data for the inference + x_tensor = np.random.randn(28, 28).astype('float32') + y_tensor = np.random.randint(0, 9, size=[28, 1]).astype('int64') + + # step 3: init the dist model to inference with fake data + config = core.DistModelConfig() + config.model_dir = path_prefix + config.place = 'GPU' + dist = core.DistModel(config) + dist.init() + dist_x = core.DistModelTensor(x_tensor, 'x') + dist_y = core.DistModelTensor(y_tensor, 'y') + input_data = [dist_x, dist_y] + output_rst = dist.run(input_data) + dist_model_rst = output_rst[0].as_ndarray().ravel().tolist() + print("dist model rst:", dist_model_rst) + + # step 4: use framework's api to inference with fake data + [inference_program, feed_target_names, fetch_targets] = ( + paddle.static.load_inference_model(path_prefix, exe)) + results = exe.run(inference_program, + feed={'x': x_tensor, + 'y': y_tensor}, + fetch_list=fetch_targets) + load_inference_model_rst = results[0] + print("load inference model api rst:", load_inference_model_rst) + + # step 5: compare two results + self.assertTrue(np.allclose(dist_model_rst, load_inference_model_rst)) + + # step 6: clean up the env, delete the saved model and params + os.remove(path_prefix + '.pdiparams') + os.remove(path_prefix + '.pdmodel') + os.rmdir(folder) + print('cleaned up the env') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py b/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_tensor.py similarity index 97% rename from python/paddle/fluid/tests/unittests/test_dist_model_tensor.py rename to python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_tensor.py index da25550c4f47ed1bf5f694618afce722989139ca..a74b4f0d224ef6c165cfadc785f1de9c50d8de4a 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_tensor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
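(Editor's aside, not part of the patch: the calc_gradient changes in
test_norm_all.py further below implement the analytic p-norm gradient

    d||x||_p / dx_i = ||x||_p^(1 - p) * |x_i|^(p - 1) * sign(x_i),

exactly the numpy expression the diff adds. A minimal self-contained sketch,
with editor-chosen names and a central-difference check, assuming only numpy:

    import numpy as np

    def pnorm_grad(x, p):
        # analytic gradient of ||x||_p, valid for p not in {0, +inf, -inf}
        norm = np.sum(np.abs(x) ** p) ** (1.0 / p)
        return norm ** (1 - p) * np.abs(x) ** (p - 1) * np.sign(x)

    x = np.array([1.0, -2.0, 3.0])
    p, eps = 3.0, 1e-6
    numeric = np.array([
        (np.linalg.norm(x + eps * np.eye(3)[i], ord=p) -
         np.linalg.norm(x - eps * np.eye(3)[i], ord=p)) / (2 * eps)
        for i in range(3)
    ])
    assert np.allclose(pnorm_grad(x, p), numeric, atol=1e-5)

OpTest feeds a mean-style output gradient, so calc_gradient additionally
divides by the number of output elements: numel / x.shape[axis] for an axis
reduction, or 1 when asvector flattens the input to a single norm.)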
diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index d2d931f148078d124a25ddbb888b3e9cb5911211..7dd310d2b88a90e09ba5ceedb541da4be263e559 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -278,6 +278,8 @@ class TestLayerNormOp(unittest.TestCase): has_scale=False, has_bias=False, y_grad_scale=0.1) + self.check_forward_backward( + shape=[512, 1024], begin_norm_axis=1, has_scale=True, has_bias=True) class TestLayerNormAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py index 352089e1fb75fa4c3423d29012fd85c3d611c81b..b20305b78efe2dfe73e069e13f0d0eca3bb84057 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_all.py +++ b/python/paddle/fluid/tests/unittests/test_norm_all.py @@ -19,11 +19,12 @@ import numpy as np from op_test import OpTest import paddle import paddle.fluid as fluid +import paddle.fluid.core as core -def p_norm(x, axis, porder, keepdims=False): +def p_norm(x, axis, porder, keepdims=False, reduce_all=False): r = [] - if axis is None: + if axis is None or reduce_all: x = x.flatten() if porder == np.inf: r = np.amax(np.abs(x), keepdims=keepdims) @@ -53,8 +54,8 @@ def p_norm(x, axis, porder, keepdims=False): else: if isinstance(axis, list): axis = tuple(axis) - r = np.linalg.norm( - x, ord=porder, axis=axis, keepdims=keepdims).astype(x.dtype) + r = np.linalg.norm(x, ord=porder, axis=axis, keepdims=keepdims) + r = r.astype(x.dtype) return r @@ -111,13 +112,14 @@ class TestPnormOp(OpTest): self.op_type = "p_norm" self.init_test_case() x = (np.random.random(self.shape) + 0.5).astype(self.dtype) - norm = p_norm(x, self.axis, self.porder, self.keepdim) + norm = p_norm(x, self.axis, self.porder, self.keepdim, self.asvector) self.inputs = {'X': x} self.attrs = { 'epsilon': self.epsilon, 'axis': self.axis, 'keepdim': self.keepdim, - 'porder': float(self.porder) + 'porder': float(self.porder), + 'asvector': self.asvector } self.outputs = {'Out': norm} self.gradient = self.calc_gradient() @@ -135,34 +137,42 @@ class TestPnormOp(OpTest): self.porder = 2.0 self.keepdim = False self.dtype = "float64" + self.asvector = False def calc_gradient(self): self.attrs = { 'epsilon': self.epsilon, 'axis': self.axis, 'keepdim': self.keepdim, - 'porder': float(self.porder) + 'porder': float(self.porder), + 'asvector': self.asvector } x = self.inputs["X"] porder = self.attrs["porder"] axis = self.attrs["axis"] + asvector = self.attrs["asvector"] + x_dtype = x.dtype + x = x.astype(np.float32) if x.dtype == np.float16 else x if porder == 0: grad = np.zeros(x.shape).astype(x.dtype) elif porder in [float("inf"), float("-inf")]: - norm = p_norm(x, axis=axis, porder=porder, keepdims=True) + norm = p_norm( + x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector) x_abs = np.abs(x) grad = np.sign(x) grad[x_abs != norm] = 0.0 else: - norm = p_norm(x, axis=axis, porder=porder, keepdims=True) + norm = p_norm( + x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector) grad = np.power(norm, 1 - porder) * np.power( np.abs(x), porder - 1) * np.sign(x) numel = 1 for s in x.shape: numel *= s - numel /= x.shape[axis] - return [grad.astype(x.dtype) * 1 / numel] + divisor = numel if asvector else x.shape[axis] + numel /= divisor + return [grad.astype(x_dtype) * 1 / numel] class TestPnormOp2(TestPnormOp): @@ -173,6 +183,7 @@ class 
         self.porder = 2.0
         self.keepdim = True
         self.dtype = "float32"
+        self.asvector = False
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out')
@@ -186,6 +197,7 @@ class TestPnormOp3(TestPnormOp):
         self.porder = np.inf
         self.keepdim = True
         self.dtype = "float32"
+        self.asvector = False
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
@@ -199,6 +211,7 @@ class TestPnormOp4(TestPnormOp):
         self.porder = -np.inf
         self.keepdim = True
         self.dtype = "float32"
+        self.asvector = False
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
@@ -212,11 +225,63 @@ class TestPnormOp5(TestPnormOp):
         self.porder = 0
         self.keepdim = True
         self.dtype = "float32"
+        self.asvector = False
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
 
 
+class TestPnormOp6(TestPnormOp):
+    def init_test_case(self):
+        self.shape = [3, 20, 3]
+        self.axis = -1
+        self.epsilon = 1e-12
+        self.porder = 2
+        self.keepdim = False
+        self.dtype = "float32"
+        self.asvector = True
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestPnormOpFP16(TestPnormOp):
+    def init_test_case(self):
+        self.shape = [2, 3, 4, 5]
+        self.axis = 1
+        self.epsilon = 1e-12
+        self.porder = 2.0
+        self.keepdim = False
+        self.dtype = "float16"
+        self.asvector = False
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_output_with_place(place, atol=1e-3)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_grad_with_place(
+                place, ['X'], 'Out', user_defined_grads=self.gradient)
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestPnormOpFP161(TestPnormOpFP16):
+    def init_test_case(self):
+        self.shape = [2, 3, 4, 5]
+        self.axis = -1
+        self.epsilon = 1e-12
+        self.porder = 2.0
+        self.keepdim = False
+        self.dtype = "float16"
+        self.asvector = True
+
+
 def run_fro(self, p, axis, shape_x, dtype, keep_dim, check_dim=False):
     with fluid.program_guard(fluid.Program()):
         data = fluid.data(name="X", shape=shape_x, dtype=dtype)
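To make the new `asvector` attribute concrete: in the reference `p_norm` above it takes the `reduce_all` path, flattening the whole tensor before reducing instead of reducing along a single axis. A small numpy illustration with made-up data:

    import numpy as np

    x = np.arange(24, dtype=np.float64).reshape(2, 3, 4)
    # asvector=False: one norm per slice along the reduced axis
    per_axis = np.linalg.norm(x, ord=2, axis=1)    # shape (2, 4)
    # asvector=True: flatten first, a single norm for the whole tensor
    whole = np.linalg.norm(x.ravel(), ord=2)       # scalar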
diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py
index 5ba54daa0d4cbc49d4693090a853347f2e4355ab..a3bfe3864a2493fdcf100a1a86648a159701ec11 100644
--- a/python/paddle/fluid/tests/unittests/test_variable.py
+++ b/python/paddle/fluid/tests/unittests/test_variable.py
@@ -63,6 +63,12 @@ class TestVariable(unittest.TestCase):
         self.assertRaises(ValueError,
                           lambda: b.create_var(name="fc.w", shape=(24, 100)))
 
+        w = b.create_var(
+            dtype=paddle.fluid.core.VarDesc.VarType.STRINGS,
+            shape=[1],
+            name="str_var")
+        self.assertEqual(None, w.lod_level)
+
     def test_element_size(self):
         with fluid.program_guard(Program(), Program()):
             x = paddle.static.data(name='x1', shape=[2], dtype='bool')
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py
index cb54d12488d542e515b01c5a04407884eac41152..a1eb0af2bc978dd46b9f25b81f972669d7b93d94 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py
@@ -52,8 +52,9 @@ class XPUTestArgsortOp1(XPUOpTestWrapper):
         classes = []
         for descending in [True, False]:
             for axis in [0, 1, 2, -1, -2]:
-                class_name = 'XPUTestArgsortOp_axis_' + str(axis)
-                attr_dict = {'init_axis': axis, 'descending': descending}
+                class_name = 'XPUTestArgsortOp_axis_' + str(axis) + '_' + str(
+                    descending)
+                attr_dict = {'init_axis': axis, 'init_descending': descending}
                 classes.append([class_name, attr_dict])
         return base_class, classes
 
@@ -64,8 +65,9 @@ class XPUTestArgsortOp1(XPUOpTestWrapper):
             self.place = paddle.XPUPlace(0)
             self.dtype = self.in_type
             self.input_shape = (2, 2, 2, 3, 3)
-            self.axis = -1
-            self.descending = False
+            self.axis = -1 if not hasattr(self, 'init_axis') else self.init_axis
+            self.descending = False if not hasattr(
+                self, 'init_descending') else self.init_descending
 
             if self.in_type == 'float32':
                 self.x = np.random.random(self.input_shape).astype(self.dtype)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
index 4ceacd52092341347ce5633c5b439ad49e7ca8de..9cb31d4270552d23435a6bebc71ae6ef208b204d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
@@ -73,6 +73,39 @@ class TestSigmoidCrossEntropyWithLogitsOp1(XPUOpTest):
         self.dtype = np.float32
 
 
+class TestSigmoidCrossEntropyWithLogitsOp2(
+        TestSigmoidCrossEntropyWithLogitsOp1):
+    """Test sigmoid_cross_entropy_with_logits_op with ignore_index
+    """
+
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        self.set_xpu()
+        self.init_dtype()
+
+        batch_size = 64
+        num_classes = 20
+        ignore_index = -1
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, (batch_size, num_classes))
+                .astype(self.dtype)),
+            'Label': np.random.randint(-1, 2, (batch_size, num_classes))
+            .astype(self.dtype)
+        }
+        self.attrs = {'ignore_index': ignore_index, }
+
+        # Fw Pass is implemented as elementwise sigmoid followed by
+        # elementwise logistic loss
+        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
+        out = -term1 - term2
+        out[np.where(self.inputs['Label'] == ignore_index)] = 0
+        self.outputs = {'Out': out}
+
+
 class TestSigmoidCrossEntropyWithLogitsOp3(
         TestSigmoidCrossEntropyWithLogitsOp1):
     """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
@@ -102,6 +135,42 @@ class TestSigmoidCrossEntropyWithLogitsOp3(
         self.outputs = {'Out': -term1 - term2}
 
 
+class TestSigmoidCrossEntropyWithLogitsOp4(
+        TestSigmoidCrossEntropyWithLogitsOp1):
+    """Test sigmoid_cross_entropy_with_logits_op with ignore_index and normalize
+    """
+
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        self.set_xpu()
+        self.init_dtype()
+
+        batch_size = 64
+        num_classes = 20
+        ignore_index = -1
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, (batch_size, num_classes))
+                .astype(self.dtype)),
+            'Label': np.random.randint(-1, 2, (batch_size, num_classes))
+            .astype(self.dtype)
+        }
+        self.attrs = {'ignore_index': ignore_index, 'normalize': True}
+
+        # Fw Pass is implemented as elementwise sigmoid followed by
+        # elementwise logistic loss
+        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
+        out = -term1 - term2
+        out[np.where(self.inputs['Label'] == ignore_index)] = 0
+        if self.attrs['normalize']:
+            out = out / float(
+                np.where(self.inputs['Label'] != ignore_index)[0].size)
+        self.outputs = {'Out': out}
+
+
 class TestSigmoidCrossEntropyWithLogitsOp5(
         TestSigmoidCrossEntropyWithLogitsOp1):
     """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
@@ -131,6 +200,42 @@ class TestSigmoidCrossEntropyWithLogitsOp5(
         self.outputs = {'Out': -term1 - term2}
 
 
+class TestSigmoidCrossEntropyWithLogitsNorm(
+        TestSigmoidCrossEntropyWithLogitsOp1):
+    """Test sigmoid_cross_entropy_with_logits_op with multi-dimensional batch and normalize
+    """
+
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        self.set_xpu()
+        self.init_dtype()
+
+        batch_size = [10, 10]
+        num_classes = 20
+        ignore_index = -1
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
+                .astype(self.dtype)),
+            'Label': np.random.randint(-1, 2, tuple(batch_size + [num_classes]))
+            .astype(self.dtype)
+        }
+        self.attrs = {'ignore_index': ignore_index, 'normalize': True}
+
+        # Fw Pass is implemented as elementwise sigmoid followed by
+        # elementwise logistic loss
+        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
+        out = -term1 - term2
+        out[np.where(self.inputs['Label'] == ignore_index)] = 0
+        if self.attrs['normalize']:
+            out = out / float(
+                np.where(self.inputs['Label'] != ignore_index)[0].size)
+        self.outputs = {'Out': out}
+
+
 class TestSigmoidCrossEntropyWithLogitsOp6(
         TestSigmoidCrossEntropyWithLogitsOp1):
     """Test sigmoid_cross_entropy_with_logit_op with binary label
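Each of the new XPU cases above re-derives the same numpy reference loss inline. As a sketch only (the helper name and factoring are illustrative, not part of the patch), the shared computation is:

    import numpy as np
    from scipy.special import expit

    def reference_sigmoid_ce(x, label, ignore_index, normalize=False):
        # Label * -log(sigmoid(X)) + (1 - Label) * -log(1 - sigmoid(X))
        p = expit(x)
        out = -label * np.log(p) - (1 - label) * np.log(1 - p)
        # entries marked with ignore_index contribute zero loss
        out[label == ignore_index] = 0
        if normalize:
            # divide by the count of non-ignored entries, as the tests do
            out = out / float((label != ignore_index).sum())
        return out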
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index f37b45eef1b80211cbb749c20b489af43cdafdee..e5ccd6b04054a27c6df9963735dc63c7db9c7c56 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -3,7 +3,7 @@
   output : Tensor
   infer_meta :
     func : ElementwiseInferMeta
-    param : [x, y, -1]
+    param : [x, y]
   kernel :
     func : add
 
@@ -40,7 +40,7 @@
   output : Tensor
   infer_meta :
     func : ElementwiseInferMeta
-    param : [x, y, -1]
+    param : [x, y]
   kernel :
     func : divide
 
@@ -135,7 +135,7 @@
   output : Tensor
   infer_meta :
     func : ElementwiseInferMeta
-    param : [x, y, -1]
+    param : [x, y]
   kernel :
     func : multiply
 
@@ -166,19 +166,19 @@
   output : Tensor
   infer_meta :
     func : ElementwiseInferMeta
-    param : [x, y, -1]
+    param : [x, y]
   kernel :
     func : subtract
 
 - api : sum
   args : (const Tensor& x, const std::vector<int64_t>& axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false)
   output : Tensor
-  infer_meta :
-    func : ReduceInferMeta
-    param: [x, axis, keep_dim, dtype]
-  kernel :
+  infer_meta :
+    func : SumInferMeta
+    param: [x, axis, dtype, keep_dim]
+  kernel :
     func : sum
-    param : [x, axis, keep_dim, dtype]
+    param : [x, axis, dtype, keep_dim]
   data_type : x
 
 - api : zeros_like
diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py
index 6bb02ab9d40dbe28b01bf669417a8d521c6458da..09182768f242760bc0b6c74cc37a4e3a0a0fb60e 100644
--- a/python/paddle/utils/code_gen/api_gen.py
+++ b/python/paddle/utils/code_gen/api_gen.py
@@ -31,7 +31,12 @@ class API:
         # names : [], list of attribute names
         # attr_info : { attr_name : (type, default_values)}
         self.args = gen_utils.parse_args(self.api, api_item_yaml['args'])
-        self.output = api_item_yaml['output']
+        self.out_type_list, _ = gen_utils.parse_output(self.api,
+                                                       api_item_yaml['output'])
+        self.return_type = self.out_type_list[0] if len(
+            self.out_type_list) == 1 else "std::tuple<" + ",".join(
+                self.out_type_list) + ">"
+
         self.is_base_api = True
         if 'invoke' in api_item_yaml:
             self.is_base_api = False
@@ -54,18 +59,44 @@ class API:
 
     def gene_api_declaration(self):
         return f"""
-PADDLE_API {self.output} {self.api}({self.args['args_declare']});
+PADDLE_API {self.return_type} {self.api}({self.args['args_declare']});
 """
 
+    def gene_output(self, output_type_list):
+        kernel_output = ""
+        output_create = ""
+
+        if len(output_type_list) == 1:
+            kernel_output = 'dense_out'
+            output_create = f"""
+  {self.return_type} out;
+  auto dense_out = SetKernelOutput(out_meta, kernel_backend, &out);"""
+
+        elif len(output_type_list) > 1:
+            output_create = f"""
+  {self.return_type} out;"""
+
+            for i in range(len(output_type_list)):
+                kernel_output = kernel_output + f'dense_out_{i}, '
+                output_create = output_create + f"""
+  auto dense_out_{i} = SetKernelOutput(std::get<{i}>(out_meta), kernel_backend, &std::get<{i}>(out));"""
+
+            kernel_output = kernel_output[:-2]
+        else:
+            raise ValueError(
+                "{} : Output error: the output should not be empty.".format(
+                    self.api))
+
+        return kernel_output, output_create
+
     def gene_api_code(self):
         if self.is_base_api:
             input_tensors, kernel_args = gen_utils.get_kernel_args(
                 self.args['inputs']['names'], self.args['attrs'],
                 self.kernel['param'])
-            out_type, _ = gen_utils.parse_output(self.api, self.output)
-            outputs_args, output_create = gen_utils.gene_output(out_type)
+            outputs_args, output_create = self.gene_output(self.out_type_list)
             return f"""
-PADDLE_API {self.output} {self.api}({self.args["args_define"]}) {{
+PADDLE_API {self.return_type} {self.api}({self.args["args_define"]}) {{
   {gen_utils.gene_kernel_select(self.api, self.args['inputs']['names'], self.args['attrs'], self.kernel)}
 
   auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
@@ -82,7 +113,7 @@ PADDLE_API {self.output} {self.api}({self.args["args_define"]}) {{
 
         else:
             return f"""
-PADDLE_API {self.output} {self.api}({self.args["args_define"]}) {{
+PADDLE_API {self.return_type} {self.api}({self.args["args_define"]}) {{
   return {self.invoke};
 }}
 """
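The single- versus multi-output handling above reduces to a small string assembly over the parsed output types. A standalone sketch (the function name is illustrative) of what `API.__init__` now computes as `return_type`:

    def assemble_return_type(out_type_list):
        # one output -> the bare C++ type; several -> a std::tuple of them
        if len(out_type_list) == 1:
            return out_type_list[0]
        return "std::tuple<" + ",".join(out_type_list) + ">"

    assert assemble_return_type(["Tensor"]) == "Tensor"
    assert assemble_return_type(["Tensor", "Tensor"]) == "std::tuple<Tensor,Tensor>"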
diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py
index 0cb14327f6e09092bbce0229ae26f1b456238802..d55759b51c2e79be59f5881b0546338334a54342 100644
--- a/python/paddle/utils/code_gen/backward_api_gen.py
+++ b/python/paddle/utils/code_gen/backward_api_gen.py
@@ -23,9 +23,11 @@ import gen_utils
 class BackwardAPI:
     def __init__(self, backward_item_yaml):
         self.backward_api = backward_item_yaml['backward_api']
-        self.args, self.output_type, self.return_comment = self.parse_and_check_args(
+        self.args, self.output_type_list, self.return_comment = self.parse_and_check_args(
             backward_item_yaml['forward'], backward_item_yaml['args'],
             backward_item_yaml['output'])
+        self.return_type = self.output_type_list[0] if len(
+            self.output_type_list) == 1 else "std::vector<std::vector<Tensor>>"
 
         self.is_base_api = True
         if 'invoke' in backward_item_yaml:
@@ -81,36 +83,65 @@ class BackwardAPI:
             Please check the args of {self.backward_api} in yaml."
 
         # check the output of backward
-        output_type, return_comment = gen_utils.parse_output(self.backward_api,
-                                                             output_config)
-        assert output_type.count('Tensor') <= len(fw_inputs['names']), \
+        out_type_list, return_comment = gen_utils.parse_output(
+            self.backward_api, output_config)
+        assert len(out_type_list) <= len(fw_inputs['names']), \
             f"{self.backward_api} : Output error: The number of ouputs should be less then the number of inputs of forward api. \
              Please check the output of {self.backward_api} in yaml."
 
-        return bw_args, output_type, return_comment
+        return bw_args, out_type_list, return_comment
 
     def gene_api_declaration(self):
         if self.return_comment:
             return f"""
 // {self.return_comment}
-{self.output_type} {self.backward_api}({self.args['args_declare']});
+{self.return_type} {self.backward_api}({self.args['args_declare']});
 """
 
         else:
             return f"""
-{self.output_type} {self.backward_api}({self.args['args_declare']});
+{self.return_type} {self.backward_api}({self.args['args_declare']});
 """
 
+    def gene_output(self, output_type_list):
+        kernel_output = ""
+        output_create = ""
+
+        if len(output_type_list) == 1:
+            return_type = output_type_list[0]
+            kernel_output = 'dense_out'
+            output_create = f"""
+  {self.return_type} out;
+  auto dense_out = SetKernelOutput(out_meta, kernel_backend, &out);"""
+
+        elif len(output_type_list) > 1:
+            output_create = f"""
+  {self.return_type} out;"""
+
+            for i, out_type_item in enumerate(output_type_list):
+                kernel_output = kernel_output + f'dense_out_{i}, '
+                get_out_code = f'&out[{i}][0]' if out_type_item == 'Tensor' else f'&out[{i}]'
+                output_create = output_create + f"""
+  auto dense_out_{i} = SetKernelOutput(std::get<{i}>(out_meta), kernel_backend, {get_out_code});"""
+
+            kernel_output = kernel_output[:-2]
+        else:
+            raise ValueError(
+                "{} : Output error: the output should not be empty.".format(
+                    self.backward_api))
+
+        return kernel_output, output_create
+
     def gene_api_code(self):
         if self.is_base_api:
             input_tensors, kernel_args = gen_utils.get_kernel_args(
                 self.args['inputs']['names'], self.args['attrs'],
                 self.kernel['param'])
-            outputs_args, output_create = gen_utils.gene_output(
-                self.output_type)
+            outputs_args, output_create = self.gene_output(
+                self.output_type_list)
             return f"""
 // {self.return_comment}
-{self.output_type} {self.backward_api}({self.args["args_define"]}) {{
+{self.return_type} {self.backward_api}({self.args["args_define"]}) {{
   {gen_utils.gene_kernel_select(self.backward_api, self.args['inputs']['names'], self.args['attrs'], self.kernel)}
 
   auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
@@ -143,7 +174,7 @@ class BackwardAPI:
             params_code = self.args["args_define"]
         return f"""
 // {self.return_comment}
-{self.output_type} {self.backward_api}({params_code}) {{
+{self.return_type} {self.backward_api}({params_code}) {{
   return {invoke_code};
 }}
 """
diff --git a/python/paddle/utils/code_gen/gen_utils.py b/python/paddle/utils/code_gen/gen_utils.py
index 9d368c292b7cfefb0121aba9f0c0fcdc7b0a4caf..bdc29420558e910417e3b3deb7781e6b2d836766 100644
--- a/python/paddle/utils/code_gen/gen_utils.py
+++ b/python/paddle/utils/code_gen/gen_utils.py
@@ -124,7 +124,7 @@ def parse_output(api_name, output_config):
 
     if len(temp_list) == 1:
         out_type, out_name = parse_output_item(temp_list[0])
-        return out_type, out_name
+        return [out_type], out_name
     else:
         out_type_list = []
         out_name_list = []
@@ -133,8 +133,7 @@ def parse_output(api_name, output_config):
             out_type_list.append(out_type)
             out_name_list.append(out_name)
 
-        return "std::tuple<" + ",".join(out_type_list) + ">", ", ".join(
-            out_name_list)
",".join(out_type_list) + ">", ", ".join( - out_name_list) + return out_type_list, ", ".join(out_name_list) def gene_kernel_select(api, input_names, attrs, kernel) -> str: @@ -241,7 +240,7 @@ def gene_kernel_select(api, input_names, attrs, kernel) -> str: if len(input_names) > 0: kernel_select_code = kernel_select_code + f""" - if (kernel_backend == Backend::UNDEFINED + if (kernel_backend == Backend::UNDEFINED || kernel_layout == DataLayout::UNDEFINED || kernel_data_type == DataType::UNDEFINED ) {{ auto kernel_key_set = ParseKernelKeyByInputArgs({kernel_select_args}); @@ -315,24 +314,3 @@ def get_kernel_args(input_names, attrs, kernel_param): else: kernel_args = kernel_args + str(param) + ", " return input_tensor_code, kernel_args[:-2] - - -def gene_output(output_type): - kernel_output = "" - output_create = f""" - {output_type} out;""" - - if output_type == 'Tensor' or output_type == 'std::vector': - kernel_output = 'dense_out' - output_create = output_create + """ - auto dense_out = SetKernelOutput(out_meta, kernel_backend, &out);""" - elif re.match(r'std::tuple<.*>$', output_type): - out_num = output_type.count('Tensor') - for i in range(out_num): - kernel_output = kernel_output + f'dense_out_{i}, ' - output_create = output_create + f""" - auto dense_out_{i} = SetKernelOutput(std::get<{i}>(out_meta), kernel_backend, &std::get<{i}>(out));""" - - kernel_output = kernel_output[:-2] - - return kernel_output, output_create diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index f085eac1e358dd90e6377e015e360fc86ab6ca5c..853a98a62b504d94617127bd35212d2412719e1c 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -449,8 +449,8 @@ def _get_include_dirs_when_compiling(compile_dir): include_dirs_file = 'includes.txt' path = os.path.abspath(compile_dir) include_dirs_file = os.path.join(path, include_dirs_file) - if not os.path.isfile(include_dirs_file): - return [] + assert os.path.isfile(include_dirs_file), "File {} does not exist".format( + include_dirs_file) with open(include_dirs_file, 'r') as f: include_dirs = [line.strip() for line in f.readlines() if line.strip()] diff --git a/python/setup.py.in b/python/setup.py.in index aee4e149b0afeeb4c33d617b7db2a19bf2c36a91..e8cc2914521f323fd50e491dab1f8e7eb1421a3b 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -519,6 +519,10 @@ if '${WITH_XPU_BKCL}' == 'ON': shutil.copy('${XPU_BKCL_LIB}', libs_path) package_data['paddle.libs']+=['${XPU_BKCL_LIB_NAME}'] +if '${WITH_IPU}' == 'ON': + shutil.copy('${PADDLE_IPU_LIB}', libs_path) + package_data['paddle.libs'] += ['libpaddle_ipu' + ext_name] + # remove unused paddle/libs/__init__.py if os.path.isfile(libs_path+'/__init__.py'): os.remove(libs_path+'/__init__.py')