From c1e5a393694041046f6556019c73e16ddf53d5e3 Mon Sep 17 00:00:00 2001
From: Wilber <jiweibo@baidu.com>
Date: Tue, 25 Jan 2022 11:04:48 +0800
Subject: [PATCH] [PTEN] Add xpu context. (#39098)

---
 paddle/fluid/framework/pten_utils.h           |   7 +
 .../amp/check_finite_and_unscale_op_xpu.cc    |   4 +-
 paddle/fluid/operators/dropout_op_xpu.cc      |   4 +-
 paddle/fluid/operators/reshape_op.cc          |  10 +-
 paddle/fluid/operators/softmax_op_xpu.cc      |   4 +-
 paddle/fluid/platform/CMakeLists.txt          |   3 +
 .../fluid/platform/device/xpu/CMakeLists.txt  |   2 +-
 .../fluid/platform/device/xpu/enforce_xpu.h   | 157 +-------------
 paddle/fluid/platform/device/xpu/xpu_header.h |  39 +---
 paddle/fluid/platform/device/xpu/xpu_info.cc  | 120 ++---------
 paddle/fluid/platform/device/xpu/xpu_info.h   |  27 +--
 .../fluid/platform/device/xpu/xpu_op_list.cc  |  14 +-
 .../fluid/platform/device/xpu/xpu_op_list.h   |   6 +-
 paddle/fluid/platform/device_context.cc       |  44 +---
 paddle/fluid/platform/device_context.h        |  30 +--
 paddle/fluid/pybind/pybind.cc                 |  23 +-
 paddle/pten/backends/CMakeLists.txt           |  10 +-
 paddle/pten/backends/cpu/cpu_context.cc       |  22 +-
 paddle/pten/backends/xpu/CMakeLists.txt       |   2 +
 paddle/pten/backends/xpu/enforce_xpu.h        | 194 +++++++++++++++++
 paddle/pten/backends/xpu/forwards.h           |  28 +++
 paddle/pten/backends/xpu/xpu_context.cc       | 169 +++++++++++++++
 paddle/pten/backends/xpu/xpu_context.h        |  59 +++++-
 paddle/pten/backends/xpu/xpu_header.h         |  56 +++++
 paddle/pten/backends/xpu/xpu_info.cc          | 199 ++++++++++++++++++
 paddle/pten/backends/xpu/xpu_info.h           |  93 ++++++++
 paddle/pten/core/device_context.cc            |  51 ++++-
 paddle/pten/core/device_context.h             |  29 ++-
 28 files changed, 958 insertions(+), 448 deletions(-)
 create mode 100644 paddle/pten/backends/xpu/CMakeLists.txt
 create mode 100644 paddle/pten/backends/xpu/enforce_xpu.h
 create mode 100644 paddle/pten/backends/xpu/forwards.h
 create mode 100644 paddle/pten/backends/xpu/xpu_context.cc
 create mode 100644 paddle/pten/backends/xpu/xpu_header.h
 create mode 100644 paddle/pten/backends/xpu/xpu_info.cc
 create mode 100644 paddle/pten/backends/xpu/xpu_info.h
diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h
index ab129c6313d..4985e53ee65 100644
--- a/paddle/fluid/framework/pten_utils.h
+++ b/paddle/fluid/framework/pten_utils.h
@@ -86,5 +86,12 @@ struct ConvertToPtenContext<platform::CPUDeviceContext> {
   using TYPE = pten::CPUContext;
 };
 
+#ifdef PADDLE_WITH_XPU
+template <>
+struct ConvertToPtenContext<platform::XPUDeviceContext> {
+  using TYPE = pten::XPUContext;
+};
+#endif
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
index 979ae5c508c..5d769214df4 100644
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
@@ -94,11 +94,11 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
         inverse_scale = 0.0;
       }
 
-      paddle::platform::XPUVersion version = dev_ctx.xpu_version();
+      auto version = dev_ctx.xpu_version();
       framework::Tensor float_x;
       framework::Tensor float_out;
       if (std::is_same<T, paddle::platform::float16>::value &&
-          (version == paddle::platform::XPUVersion::XPU1)) {
+          (version == pten::backends::xpu::XPUVersion::XPU1)) {
         float_x.mutable_data<MPDType>(dev_ctx.GetPlace(),
                                       x->numel() * sizeof(MPDType));
         float_out.mutable_data<MPDType>(dev_ctx.GetPlace(),
diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc
index cded525b030..e80797bd9b9 100644
--- a/paddle/fluid/operators/dropout_op_xpu.cc
+++ b/paddle/fluid/operators/dropout_op_xpu.cc
@@ -107,8 +107,8 @@ class DropoutGradXPUKernel : public framework::OpKernel<T> {
       return;
     }
 
-    paddle::platform::XPUVersion version = dev_ctx.xpu_version();
-    if (version == paddle::platform::XPUVersion::XPU1) {
+    auto version = dev_ctx.xpu_version();
+    if (version == pten::backends::xpu::XPUVersion::XPU1) {
       xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
       XPUType* mask_new = RAII_GUARD.alloc_l3_or_gm<XPUType>(mask->numel());
       float scale =
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index dc82d7c6c1e..5170729a769 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -448,7 +448,8 @@ class ReshapeKernel {
 #ifdef PADDLE_WITH_XPU
     if (platform::is_xpu_place(ctx.GetPlace())) {
       auto &dev_ctx = ctx.device_context<platform::XPUDeviceContext>();
-      pten::ReshapeKernel(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out);
+      pten::ReshapeKernel(static_cast<const pten::XPUContext &>(dev_ctx),
+                          *pt_x.get(), pt_scalar_shape, pt_out);
     }
 #endif
     // non-inplace need move all result from pt_out to out, inplace need set
@@ -485,7 +486,8 @@ class ReshapeGradKernel {
 #ifdef PADDLE_WITH_XPU
     if (platform::is_xpu_place(ctx.GetPlace())) {
       auto &dev_ctx = ctx.device_context<platform::XPUDeviceContext>();
-      pten::ReshapeGradKernel(dev_ctx, *pt_d_out.get(), pt_d_x.get());
+      pten::ReshapeGradKernel(static_cast<const pten::XPUContext &>(dev_ctx),
+                              *pt_d_out.get(), pt_d_x.get());
     }
 #endif
   }
@@ -516,7 +518,9 @@ class ReshapeDoubleGradKernel {
 #ifdef PADDLE_WITH_XPU
     if (platform::is_xpu_place(ctx.GetPlace())) {
       auto &dev_ctx = ctx.device_context<platform::XPUDeviceContext>();
-      pten::ReshapeDoubleGradKernel(dev_ctx, *pt_dd_x.get(), pt_dd_out.get());
+      pten::ReshapeDoubleGradKernel(
+          static_cast<const pten::XPUContext &>(dev_ctx), *pt_dd_x.get(),
+          pt_dd_out.get());
     }
 #endif
   }
diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc
index 0adc12e684c..a0d4b4c4eb4 100644
--- a/paddle/fluid/operators/softmax_op_xpu.cc
+++ b/paddle/fluid/operators/softmax_op_xpu.cc
@@ -45,8 +45,8 @@ class SoftmaxXPUKernel : public framework::OpKernel<T> {
     auto& dev_ctx = context.template device_context<DeviceContext>();
 
     int r = XPU_SUCCESS;
-    paddle::platform::XPUVersion version = dev_ctx.xpu_version();
-    if (version == paddle::platform::XPUVersion::XPU1) {
+    auto version = dev_ctx.xpu_version();
+    if (version == pten::backends::xpu::XPUVersion::XPU1) {
       xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
       XPUType* clip_x_data_l3 = RAII_GUARD.alloc_l3_or_gm<XPUType>(x->numel());
       r = xpu::clip_v2(dev_ctx.x_context(),
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 97a31752333..5695fd03bac 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -121,6 +121,9 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
 cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
     place pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
     ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} cpu_context)
+if(WITH_XPU)
+  target_link_libraries(device_context xpu_context)
+endif()
 
 cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
 if(WITH_ASCEND_CL)
diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt
index f89c8c193ae..d292ce130eb 100644
--- a/paddle/fluid/platform/device/xpu/CMakeLists.txt
+++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt
@@ -4,7 +4,7 @@ endif()
 
 set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl)
 
-cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place)
+cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place pten_xpu_info)
 cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context)
 
 add_subdirectory(tests)
diff --git a/paddle/fluid/platform/device/xpu/enforce_xpu.h b/paddle/fluid/platform/device/xpu/enforce_xpu.h
index 4c85168f68d..ae5ec8e851d 100644
--- a/paddle/fluid/platform/device/xpu/enforce_xpu.h
+++ b/paddle/fluid/platform/device/xpu/enforce_xpu.h
@@ -15,177 +15,36 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "xpu/bkcl.h"
+
+#include "paddle/pten/backends/xpu/enforce_xpu.h"
 
 namespace paddle {
 namespace platform {
 
 // Note: XPU runtime api return int, not XPUError_t
 inline const char* xpuGetErrorString(int stat) {
-  switch (stat) {
-    case XPU_SUCCESS:
-      return "Success";
-    case XPUERR_INVALID_DEVICE:
-      return "Invalid XPU device";
-    case XPUERR_UNINIT:
-      return "XPU runtime not properly inited";
-    case XPUERR_NOMEM:
-      return "Device memory not enough";
-    case XPUERR_NOCPUMEM:
-      return "CPU memory not enough";
-    case XPUERR_INVALID_PARAM:
-      return "Invalid parameter";
-    case XPUERR_NOXPUFUNC:
-      return "Cannot get XPU Func";
-    case XPUERR_LDSO:
-      return "Error loading dynamic library";
-    case XPUERR_LDSYM:
-      return "Error loading func from dynamic library";
-    case XPUERR_SIMULATOR:
-      return "Error from XPU Simulator";
-    case XPUERR_NOSUPPORT:
-      return "Operation not supported";
-    case XPUERR_ABNORMAL:
-      return "Device abnormal due to previous error";
-    case XPUERR_KEXCEPTION:
-      return "Exception in kernel execution";
-    case XPUERR_TIMEOUT:
-      return "Kernel execution timed out";
-    case XPUERR_BUSY:
-      return "Resource busy";
-    case XPUERR_USEAFCLOSE:
-      return "Use a stream after closed";
-    case XPUERR_UCECC:
-      return "Uncorrectable ECC";
-    case XPUERR_OVERHEAT:
-      return "Overheat";
-    case XPUERR_UNEXPECT:
-      return "Execution error, reach unexpected control flow";
-    case XPUERR_DEVRESET:
-      return "Device is being reset, try again later";
-    case XPUERR_HWEXCEPTION:
-      return "Hardware module exception";
-    case XPUERR_HBM_INIT:
-      return "Error init HBM";
-    case XPUERR_DEVINIT:
-      return "Error init device";
-    case XPUERR_PEERRESET:
-      return "Device is being reset, try again later";
-    case XPUERR_MAXDEV:
-      return "Device count exceed limit";
-    case XPUERR_NOIOC:
-      return "Unknown IOCTL command";
-    case XPUERR_DMATIMEOUT:
-      return "DMA timed out, a reboot maybe needed";
-    case XPUERR_DMAABORT:
-      return "DMA aborted due to error, possibly wrong address or hardware "
-             "state";
-    case XPUERR_MCUUNINIT:
-      return "Firmware not initialized";
-    case XPUERR_OLDFW:
-      return "Firmware version too old (<15), please update.";
-    case XPUERR_PCIE:
-      return "Error in PCIE";
-    case XPUERR_FAULT:
-      return "Error copy between kernel and user space";
-    case XPUERR_INTERRUPTED:
-      return "Execution interrupted by user";
-    default:
-      return "unkonwn error";
-  }
+  return pten::backends::xpu::xpuGetErrorString(stat);
 }
 
 inline const char* bkclGetErrorString(BKCLResult_t stat) {
-  switch (stat) {
-    case BKCL_SUCCESS:
-      return "BKCL_SUCCESS";
-    case BKCL_INVALID_ARGUMENT:
-      return "BKCL_INVALID_ARGUMENT";
-    case BKCL_RUNTIME_ERROR:
-      return "BKCL_RUNTIME_ERROR";
-    case BKCL_SYSTEM_ERROR:
-      return "BKCL_SYSTEM_ERROR";
-    case BKCL_INTERNAL_ERROR:
-      return "BKCL_INTERNAL_ERROR";
-    default:
-      return "Unknown BKCL status";
-  }
+  return pten::backends::xpu::bkclGetErrorString(stat);
 }
 
 inline const char* xdnnGetErrorString(int stat) {
-  switch (stat) {
-    case xpu::Error_t::SUCCESS:
-      return "XDNN_SUCCESS";
-    case xpu::Error_t::INVALID_PARAM:
-      return "XDNN_INVALID_PARAM";
-    case xpu::Error_t::RUNTIME_ERROR:
-      return "XDNN_RUNTIME_ERROR";
-    case xpu::Error_t::NO_ENOUGH_WORKSPACE:
-      return "XDNN_NO_ENOUGH_WORKSPACE";
-    case xpu::Error_t::NOT_IMPLEMENT:
-      return "XDNN_NOT_IMPLEMENT";
-    default:
-      return "Unknown XDNN status";
-  }
+  return pten::backends::xpu::xdnnGetErrorString(stat);
 }
 
 inline std::string build_xpu_error_msg(int stat) {
-  std::string msg("XPU Error <" + std::to_string(stat) + ">, ");
-  return msg + xpuGetErrorString(stat) + " ";
+  return pten::backends::xpu::build_xpu_error_msg(stat);
 }
 
 inline std::string build_xpu_error_msg(BKCLResult_t stat) {
-  std::string msg("BKCL Error, ");
-  return msg + bkclGetErrorString(stat) + " ";
+  return pten::backends::xpu::build_xpu_error_msg(stat);
 }
 
 inline std::string build_xpu_xdnn_error_msg(int stat, std::string msg) {
-  return msg + " XDNN Error, " + xdnnGetErrorString(stat) + " ";
+  return pten::backends::xpu::build_xpu_xdnn_error_msg(stat, msg);
 }
 
-namespace details {
-
-template <typename T>
-struct ExternalApiType {};
-
-#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \
-  template <>                                         \
-  struct ExternalApiType<type> {                      \
-    using Type = type;                                \
-    static constexpr Type kSuccess = success_value;   \
-  }
-
-DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS);
-DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS);
-
-#undef DEFINE_EXTERNAL_API_TYPE
-
-}  // namespace details
-
-#define PADDLE_ENFORCE_XPU_SUCCESS(COND)                      \
-  do {                                                        \
-    auto __cond__ = (COND);                                   \
-    using __XPU_STATUS_TYPE__ = decltype(__cond__);           \
-    constexpr auto __success_type__ =                         \
-        ::paddle::platform::details::ExternalApiType<         \
-            __XPU_STATUS_TYPE__>::kSuccess;                   \
-    if (UNLIKELY(__cond__ != __success_type__)) {             \
-      auto __summary__ = paddle::platform::errors::External(  \
-          ::paddle::platform::build_xpu_error_msg(__cond__)); \
-      __THROW_ERROR_INTERNAL__(__summary__);                  \
-    }                                                         \
-  } while (0)
-
-#define PADDLE_ENFORCE_XDNN_SUCCESS(COND, MSG)                          \
-  do {                                                                  \
-    auto __cond__ = (COND);                                             \
-    if (UNLIKELY(__cond__ != xpu::Error_t::SUCCESS)) {                  \
-      auto __summary__ = paddle::platform::errors::External(            \
-          ::paddle::platform::build_xpu_xdnn_error_msg(__cond__, MSG)); \
-      __THROW_ERROR_INTERNAL__(__summary__);                            \
-    }                                                                   \
-  } while (0)
-
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device/xpu/xpu_header.h b/paddle/fluid/platform/device/xpu/xpu_header.h
index 1177fd63742..6b5c32fd511 100644
--- a/paddle/fluid/platform/device/xpu/xpu_header.h
+++ b/paddle/fluid/platform/device/xpu/xpu_header.h
@@ -15,42 +15,5 @@ limitations under the License. */
 #pragma once
 
 #ifdef PADDLE_WITH_XPU
-#include <map>
-#include <string>
-#include <unordered_map>
-
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/float16.h"
-
-#include "xpu/runtime.h"
-#include "xpu/runtime_ex.h"
-#include "xpu/xdnn.h"
-
-namespace xpu = baidu::xpu::api;
-
-static std::map<int, std::string> XPUAPIErrorMsg = {
-    {xpu::Error_t::SUCCESS, "xpu api success"},
-    {xpu::Error_t::INVALID_PARAM, "xpu api invalid param"},
-    {xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"},
-    {xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}};
-
-template <typename T>
-class XPUTypeTrait {
- public:
-  using Type = T;
-};
-
-template <>
-class XPUTypeTrait<paddle::platform::float16> {
- public:
-  using Type = float16;
-};
-
-template <>
-class XPUTypeTrait<paddle::platform::bfloat16> {
- public:
-  using Type = bfloat16;
-};
-
+#include "paddle/pten/backends/xpu/xpu_header.h"
 #endif
diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc
index a8c6ee8f3b0..cf08f9ada6b 100644
--- a/paddle/fluid/platform/device/xpu/xpu_info.cc
+++ b/paddle/fluid/platform/device/xpu/xpu_info.cc
@@ -14,22 +14,14 @@ limitations under the License. */
 #include <cstdlib>
 #include <string>
 #include "gflags/gflags.h"
+
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/string/split.h"
-
-PADDLE_DEFINE_EXPORTED_string(
-    selected_xpus, "",
-    "A list of device ids separated by comma, like: 0,1,2,3. "
-    "This option is useful when doing multi process training and "
-    "each process have only one device (XPU). If you want to use "
-    "all visible devices, set this to empty string. NOTE: the "
-    "reason of doing this is that we want to use P2P communication"
-    "between XPU devices, use XPU_VISIBLE_DEVICES can only use"
-    "share-memory only.");
+
+#include "paddle/pten/backends/xpu/xpu_info.h"
 
 namespace paddle {
 namespace platform {
@@ -37,101 +29,40 @@ namespace platform {
 /**************************** Version Management **************************/
 
 //! Get the version of XPU Driver
-int GetDriverVersion() {
-  uint32_t driver_version_major = 0;
-  uint32_t driver_version_minor = 0;
-  PADDLE_ENFORCE_XPU_SUCCESS(
-      xpu_get_driver_version(&driver_version_major, &driver_version_minor));
-  int driver_version = driver_version_major * 10 + driver_version_minor;
-  return driver_version;
-}
+int GetDriverVersion() { return pten::backends::xpu::GetDriverVersion(); }
 
 //! Get the version of XPU Runtime
-int GetRuntimeVersion() {
-  uint32_t rumtime_version_major = 0;
-  uint32_t rumtime_version_minor = 0;
-  PADDLE_ENFORCE_XPU_SUCCESS(
-      xpu_get_runtime_version(&rumtime_version_major, &rumtime_version_minor));
-  int runtime_version = rumtime_version_major * 10 + rumtime_version_minor;
-  return runtime_version;
-}
+int GetRuntimeVersion() { return pten::backends::xpu::GetRuntimeVersion(); }
 
 /**************************** Device Management **************************/
 
-static int GetDeviceCountImpl() {
-  const auto* xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES");
-  if (xpu_visible_devices != nullptr) {
-    std::string xpu_visible_devices_str(xpu_visible_devices);
-    if (std::all_of(xpu_visible_devices_str.begin(),
-                    xpu_visible_devices_str.end(),
-                    [](char ch) { return ch == ' '; })) {
-      VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected.";
-      return 0;
-    }
-  }
-
-  int count = 0;
-  PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_count(&count));
-  return count;
-}
-
-int GetXPUDeviceCount() {
-  static auto dev_cnt = GetDeviceCountImpl();
-  return dev_cnt;
-}
+int GetXPUDeviceCount() { return pten::backends::xpu::GetXPUDeviceCount(); }
 
 int GetXPUCurrentDeviceId() {
-  int dev_id;
-  PADDLE_ENFORCE_XPU_SUCCESS(xpu_current_device(&dev_id));
-  if (dev_id >= 64) {
-    // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
-    dev_id -= 64;
-  }
-  return dev_id;
+  return pten::backends::xpu::GetXPUCurrentDeviceId();
 }
 
-void SetXPUDeviceId(int id) {
-  PADDLE_ENFORCE_LT(
-      id, GetXPUDeviceCount(),
-      platform::errors::InvalidArgument("id must less than XPU count"));
-  PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id));
-}
+void SetXPUDeviceId(int id) { pten::backends::xpu::SetXPUDeviceId(id); }
 
 //! Get a list of device ids from environment variable or use all.
 std::vector<int> GetXPUSelectedDevices() {
   // use user specified XPUs in single-node multi-process mode.
-  std::vector<int> devices;
-  if (!FLAGS_selected_xpus.empty()) {
-    auto devices_str = paddle::string::Split(FLAGS_selected_xpus, ',');
-    for (auto id : devices_str) {
-      devices.push_back(atoi(id.c_str()));
-    }
-  } else {
-    int count = GetXPUDeviceCount();
-    for (int i = 0; i < count; ++i) {
-      devices.push_back(i);
-    }
-  }
-  return devices;
+  return pten::backends::xpu::GetXPUSelectedDevices();
 }
 
 /**************************** Memory Management **************************/
 
 void MemcpySyncH2D(void* dst, const void* src, size_t count,
                    const platform::XPUPlace& dst_place) {
-  platform::XPUDeviceGuard guard(dst_place.device);
-  PADDLE_ENFORCE_XPU_SUCCESS(
-      xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE));
+  pten::backends::xpu::MemcpySyncH2D(dst, src, count, dst_place);
 }
 
 void MemcpySyncD2H(void* dst, const void* src, size_t count,
                    const platform::XPUPlace& src_place) {
-  platform::XPUDeviceGuard guard(src_place.device);
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.GetByPlace(src_place);
   dev_ctx->Wait();
-  PADDLE_ENFORCE_XPU_SUCCESS(
-      xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST));
+  pten::backends::xpu::MemcpySyncD2H(dst, src, count, src_place, *dev_ctx);
 }
 
 // if src.device == dst.device and you need sync , after call this function,
@@ -139,33 +70,16 @@ void MemcpySyncD2H(void* dst, const void* src, size_t count,
 void MemcpySyncD2D(void* dst, const platform::XPUPlace& dst_place,
                    const void* src, const platform::XPUPlace& src_place,
                    size_t count) {
-  int dev_id = GetXPUCurrentDeviceId();
-  if (dst_place.device == dev_id && src_place.device == dev_id) {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto* dev_ctx = pool.GetByPlace(src_place);
-    PADDLE_ENFORCE_XDNN_SUCCESS(
-        xpu::copy(dev_ctx->x_context(), static_cast<const int8_t*>(src),
-                  static_cast<int8_t*>(dst), count),
-        "copy ");
-  } else {
-    PADDLE_ENFORCE_XPU_SUCCESS(
-        xpu_memcpy_peer(dst_place.device, dst, src_place.device, src, count));
-  }
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.GetByPlace(src_place);
+  pten::backends::xpu::MemcpySyncD2D(dst, dst_place, src, src_place, count,
+                                     *dev_ctx);
 }
 
 /**************************** Others **************************/
 
-XPUVersion get_xpu_version(int dev_id) {
-  uint64_t v = 0;
-  PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id));
-
-  if (v == K100 || v == K200) {
-    VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n";
-    return XPU1;
-  } else {
-    VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n";
-    return XPU2;
-  }
+pten::backends::xpu::XPUVersion get_xpu_version(int dev_id) {
+  return pten::backends::xpu::get_xpu_version(dev_id);
 }
 
 }  // namespace platform
diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h
index 220bebb9e6b..03082e8dc50 100644
--- a/paddle/fluid/platform/device/xpu/xpu_info.h
+++ b/paddle/fluid/platform/device/xpu/xpu_info.h
@@ -13,6 +13,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 #include <vector>
 #include "paddle/fluid/platform/place.h"
+#include "paddle/pten/backends/xpu/xpu_info.h"
 
 namespace paddle {
 namespace platform {
@@ -50,31 +51,9 @@ void MemcpySyncD2D(void *dst, const platform::XPUPlace &dst_place,
                    const void *src, const platform::XPUPlace &src_place,
                    size_t count);
 
-class XPUDeviceGuard {
- public:
-  explicit inline XPUDeviceGuard(int dev_id) {
-    int prev_id = platform::GetXPUCurrentDeviceId();
-    if (prev_id != dev_id) {
-      prev_id_ = prev_id;
-      platform::SetXPUDeviceId(dev_id);
-    }
-  }
+using XPUDeviceGuard = pten::backends::xpu::XPUDeviceGuard;
 
-  inline ~XPUDeviceGuard() {
-    if (prev_id_ != -1) {
-      platform::SetXPUDeviceId(prev_id_);
-    }
-  }
-
-  XPUDeviceGuard(const XPUDeviceGuard &o) = delete;
-  XPUDeviceGuard &operator=(const XPUDeviceGuard &o) = delete;
-
- private:
-  int prev_id_{-1};
-};
-
-enum XPUVersion { XPU1, XPU2 };
-XPUVersion get_xpu_version(int dev_id);
+pten::backends::xpu::XPUVersion get_xpu_version(int dev_id);
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
index 36be4a55d0a..e9b494024bd 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc
+++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
@@ -24,7 +24,7 @@ namespace platform {
 bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) {
   auto& ops = get_kl1_ops();
   auto v = get_xpu_version(type.place_.device);
-  if (v == XPU2) {
+  if (v == pten::backends::xpu::XPUVersion::XPU2) {
     ops = get_kl2_ops();
   }
 
@@ -74,10 +74,11 @@ bool is_in_xpu_black_list(const std::string& op_name) {
   return false;
 }
 
-std::vector<vartype::Type> get_xpu_op_support_type(const std::string& op_name,
-                                                   XPUVersion version) {
+std::vector<vartype::Type> get_xpu_op_support_type(
+    const std::string& op_name, pten::backends::xpu::XPUVersion version) {
   std::vector<vartype::Type> res;
-  auto& ops = version == XPU1 ? get_kl1_ops() : get_kl2_ops();
+  auto& ops = version == pten::backends::xpu::XPUVersion::XPU1 ? get_kl1_ops()
+                                                               : get_kl2_ops();
   if (ops.find(op_name) != ops.end()) {
     XPUKernelSet& type_set = ops[op_name];
     for (auto& item : type_set) {
@@ -87,9 +88,10 @@ std::vector<vartype::Type> get_xpu_op_support_type(const std::string& op_name,
   return res;
 }
 
-XPUOpListMap get_xpu_op_list(XPUVersion version) {
+XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version) {
   XPUOpListMap res;
-  auto& ops = version == XPU1 ? get_kl1_ops() : get_kl2_ops();
+  auto& ops = version == pten::backends::xpu::XPUVersion::XPU1 ? get_kl1_ops()
+                                                               : get_kl2_ops();
   for (auto& op : ops) {
     std::vector<vartype::Type> op_vartypes;
     for (auto& item : op.second) {
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.h b/paddle/fluid/platform/device/xpu/xpu_op_list.h
index 3672d68492a..4c3eb097a14 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu_op_list.h
@@ -27,9 +27,9 @@ using XPUOpListMap =
 bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type);
 bool is_in_xpu_black_list(const std::string& op_name);
 
-std::vector<vartype::Type> get_xpu_op_support_type(const std::string& op_name,
-                                                   XPUVersion version);
-XPUOpListMap get_xpu_op_list(XPUVersion version);
+std::vector<vartype::Type> get_xpu_op_support_type(
+    const std::string& op_name, pten::backends::xpu::XPUVersion version);
+XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version);
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 6ffeaf101fe..bfb1f572068 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -246,52 +246,14 @@ IPUDeviceContext::~IPUDeviceContext() {}
 
 #endif
 #ifdef PADDLE_WITH_XPU
-XPUDeviceContext::XPUDeviceContext() {
-  context_ = xpu::create_context();
-  xpu_version_ = get_xpu_version(place_.device);
-}
+XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {}
 
 XPUDeviceContext::~XPUDeviceContext() {}
 
-XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
-  platform::XPUDeviceGuard guard(place.device);
-
+XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) {
   LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
-                          << static_cast<int>(place_.device);
-
-  context_ = xpu::create_context();
-  const int MAX_XPU_NUM = 16;
-  static void* l3ptrs[MAX_XPU_NUM] = {nullptr};
-
-  int l3_size = 13.5 * 1024 * 1024;
-  if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) {
-    l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE"));
-  }
-
-  auto selected_xpus = GetXPUSelectedDevices();
-  for (unsigned int i = 0; i < selected_xpus.size(); i++) {
-    if (place.device == selected_xpus[i]) {
-      if (l3ptrs[place.device] == nullptr) {
-        xpu_malloc(static_cast<void**>(&l3ptrs[place.device]), l3_size,
-                   XPU_MEM_L3);
-      }
-      if (l3ptrs[place.device] != nullptr) {
-        context_->_l3_mgr.set(l3ptrs[place.device], l3_size);
-        VLOG(3) << "xpu place " << place.device << " set l3 size " << l3_size;
-      }
-      break;
-    }
-  }
+                          << static_cast<int>(place.device);
 }
-
-void XPUDeviceContext::Wait() const {
-  platform::SetXPUDeviceId(place_.device);
-  xpu_wait(context_->xpu_stream);
-}
-
-Place XPUDeviceContext::GetPlace() const { return place_; }
-
-xpu::Context* XPUDeviceContext::x_context() const { return context_; }
 #endif
 
 #ifdef PADDLE_WITH_ASCEND_CL
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 78c09dca5b4..52f17cd986c 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -78,6 +78,7 @@ struct GpuDevice;
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 #include "paddle/fluid/platform/device/xpu/xpu_info.h"
+#include "paddle/pten/backends/xpu/xpu_context.h"
 #endif
 
 #ifdef PADDLE_WITH_ASCEND_CL
@@ -171,39 +172,12 @@ struct DefaultDeviceContextType<platform::MLUPlace>;
 
 #ifdef PADDLE_WITH_XPU
 namespace xpu = baidu::xpu::api;
-class XPUDeviceContext : public DeviceContext {
+class XPUDeviceContext : public pten::XPUContext {
  public:
   XPUDeviceContext();
   explicit XPUDeviceContext(XPUPlace place);
   virtual ~XPUDeviceContext();
   Eigen::DefaultDevice* eigen_device() const { return nullptr; }
-  XPUVersion xpu_version() const { return xpu_version_; }
-  Place GetPlace() const override;
-  xpu::Context* x_context() const;
-
-  /*! \brief  Wait for all operations completion in the stream. */
-  void Wait() const override;
-
-#ifdef PADDLE_WITH_XPU_BKCL
-  /*! \brief  Return bkcl context. */
-  BKCLContext_t bkcl_context() const { return bkcl_context_; }
-
-  /*! \brief  Set bkcl context. */
-  void set_bkcl_context(BKCLContext_t context) { bkcl_context_ = context; }
-#endif
-
- private:
-  XPUPlace place_;
-  XPUVersion xpu_version_;
-  xpu::Context* context_;
-#ifdef PADDLE_WITH_XPU_BKCL
-  BKCLContext_t bkcl_context_;
-#endif
-
-  // Need to be the same with other DeviceContext,
-  // Eventhough eigen_device_ is not used in XPU
-  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
-  DISABLE_COPY_AND_ASSIGN(XPUDeviceContext);
 };
 
 template <>
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 454e3b524f5..34dc0b2c050 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1756,27 +1756,30 @@ All parameter, weight, gradient are variables in Paddle.
       .def("__repr__", string::to_string<const platform::XPUPlace &>)
       .def("__str__", string::to_string<const platform::XPUPlace &>);
 #ifdef PADDLE_WITH_XPU
-  py::enum_<platform::XPUVersion>(m, "XPUVersion", py::arithmetic())
-      .value("XPU1", platform::XPUVersion::XPU1)
-      .value("XPU2", platform::XPUVersion::XPU2)
+  py::enum_<pten::backends::xpu::XPUVersion>(m, "XPUVersion", py::arithmetic())
+      .value("XPU1", pten::backends::xpu::XPUVersion::XPU1)
+      .value("XPU2", pten::backends::xpu::XPUVersion::XPU2)
       .export_values();
   m.def("get_xpu_device_count", platform::GetXPUDeviceCount);
   m.def("get_xpu_device_version",
         [](int device_id) { return platform::get_xpu_version(device_id); });
-  m.def("get_xpu_device_op_support_types",
-        [](const std::string &op_name, platform::XPUVersion version) {
-          return platform::get_xpu_op_support_type(op_name, version);
-        });
-  m.def("get_xpu_device_op_list", [](platform::XPUVersion version) {
+  m.def(
+      "get_xpu_device_op_support_types",
+      [](const std::string &op_name, pten::backends::xpu::XPUVersion version) {
+        return platform::get_xpu_op_support_type(op_name, version);
+      });
+  m.def("get_xpu_device_op_list", [](pten::backends::xpu::XPUVersion version) {
     return platform::get_xpu_op_list(version);
   });
   m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool {
-    // XPUs with Compute Capability > xpu2 support float16 and bfloat16
+    // XPUs newer than XPU1 (i.e. XPU2 and later) support float16 and bfloat16
-    return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1;
+    return platform::get_xpu_version(place.device) >
+           pten::backends::xpu::XPUVersion::XPU1;
   });
   m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool {
-    // XPUs with Compute Capability > xpu2 support float16 and bfloat16
+    // XPUs newer than XPU1 (i.e. XPU2 and later) support float16 and bfloat16
-    return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1;
+    return platform::get_xpu_version(place.device) >
+           pten::backends::xpu::XPUVersion::XPU1;
   });
 #endif
 
diff --git a/paddle/pten/backends/CMakeLists.txt b/paddle/pten/backends/CMakeLists.txt
index 3587910ff50..e9f222d642e 100644
--- a/paddle/pten/backends/CMakeLists.txt
+++ b/paddle/pten/backends/CMakeLists.txt
@@ -2,4 +2,12 @@ add_subdirectory(dynload)
 
 add_subdirectory(cpu)
 
-cc_library(pten_context SRCS all_context.cc DEPS device_context)
+if(WITH_XPU)
+  add_subdirectory(xpu)
+endif()
+
+cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context)
+
+if(WITH_XPU)
+  add_dependencies(pten_context xpu_context)
+endif()
diff --git a/paddle/pten/backends/cpu/cpu_context.cc b/paddle/pten/backends/cpu/cpu_context.cc
index e749dfb9bd7..efce128596b 100644
--- a/paddle/pten/backends/cpu/cpu_context.cc
+++ b/paddle/pten/backends/cpu/cpu_context.cc
@@ -18,16 +18,11 @@
 
 // NOTE: The paddle framework should add WITH_EIGEN option to support compile
 // without eigen.
-#include "paddle/pten/core/device_context.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
 namespace pten {
 
 struct CPUContext::CPUImpl {
-  Eigen::DefaultDevice* device_{nullptr};
-  CPUContextResource res_;
-  CPUPlace place_;
-
   CPUImpl() { device_ = new Eigen::DefaultDevice(); }
 
   // Users need to manage external resources.
@@ -36,7 +31,7 @@ struct CPUContext::CPUImpl {
   }
 
   ~CPUImpl() {
-    if (res_.device == nullptr) {
+    if (res_.device == nullptr && device_ != nullptr) {
       delete device_;
       device_ = nullptr;
     }
@@ -56,27 +51,28 @@ struct CPUContext::CPUImpl {
   }
 
   Place GetPlace() const { return place_; }
+
+  Eigen::DefaultDevice* device_{nullptr};
+  CPUContextResource res_;
+  CPUPlace place_;
 };
 
-CPUContext::CPUContext() : DeviceContext(), cpu_impl_(nullptr) {
+CPUContext::CPUContext() : DeviceContext() {
   cpu_impl_ = std::make_unique<CPUImpl>();
 }
 
-CPUContext::CPUContext(const CPUContext& other)
-    : DeviceContext(), cpu_impl_(nullptr) {
+CPUContext::CPUContext(const CPUContext& other) : DeviceContext() {
   cpu_impl_ = std::make_unique<CPUImpl>();
   cpu_impl_->SetEigenDevice(other.eigen_device());
 }
 
-CPUContext::CPUContext(CPUContext&& other)
-    : DeviceContext(), cpu_impl_(nullptr) {
+CPUContext::CPUContext(CPUContext&& other) : DeviceContext() {
   cpu_impl_ = std::move(other.cpu_impl_);
 }
 
 CPUContext::~CPUContext() = default;
 
-CPUContext::CPUContext(const CPUContextResource& ctx_res)
-    : DeviceContext(), cpu_impl_(nullptr) {
+CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() {
   cpu_impl_ = std::make_unique<CPUImpl>(ctx_res);
 }
 
diff --git a/paddle/pten/backends/xpu/CMakeLists.txt b/paddle/pten/backends/xpu/CMakeLists.txt
new file mode 100644
index 00000000000..65341dd206f
--- /dev/null
+++ b/paddle/pten/backends/xpu/CMakeLists.txt
@@ -0,0 +1,2 @@
+cc_library(pten_xpu_info SRCS xpu_info.cc DEPS enforce xpulib pten_place)
+cc_library(xpu_context SRCS xpu_context.cc DEPS pten_device_context pten_xpu_info)
diff --git a/paddle/pten/backends/xpu/enforce_xpu.h b/paddle/pten/backends/xpu/enforce_xpu.h
new file mode 100644
index 00000000000..38aeff198d4
--- /dev/null
+++ b/paddle/pten/backends/xpu/enforce_xpu.h
@@ -0,0 +1,194 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/backends/xpu/xpu_header.h"
+#include "xpu/bkcl.h"
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace pten {
+namespace backends {
+namespace xpu {
+
+// Map an XPU runtime status code (a plain int, not XPUError_t) to a message.
+inline const char* xpuGetErrorString(int stat) {
+  switch (stat) {
+    case XPU_SUCCESS:
+      return "Success";
+    case XPUERR_INVALID_DEVICE:
+      return "Invalid XPU device";
+    case XPUERR_UNINIT:
+      return "XPU runtime not properly inited";
+    case XPUERR_NOMEM:
+      return "Device memory not enough";
+    case XPUERR_NOCPUMEM:
+      return "CPU memory not enough";
+    case XPUERR_INVALID_PARAM:
+      return "Invalid parameter";
+    case XPUERR_NOXPUFUNC:
+      return "Cannot get XPU Func";
+    case XPUERR_LDSO:
+      return "Error loading dynamic library";
+    case XPUERR_LDSYM:
+      return "Error loading func from dynamic library";
+    case XPUERR_SIMULATOR:
+      return "Error from XPU Simulator";
+    case XPUERR_NOSUPPORT:
+      return "Operation not supported";
+    case XPUERR_ABNORMAL:
+      return "Device abnormal due to previous error";
+    case XPUERR_KEXCEPTION:
+      return "Exception in kernel execution";
+    case XPUERR_TIMEOUT:
+      return "Kernel execution timed out";
+    case XPUERR_BUSY:
+      return "Resource busy";
+    case XPUERR_USEAFCLOSE:
+      return "Use a stream after closed";
+    case XPUERR_UCECC:
+      return "Uncorrectable ECC";
+    case XPUERR_OVERHEAT:
+      return "Overheat";
+    case XPUERR_UNEXPECT:
+      return "Execution error, reach unexpected control flow";
+    case XPUERR_DEVRESET:
+      return "Device is being reset, try again later";
+    case XPUERR_HWEXCEPTION:
+      return "Hardware module exception";
+    case XPUERR_HBM_INIT:
+      return "Error init HBM";
+    case XPUERR_DEVINIT:
+      return "Error init device";
+    case XPUERR_PEERRESET:
+      return "Device is being reset, try again later";
+    case XPUERR_MAXDEV:
+      return "Device count exceed limit";
+    case XPUERR_NOIOC:
+      return "Unknown IOCTL command";
+    case XPUERR_DMATIMEOUT:
+      return "DMA timed out, a reboot maybe needed";
+    case XPUERR_DMAABORT:
+      return "DMA aborted due to error, possibly wrong address or hardware "
+             "state";
+    case XPUERR_MCUUNINIT:
+      return "Firmware not initialized";
+    case XPUERR_OLDFW:
+      return "Firmware version too old (<15), please update.";
+    case XPUERR_PCIE:
+      return "Error in PCIE";
+    case XPUERR_FAULT:
+      return "Error copy between kernel and user space";
+    case XPUERR_INTERRUPTED:
+      return "Execution interrupted by user";
+    default:
+      return "unknown error";
+  }
+}
+
+inline const char* bkclGetErrorString(BKCLResult_t stat) {
+  switch (stat) {
+    case BKCL_SUCCESS:
+      return "BKCL_SUCCESS";
+    case BKCL_INVALID_ARGUMENT:
+      return "BKCL_INVALID_ARGUMENT";
+    case BKCL_RUNTIME_ERROR:
+      return "BKCL_RUNTIME_ERROR";
+    case BKCL_SYSTEM_ERROR:
+      return "BKCL_SYSTEM_ERROR";
+    case BKCL_INTERNAL_ERROR:
+      return "BKCL_INTERNAL_ERROR";
+    default:
+      return "Unknown BKCL status";
+  }
+}
+
+inline const char* xdnnGetErrorString(int stat) {
+  switch (stat) {
+    case baidu::xpu::api::Error_t::SUCCESS:
+      return "XDNN_SUCCESS";
+    case baidu::xpu::api::Error_t::INVALID_PARAM:
+      return "XDNN_INVALID_PARAM";
+    case baidu::xpu::api::Error_t::RUNTIME_ERROR:
+      return "XDNN_RUNTIME_ERROR";
+    case baidu::xpu::api::Error_t::NO_ENOUGH_WORKSPACE:
+      return "XDNN_NO_ENOUGH_WORKSPACE";
+    case baidu::xpu::api::Error_t::NOT_IMPLEMENT:
+      return "XDNN_NOT_IMPLEMENT";
+    default:
+      return "Unknown XDNN status";
+  }
+}
+
+inline std::string build_xpu_error_msg(int stat) {
+  std::string msg("XPU Error <" + std::to_string(stat) + ">, ");
+  return msg + xpuGetErrorString(stat) + " ";
+}
+
+inline std::string build_xpu_error_msg(BKCLResult_t stat) {
+  std::string msg("BKCL Error, ");
+  return msg + bkclGetErrorString(stat) + " ";
+}
+
+inline std::string build_xpu_xdnn_error_msg(int stat, std::string msg) {
+  return msg + " XDNN Error, " + xdnnGetErrorString(stat) + " ";
+}
+
+namespace details {
+
+template <typename T>
+struct ExternalApiType {};
+
+#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \
+  template <>                                         \
+  struct ExternalApiType<type> {                      \
+    using Type = type;                                \
+    static constexpr Type kSuccess = success_value;   \
+  }
+
+DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS);
+DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS);
+
+#undef DEFINE_EXTERNAL_API_TYPE
+
+}  // namespace details
+
+#define PADDLE_ENFORCE_XPU_SUCCESS(COND)                         \
+  do {                                                           \
+    auto __cond__ = (COND);                                      \
+    using __XPU_STATUS_TYPE__ = decltype(__cond__);              \
+    constexpr auto __success_type__ =                            \
+        ::pten::backends::xpu::details::ExternalApiType<         \
+            __XPU_STATUS_TYPE__>::kSuccess;                      \
+    if (UNLIKELY(__cond__ != __success_type__)) {                \
+      auto __summary__ = paddle::platform::errors::External(     \
+          ::pten::backends::xpu::build_xpu_error_msg(__cond__)); \
+      __THROW_ERROR_INTERNAL__(__summary__);                     \
+    }                                                            \
+  } while (0)
+
+#define PADDLE_ENFORCE_XDNN_SUCCESS(COND, MSG)                             \
+  do {                                                                     \
+    auto __cond__ = (COND);                                                \
+    if (UNLIKELY(__cond__ != baidu::xpu::api::Error_t::SUCCESS)) {         \
+      auto __summary__ = paddle::platform::errors::External(               \
+          ::pten::backends::xpu::build_xpu_xdnn_error_msg(__cond__, MSG)); \
+      __THROW_ERROR_INTERNAL__(__summary__);                               \
+    }                                                                      \
+  } while (0)
+
+}  // namespace xpu
+}  // namespace backends
+}  // namespace pten
diff --git a/paddle/pten/backends/xpu/forwards.h b/paddle/pten/backends/xpu/forwards.h
new file mode 100644
index 00000000000..805a74865b6
--- /dev/null
+++ b/paddle/pten/backends/xpu/forwards.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// Forward-declares.
+#pragma once
+
+// Forward declaration of xpu context.
+namespace baidu {
+namespace xpu {
+namespace api {
+
+struct Context;
+typedef void* BKCLContext_t;
+
+}  // namespace api
+}  // namespace xpu
+}  // namespace baidu
diff --git a/paddle/pten/backends/xpu/xpu_context.cc b/paddle/pten/backends/xpu/xpu_context.cc
new file mode 100644
index 00000000000..af4478662a5
--- /dev/null
+++ b/paddle/pten/backends/xpu/xpu_context.cc
@@ -0,0 +1,169 @@
+//   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/backends/xpu/xpu_context.h"
+#include <memory>
+#include "paddle/pten/api/ext/exception.h"
+
+#include "xpu/runtime.h"
+#include "xpu/runtime_ex.h"
+#include "xpu/xdnn.h"
+
+namespace xpu = baidu::xpu::api;
+
+namespace pten {
+
+struct XPUContext::XPUImpl {
+  // Attach an L3 buffer to this context when its device is among the selected
+  // XPUs. Buffers are allocated once per device id and cached in a static
+  // table; XPU_PADDLE_L3_SIZE overrides the default size of 13.5MB.
+  void SetL3Cache() {
+    constexpr int kMaxXpuNum = 16;
+    static void* l3ptrs[kMaxXpuNum] = {nullptr};
+    const int dev_id = place_.GetDeviceId();
+    // A null context or an id outside the table cannot take an L3 buffer.
+    if (context_ == nullptr || dev_id < 0 || dev_id >= kMaxXpuNum) return;
+
+    int l3_size = 13.5 * 1024 * 1024;
+    const char* l3_env = std::getenv("XPU_PADDLE_L3_SIZE");
+    if (l3_env != nullptr) l3_size = atoi(l3_env);
+    for (int selected_id : backends::xpu::GetXPUSelectedDevices()) {
+      if (selected_id != dev_id) continue;
+      if (l3ptrs[dev_id] == nullptr) {
+        // The result is checked through the pointer below, not the code.
+        xpu_malloc(&l3ptrs[dev_id], l3_size, XPU_MEM_L3);
+      }
+      if (l3ptrs[dev_id] != nullptr) {
+        context_->_l3_mgr.set(l3ptrs[dev_id], l3_size);
+        VLOG(3) << "xpu place " << dev_id << " set l3 size " << l3_size;
+      }
+      break;
+    }
+  }
+
+  XPUImpl() {
+    context_ = xpu::create_context();
+    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
+  }
+
+  explicit XPUImpl(XPUPlace place) : place_(place) {
+    backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
+
+    LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
+                            << static_cast<int>(place_.device);
+
+    context_ = xpu::create_context();
+    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
+    SetL3Cache();
+  }
+
+  // Users need to manage external resources.
+  explicit XPUImpl(const XPUContextResource& ctx_res,
+                   const XPUPlace& place = XPUPlace(0))
+      : res_(ctx_res), place_(place) {
+    context_ = res_.context;
+    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
+    SetL3Cache();
+  }
+
+  ~XPUImpl() {
+    if (res_.context == nullptr && context_ != nullptr) {
+      xpu::destroy_context(context_);
+      context_ = nullptr;
+    }
+  }
+
+  Place GetPlace() const { return place_; }
+
+  backends::xpu::XPUVersion GetXpuVersion() const { return xpu_version_; }
+
+  xpu::Context* GetXContext() const {
+    PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
+    return context_;
+  }
+
+  xpu::BKCLContext_t GetBkclContext() const { return bkcl_context_; }
+
+  void Wait() const {
+    backends::xpu::SetXPUDeviceId(place_.GetDeviceId());
+    PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
+    xpu_wait(context_->xpu_stream);
+  }
+
+  // Adopt an external context (null is ignored), freeing the one owned before.
+  void SetXContext(xpu::Context* context) {
+    if (context == nullptr) return;
+    const bool owned = res_.context == nullptr && context_ != nullptr;
+    if (owned && context_ != context) xpu::destroy_context(context_);
+    res_.context = context_ = context;
+  }
+
+  void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; }
+
+  XPUContextResource res_;
+  XPUPlace place_;
+  backends::xpu::XPUVersion xpu_version_;
+  xpu::Context* context_{nullptr};
+  // NOTE: Distributed communicator, distributed framework manages its
+  // resources, XPUContext only holds references.
+  xpu::BKCLContext_t bkcl_context_{nullptr};
+};
+
+XPUContext::XPUContext() : DeviceContext() {
+  impl_ = std::make_unique<XPUImpl>();
+}
+
+XPUContext::XPUContext(const XPUPlace& place) {
+  impl_ = std::make_unique<XPUImpl>(place);
+}
+
+XPUContext::XPUContext(const XPUContext& other) : DeviceContext() {
+  impl_ = std::make_unique<XPUImpl>();
+  impl_->SetXContext(other.x_context());
+  impl_->SetBkclContext(other.bkcl_context());
+}
+
+XPUContext::XPUContext(XPUContext&& other) : DeviceContext() {
+  impl_ = std::move(other.impl_);
+}
+
+XPUContext::~XPUContext() = default;
+
+XPUContext::XPUContext(const XPUContextResource& ctx_res) : DeviceContext() {
+  impl_ = std::make_unique<XPUImpl>(ctx_res);
+}
+
+Place XPUContext::GetPlace() const { return impl_->GetPlace(); }
+
+backends::xpu::XPUVersion XPUContext::xpu_version() const {
+  return impl_->GetXpuVersion();
+}
+
+xpu::Context* XPUContext::x_context() const { return impl_->GetXContext(); }
+
+xpu::BKCLContext_t XPUContext::bkcl_context() const {
+  return impl_->GetBkclContext();
+}
+
+void XPUContext::Wait() const { impl_->Wait(); }
+
+void XPUContext::set_x_context(xpu::Context* context) {
+  impl_->SetXContext(context);
+}
+
+void XPUContext::set_bkcl_context(xpu::BKCLContext_t context) {
+  impl_->SetBkclContext(context);
+}
+
+}  // namespace pten
diff --git a/paddle/pten/backends/xpu/xpu_context.h b/paddle/pten/backends/xpu/xpu_context.h
index 94d2a1532f6..4ae5786211d 100644
--- a/paddle/pten/backends/xpu/xpu_context.h
+++ b/paddle/pten/backends/xpu/xpu_context.h
@@ -14,13 +14,60 @@ limitations under the License. */
 
 #pragma once
 
-#ifdef PADDLE_WITH_XPU
+#include <memory>
+#include "paddle/pten/backends/xpu/forwards.h"
+#include "paddle/pten/common/place.h"
+#include "paddle/pten/core/device_context.h"
 
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/backends/xpu/xpu_header.h"
+#include "paddle/pten/backends/xpu/xpu_info.h"
+
+namespace xpu = baidu::xpu::api;
 
 namespace pten {
-using XPUContext = paddle::platform::XPUDeviceContext;
-}  // namespace pten
 
-#endif  // PADDLE_WITH_XPU
+struct XPUContextResource {
+  xpu::Context* context{nullptr};
+};
+
+class XPUContext : public DeviceContext {
+ public:
+  // NOTE: DeviceContext hold resources. Used in training scenarios.
+  XPUContext();
+
+  explicit XPUContext(const XPUPlace&);
+
+  // NOTE: Share the same underlying resources, please ensure that resources are
+  // not released.
+  XPUContext(const XPUContext&);
+
+  XPUContext(XPUContext&&);
+
+  virtual ~XPUContext();
+
+  Place GetPlace() const override;
+
+  backends::xpu::XPUVersion xpu_version() const;
+
+  xpu::Context* x_context() const;
+
+  // Return bkcl context.
+  xpu::BKCLContext_t bkcl_context() const;
+
+  // Wait for all operations completion in the stream.
+  void Wait() const override;
+
+ public:
+  // NOTE: External users manage resources. Used in inference scenarios.
+  explicit XPUContext(const XPUContextResource&);
+
+  void set_x_context(xpu::Context*);
+
+  void set_bkcl_context(xpu::BKCLContext_t context);
+
+ private:
+  struct XPUImpl;
+  std::unique_ptr<XPUImpl> impl_;
+};
+
+}  // namespace pten
diff --git a/paddle/pten/backends/xpu/xpu_header.h b/paddle/pten/backends/xpu/xpu_header.h
new file mode 100644
index 00000000000..99e4a06720f
--- /dev/null
+++ b/paddle/pten/backends/xpu/xpu_header.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+#include <map>
+#include <string>
+#include <unordered_map>
+
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/common/bfloat16.h"
+#include "paddle/pten/common/float16.h"
+
+#include "xpu/runtime.h"
+#include "xpu/runtime_ex.h"
+#include "xpu/xdnn.h"
+
+namespace xpu = baidu::xpu::api;
+
+static std::map<int, std::string> XPUAPIErrorMsg = {
+    {xpu::Error_t::SUCCESS, "xpu api success"},
+    {xpu::Error_t::INVALID_PARAM, "xpu api invalid param"},
+    {xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"},
+    {xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}};
+
+template <typename T>
+class XPUTypeTrait {
+ public:
+  using Type = T;
+};
+
+template <>
+class XPUTypeTrait<pten::dtype::float16> {
+ public:
+  using Type = float16;
+};
+
+template <>
+class XPUTypeTrait<pten::dtype::bfloat16> {
+ public:
+  using Type = bfloat16;
+};
+
+#endif
diff --git a/paddle/pten/backends/xpu/xpu_info.cc b/paddle/pten/backends/xpu/xpu_info.cc
new file mode 100644
index 00000000000..01d23be848b
--- /dev/null
+++ b/paddle/pten/backends/xpu/xpu_info.cc
@@ -0,0 +1,199 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/pten/backends/xpu/xpu_info.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <string>
+
+#include "paddle/pten/backends/xpu/enforce_xpu.h"
+#include "paddle/pten/backends/xpu/xpu_context.h"
+#include "paddle/pten/backends/xpu/xpu_header.h"
+#include "paddle/pten/common/place.h"
+
+// TODO(wilber): The pten computing library requires a component to manage
+// flags.
+#include "paddle/fluid/platform/flags.h"
+
+PADDLE_DEFINE_EXPORTED_string(
+    selected_xpus,
+    "",
+    "A list of device ids separated by comma, like: 0,1,2,3. "
+    "This option is useful when doing multi process training and "
+    "each process have only one device (XPU). If you want to use "
+    "all visible devices, set this to empty string. NOTE: the "
+    "reason of doing this is that we want to use P2P communication "
+    "between XPU devices, use XPU_VISIBLE_DEVICES can only use "
+    "share-memory only.");
+
+namespace pten {
+class XPUContext;
+
+namespace backends {
+namespace xpu {
+
+/**************************** Version Management **************************/
+
+//! Get the version of XPU Driver
+int GetDriverVersion() {
+  uint32_t driver_version_major = 0;
+  uint32_t driver_version_minor = 0;
+  PADDLE_ENFORCE_XPU_SUCCESS(
+      xpu_get_driver_version(&driver_version_major, &driver_version_minor));
+  int driver_version = driver_version_major * 10 + driver_version_minor;
+  return driver_version;
+}
+
+//! Get the version of XPU Runtime, encoded as major * 10 + minor.
+int GetRuntimeVersion() {
+  uint32_t major = 0;
+  uint32_t minor = 0;
+  // The enforce macro raises an error when the runtime query fails.
+  PADDLE_ENFORCE_XPU_SUCCESS(xpu_get_runtime_version(&major, &minor));
+  const int encoded_version = major * 10 + minor;
+  return encoded_version;
+}
+
+/**************************** Device Management **************************/
+
+static int GetDeviceCountImpl() {
+  const auto* xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES");
+  if (xpu_visible_devices != nullptr) {
+    std::string xpu_visible_devices_str(xpu_visible_devices);
+    if (std::all_of(xpu_visible_devices_str.begin(),
+                    xpu_visible_devices_str.end(),
+                    [](char ch) { return ch == ' '; })) {
+      VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected.";
+      return 0;
+    }
+  }
+
+  int count = 0;
+  PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_count(&count));
+  return count;
+}
+
+int GetXPUDeviceCount() {
+  static auto dev_cnt = GetDeviceCountImpl();
+  return dev_cnt;
+}
+
+int GetXPUCurrentDeviceId() {
+  int dev_id;
+  PADDLE_ENFORCE_XPU_SUCCESS(xpu_current_device(&dev_id));
+  if (dev_id >= 64) {
+    // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
+    dev_id -= 64;
+  }
+  return dev_id;
+}
+
+void SetXPUDeviceId(int id) {
+  PADDLE_ENFORCE_LT(
+      id,
+      GetXPUDeviceCount(),
+      paddle::platform::errors::InvalidArgument("id must less than XPU count"));
+  PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id));
+}
+
+static inline std::vector<std::string> Split(std::string const& original,
+                                             char separator) {
+  std::vector<std::string> results;
+  std::string token;
+  std::istringstream is(original);
+  while (std::getline(is, token, separator)) {
+    if (!token.empty()) {
+      results.push_back(token);
+    }
+  }
+  return results;
+}
+
+//! Get the device ids listed in FLAGS_selected_xpus, or every device if empty.
+std::vector<int> GetXPUSelectedDevices() {
+  std::vector<int> devices;
+  if (!FLAGS_selected_xpus.empty()) {
+    // User-specified XPUs, e.g. in single-node multi-process mode.
+    const auto devices_str = Split(FLAGS_selected_xpus, ',');
+    for (const auto& id : devices_str) {  // by reference: no string copy
+      devices.push_back(atoi(id.c_str()));
+    }
+  } else {
+    const int count = GetXPUDeviceCount();
+    for (int i = 0; i < count; ++i) {
+      devices.push_back(i);
+    }
+  }
+  return devices;
+}
+
+/**************************** Memory Management **************************/
+
+void MemcpySyncH2D(void* dst,
+                   const void* src,
+                   size_t count,
+                   const pten::XPUPlace& dst_place) {
+  XPUDeviceGuard guard(dst_place.device);
+  PADDLE_ENFORCE_XPU_SUCCESS(
+      xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE));
+}
+
+void MemcpySyncD2H(void* dst,
+                   const void* src,
+                   size_t count,
+                   const pten::XPUPlace& src_place,
+                   const pten::XPUContext& dev_ctx) {
+  XPUDeviceGuard guard(src_place.GetDeviceId());
+  dev_ctx.Wait();
+  PADDLE_ENFORCE_XPU_SUCCESS(
+      xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST));
+}
+
+// NOTE: when src and dst are both on the current device the copy is issued
+// through dev_ctx; if you need it synchronized, call xpu_wait() afterwards.
+void MemcpySyncD2D(void* dst,
+                   const pten::XPUPlace& dst_place,
+                   const void* src,
+                   const pten::XPUPlace& src_place,
+                   size_t count,
+                   const pten::XPUContext& dev_ctx) {
+  int dev_id = GetXPUCurrentDeviceId();
+  if (dst_place.device == dev_id && src_place.device == dev_id) {
+    PADDLE_ENFORCE_XDNN_SUCCESS(
+        baidu::xpu::api::copy(dev_ctx.x_context(),
+                              static_cast<const int8_t*>(src),
+                              static_cast<int8_t*>(dst),
+                              count),
+        "copy ");
+  } else {
+    PADDLE_ENFORCE_XPU_SUCCESS(
+        xpu_memcpy_peer(dst_place.device, dst, src_place.device, src, count));
+  }
+}
+
+/**************************** Others **************************/
+
+XPUVersion get_xpu_version(int dev_id) {
+  uint64_t v = 0;
+  PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id));
+
+  if (v == K100 || v == K200) {
+    VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n";
+    return XPU1;
+  } else {
+    VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n";
+    return XPU2;
+  }
+}
+
+}  // namespace xpu
+}  // namespace backends
+}  // namespace pten
diff --git a/paddle/pten/backends/xpu/xpu_info.h b/paddle/pten/backends/xpu/xpu_info.h
new file mode 100644
index 00000000000..8cf836ba16d
--- /dev/null
+++ b/paddle/pten/backends/xpu/xpu_info.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/pten/common/place.h"
+
+namespace pten {
+
+class XPUContext;
+
+namespace backends {
+namespace xpu {
+
+/***** Version Management *****/
+
+//! Get the version of XPU Driver
+int GetDriverVersion();
+
+//! Get the version of XPU Runtime
+int GetRuntimeVersion();
+
+/***** Device Management *****/
+
+//! Get the total number of XPU devices in system.
+int GetXPUDeviceCount();
+
+//! Set the XPU device id for next execution.
+void SetXPUDeviceId(int device_id);
+
+//! Get the current XPU device id in system.
+int GetXPUCurrentDeviceId();
+
+//! Get a list of device ids from environment variable or use all.
+std::vector<int> GetXPUSelectedDevices();
+
+/***** Memory Management *****/
+
+//! Copy memory from address src to dst synchronously.
+void MemcpySyncH2D(void *dst,
+                   const void *src,
+                   size_t count,
+                   const pten::XPUPlace &dst_place);
+void MemcpySyncD2H(void *dst,
+                   const void *src,
+                   size_t count,
+                   const pten::XPUPlace &src_place,
+                   const pten::XPUContext &dev_ctx);
+void MemcpySyncD2D(void *dst,
+                   const pten::XPUPlace &dst_place,
+                   const void *src,
+                   const pten::XPUPlace &src_place,
+                   size_t count,
+                   const pten::XPUContext &dev_ctx);
+
+class XPUDeviceGuard {
+ public:
+  explicit inline XPUDeviceGuard(int dev_id) {
+    int prev_id = GetXPUCurrentDeviceId();
+    if (prev_id != dev_id) {
+      prev_id_ = prev_id;
+      SetXPUDeviceId(dev_id);
+    }
+  }
+
+  inline ~XPUDeviceGuard() {
+    if (prev_id_ != -1) {
+      SetXPUDeviceId(prev_id_);
+    }
+  }
+
+  XPUDeviceGuard(const XPUDeviceGuard &o) = delete;
+  XPUDeviceGuard &operator=(const XPUDeviceGuard &o) = delete;
+
+ private:
+  int prev_id_{-1};
+};
+
+enum XPUVersion { XPU1, XPU2 };
+XPUVersion get_xpu_version(int dev_id);
+
+}  // namespace xpu
+}  // namespace backends
+}  // namespace pten
diff --git a/paddle/pten/core/device_context.cc b/paddle/pten/core/device_context.cc
index 7b2c4a2cf17..7566b351bf6 100644
--- a/paddle/pten/core/device_context.cc
+++ b/paddle/pten/core/device_context.cc
@@ -13,28 +13,45 @@
 // limitations under the License.
 
 #include "paddle/pten/core/device_context.h"
+#include "paddle/pten/api/ext/exception.h"
 
 namespace pten {
 
 struct DeviceContext::Impl {
-  Allocator* allocator_{nullptr};
-
   Impl() = default;
   ~Impl() = default;
 
-  void SetAllocator(Allocator* allocator) { allocator_ = allocator; }
+  void SetDeviceAllocator(Allocator* allocator) {
+    device_allocator_ = allocator;
+  }
+
+  void SetHostAllocator(Allocator* allocator) { host_allocator_ = allocator; }
+
+  const Allocator& GetDeviceAllocator() const {
+    PD_CHECK(device_allocator_ != nullptr, "the device_allocator is nullptr.");
+    return *device_allocator_;  // Null case is handled by PD_CHECK above.
+  }
 
-  const Allocator& GetAllocator() const { return *allocator_; }
+  const Allocator& GetHostAllocator() const {
+    PD_CHECK(host_allocator_ != nullptr, "the host_allocator is nullptr.");
+    return *host_allocator_;  // Null case is handled by PD_CHECK above.
+  }
 
   // TODO(Wilber): Add impl. It seems that tensorbase not have interface to
   // communicate with allocator.
-  void Alloc(TensorBase* tensor) {}
+  void HostAlloc(TensorBase* tensor) {}    // Empty for now; see TODO above.
+  void DeviceAlloc(TensorBase* tensor) {}  // Empty for now; see TODO above.
+
+  Allocator* device_allocator_{nullptr};  // Null until SetDeviceAllocator().
+  Allocator* host_allocator_{nullptr};    // Null until SetHostAllocator().
 };
 
 DeviceContext::DeviceContext() { impl_ = std::make_unique<Impl>(); }
 
 DeviceContext::DeviceContext(const DeviceContext& other) {
-  impl_->SetAllocator(const_cast<Allocator*>(&other.GetAllocator()));
+  impl_ = std::make_unique<Impl>();  // Create our own Impl before use.
+  SetDeviceAllocator(const_cast<Allocator*>(&other.GetDeviceAllocator()));
+  SetHostAllocator(const_cast<Allocator*>(&other.GetHostAllocator()));
 }
 
 DeviceContext::DeviceContext(DeviceContext&& other) {
@@ -43,14 +60,26 @@ DeviceContext::DeviceContext(DeviceContext&& other) {
 
 DeviceContext::~DeviceContext() = default;
 
-void DeviceContext::SetAllocator(Allocator* allocator) {
-  impl_->SetAllocator(allocator);
+void DeviceContext::SetHostAllocator(Allocator* allocator) {
+  impl_->SetHostAllocator(allocator);
+}
+
+void DeviceContext::SetDeviceAllocator(Allocator* allocator) {
+  impl_->SetDeviceAllocator(allocator);
+}
+
+const Allocator& DeviceContext::GetHostAllocator() const {
+  return impl_->GetHostAllocator();
 }
 
-const Allocator& DeviceContext::GetAllocator() const {
-  return impl_->GetAllocator();
+const Allocator& DeviceContext::GetDeviceAllocator() const {
+  return impl_->GetDeviceAllocator();
 }
 
-void DeviceContext::Alloc(TensorBase* tensor) { impl_->Alloc(tensor); }
+void DeviceContext::HostAlloc(TensorBase* tensor) { impl_->HostAlloc(tensor); }
+
+void DeviceContext::DeviceAlloc(TensorBase* tensor) {
+  impl_->DeviceAlloc(tensor);
+}
 
 }  // namespace pten
diff --git a/paddle/pten/core/device_context.h b/paddle/pten/core/device_context.h
index 1ee2e21494b..c658a24c352 100644
--- a/paddle/pten/core/device_context.h
+++ b/paddle/pten/core/device_context.h
@@ -57,19 +57,38 @@ class DeviceContext {
    *
    * @param allocator
    */
-  void SetAllocator(Allocator*);
+  void SetDeviceAllocator(Allocator*);
 
   /**
-   * @brief Get the const Allocator object.
+   * @brief Get the const device-related Allocator object.
    *
    * @return Allocator
    */
-  const Allocator& GetAllocator() const;
+  const Allocator& GetDeviceAllocator() const;
 
   /**
-   * @brief Allocate memory for tensor.
+   * @brief Allocate device memory for tensor.
    */
-  void Alloc(pten::TensorBase*);
+  void DeviceAlloc(pten::TensorBase*);
+
+  /**
+   * @brief Set the host Allocator object.
+   *
+   * @param allocator
+   */
+  void SetHostAllocator(Allocator*);
+
+  /**
+   * @brief Get the const host Allocator object.
+   *
+   * @return Allocator
+   */
+  const Allocator& GetHostAllocator() const;
+
+  /**
+   * @brief Allocate host memory for tensor.
+   */
+  void HostAlloc(pten::TensorBase*);
 
   // TODO(wilber): Just for the convenience of migrating the code, it will be
   // modified or removed later.
-- 
GitLab