Unverified commit c1e5a393, authored by: W Wilber, committed by: GitHub

[PTEN] Add xpu context. (#39098)

Parent commit: b2a7261d
......@@ -86,5 +86,12 @@ struct ConvertToPtenContext<platform::CPUDeviceContext> {
using TYPE = pten::CPUContext;
};
#ifdef PADDLE_WITH_XPU
// Maps the fluid XPUDeviceContext onto its pten-side equivalent so generic
// kernel code can translate framework contexts into pten contexts.
template <>
struct ConvertToPtenContext<platform::XPUDeviceContext> {
  using TYPE = pten::XPUContext;
};
#endif
} // namespace framework
} // namespace paddle
......@@ -94,11 +94,11 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
inverse_scale = 0.0;
}
paddle::platform::XPUVersion version = dev_ctx.xpu_version();
auto version = dev_ctx.xpu_version();
framework::Tensor float_x;
framework::Tensor float_out;
if (std::is_same<T, paddle::platform::float16>::value &&
(version == paddle::platform::XPUVersion::XPU1)) {
(version == pten::backends::xpu::XPUVersion::XPU1)) {
float_x.mutable_data<MPDType>(dev_ctx.GetPlace(),
x->numel() * sizeof(MPDType));
float_out.mutable_data<MPDType>(dev_ctx.GetPlace(),
......
......@@ -107,8 +107,8 @@ class DropoutGradXPUKernel : public framework::OpKernel<T> {
return;
}
paddle::platform::XPUVersion version = dev_ctx.xpu_version();
if (version == paddle::platform::XPUVersion::XPU1) {
auto version = dev_ctx.xpu_version();
if (version == pten::backends::xpu::XPUVersion::XPU1) {
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
XPUType* mask_new = RAII_GUARD.alloc_l3_or_gm<XPUType>(mask->numel());
float scale =
......
......@@ -448,7 +448,8 @@ class ReshapeKernel {
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::XPUDeviceContext>();
pten::ReshapeKernel(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out);
pten::ReshapeKernel(static_cast<const pten::XPUContext &>(dev_ctx),
*pt_x.get(), pt_scalar_shape, pt_out);
}
#endif
// non-inplace need move all result from pt_out to out, inplace need set
......@@ -485,7 +486,8 @@ class ReshapeGradKernel {
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::XPUDeviceContext>();
pten::ReshapeGradKernel(dev_ctx, *pt_d_out.get(), pt_d_x.get());
pten::ReshapeGradKernel(static_cast<const pten::XPUContext &>(dev_ctx),
*pt_d_out.get(), pt_d_x.get());
}
#endif
}
......@@ -516,7 +518,9 @@ class ReshapeDoubleGradKernel {
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::XPUDeviceContext>();
pten::ReshapeDoubleGradKernel(dev_ctx, *pt_dd_x.get(), pt_dd_out.get());
pten::ReshapeDoubleGradKernel(
static_cast<const pten::XPUContext &>(dev_ctx), *pt_dd_x.get(),
pt_dd_out.get());
}
#endif
}
......
......@@ -45,8 +45,8 @@ class SoftmaxXPUKernel : public framework::OpKernel<T> {
auto& dev_ctx = context.template device_context<DeviceContext>();
int r = XPU_SUCCESS;
paddle::platform::XPUVersion version = dev_ctx.xpu_version();
if (version == paddle::platform::XPUVersion::XPU1) {
auto version = dev_ctx.xpu_version();
if (version == pten::backends::xpu::XPUVersion::XPU1) {
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
XPUType* clip_x_data_l3 = RAII_GUARD.alloc_l3_or_gm<XPUType>(x->numel());
r = xpu::clip_v2(dev_ctx.x_context(),
......
......@@ -121,6 +121,9 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
place pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} cpu_context)
if(WITH_XPU)
target_link_libraries(device_context xpu_context)
endif()
cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
if(WITH_ASCEND_CL)
......
......@@ -4,7 +4,7 @@ endif()
set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl)
cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place)
cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place pten_xpu_info)
cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context)
add_subdirectory(tests)
......@@ -15,177 +15,36 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#include "paddle/fluid/platform/enforce.h"
#include "xpu/bkcl.h"
#include "paddle/pten/backends/xpu/enforce_xpu.h"
namespace paddle {
namespace platform {
// Note: XPU runtime api return int, not XPUError_t
inline const char* xpuGetErrorString(int stat) {
switch (stat) {
case XPU_SUCCESS:
return "Success";
case XPUERR_INVALID_DEVICE:
return "Invalid XPU device";
case XPUERR_UNINIT:
return "XPU runtime not properly inited";
case XPUERR_NOMEM:
return "Device memory not enough";
case XPUERR_NOCPUMEM:
return "CPU memory not enough";
case XPUERR_INVALID_PARAM:
return "Invalid parameter";
case XPUERR_NOXPUFUNC:
return "Cannot get XPU Func";
case XPUERR_LDSO:
return "Error loading dynamic library";
case XPUERR_LDSYM:
return "Error loading func from dynamic library";
case XPUERR_SIMULATOR:
return "Error from XPU Simulator";
case XPUERR_NOSUPPORT:
return "Operation not supported";
case XPUERR_ABNORMAL:
return "Device abnormal due to previous error";
case XPUERR_KEXCEPTION:
return "Exception in kernel execution";
case XPUERR_TIMEOUT:
return "Kernel execution timed out";
case XPUERR_BUSY:
return "Resource busy";
case XPUERR_USEAFCLOSE:
return "Use a stream after closed";
case XPUERR_UCECC:
return "Uncorrectable ECC";
case XPUERR_OVERHEAT:
return "Overheat";
case XPUERR_UNEXPECT:
return "Execution error, reach unexpected control flow";
case XPUERR_DEVRESET:
return "Device is being reset, try again later";
case XPUERR_HWEXCEPTION:
return "Hardware module exception";
case XPUERR_HBM_INIT:
return "Error init HBM";
case XPUERR_DEVINIT:
return "Error init device";
case XPUERR_PEERRESET:
return "Device is being reset, try again later";
case XPUERR_MAXDEV:
return "Device count exceed limit";
case XPUERR_NOIOC:
return "Unknown IOCTL command";
case XPUERR_DMATIMEOUT:
return "DMA timed out, a reboot maybe needed";
case XPUERR_DMAABORT:
return "DMA aborted due to error, possibly wrong address or hardware "
"state";
case XPUERR_MCUUNINIT:
return "Firmware not initialized";
case XPUERR_OLDFW:
return "Firmware version too old (<15), please update.";
case XPUERR_PCIE:
return "Error in PCIE";
case XPUERR_FAULT:
return "Error copy between kernel and user space";
case XPUERR_INTERRUPTED:
return "Execution interrupted by user";
default:
return "unkonwn error";
}
return pten::backends::xpu::xpuGetErrorString(stat);
}
inline const char* bkclGetErrorString(BKCLResult_t stat) {
switch (stat) {
case BKCL_SUCCESS:
return "BKCL_SUCCESS";
case BKCL_INVALID_ARGUMENT:
return "BKCL_INVALID_ARGUMENT";
case BKCL_RUNTIME_ERROR:
return "BKCL_RUNTIME_ERROR";
case BKCL_SYSTEM_ERROR:
return "BKCL_SYSTEM_ERROR";
case BKCL_INTERNAL_ERROR:
return "BKCL_INTERNAL_ERROR";
default:
return "Unknown BKCL status";
}
return pten::backends::xpu::bkclGetErrorString(stat);
}
inline const char* xdnnGetErrorString(int stat) {
switch (stat) {
case xpu::Error_t::SUCCESS:
return "XDNN_SUCCESS";
case xpu::Error_t::INVALID_PARAM:
return "XDNN_INVALID_PARAM";
case xpu::Error_t::RUNTIME_ERROR:
return "XDNN_RUNTIME_ERROR";
case xpu::Error_t::NO_ENOUGH_WORKSPACE:
return "XDNN_NO_ENOUGH_WORKSPACE";
case xpu::Error_t::NOT_IMPLEMENT:
return "XDNN_NOT_IMPLEMENT";
default:
return "Unknown XDNN status";
}
return pten::backends::xpu::xdnnGetErrorString(stat);
}
inline std::string build_xpu_error_msg(int stat) {
std::string msg("XPU Error <" + std::to_string(stat) + ">, ");
return msg + xpuGetErrorString(stat) + " ";
return pten::backends::xpu::build_xpu_error_msg(stat);
}
inline std::string build_xpu_error_msg(BKCLResult_t stat) {
std::string msg("BKCL Error, ");
return msg + bkclGetErrorString(stat) + " ";
return pten::backends::xpu::build_xpu_error_msg(stat);
}
inline std::string build_xpu_xdnn_error_msg(int stat, std::string msg) {
return msg + " XDNN Error, " + xdnnGetErrorString(stat) + " ";
return pten::backends::xpu::build_xpu_xdnn_error_msg(stat, msg);
}
namespace details {
template <typename T>
struct ExternalApiType {};
#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \
template <> \
struct ExternalApiType<type> { \
using Type = type; \
static constexpr Type kSuccess = success_value; \
}
DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS);
DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS);
#undef DEFINE_EXTERNAL_API_TYPE
} // namespace details
#define PADDLE_ENFORCE_XPU_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
using __XPU_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::paddle::platform::details::ExternalApiType< \
__XPU_STATUS_TYPE__>::kSuccess; \
if (UNLIKELY(__cond__ != __success_type__)) { \
auto __summary__ = paddle::platform::errors::External( \
::paddle::platform::build_xpu_error_msg(__cond__)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
} while (0)
#define PADDLE_ENFORCE_XDNN_SUCCESS(COND, MSG) \
do { \
auto __cond__ = (COND); \
if (UNLIKELY(__cond__ != xpu::Error_t::SUCCESS)) { \
auto __summary__ = paddle::platform::errors::External( \
::paddle::platform::build_xpu_xdnn_error_msg(__cond__, MSG)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
} while (0)
} // namespace platform
} // namespace paddle
......@@ -15,42 +15,5 @@ limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_XPU
#include <map>
#include <string>
#include <unordered_map>
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
#include "xpu/xdnn.h"
namespace xpu = baidu::xpu::api;
static std::map<int, std::string> XPUAPIErrorMsg = {
{xpu::Error_t::SUCCESS, "xpu api success"},
{xpu::Error_t::INVALID_PARAM, "xpu api invalid param"},
{xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"},
{xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}};
template <typename T>
class XPUTypeTrait {
public:
using Type = T;
};
template <>
class XPUTypeTrait<paddle::platform::float16> {
public:
using Type = float16;
};
template <>
class XPUTypeTrait<paddle::platform::bfloat16> {
public:
using Type = bfloat16;
};
#include "paddle/pten/backends/xpu/xpu_header.h"
#endif
......@@ -14,22 +14,14 @@ limitations under the License. */
#include <cstdlib>
#include <string>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
PADDLE_DEFINE_EXPORTED_string(
selected_xpus, "",
"A list of device ids separated by comma, like: 0,1,2,3. "
"This option is useful when doing multi process training and "
"each process have only one device (XPU). If you want to use "
"all visible devices, set this to empty string. NOTE: the "
"reason of doing this is that we want to use P2P communication"
"between XPU devices, use XPU_VISIBLE_DEVICES can only use"
"share-memory only.");
#include "paddle/pten/backends/xpu/xpu_info.h"
namespace paddle {
namespace platform {
......@@ -37,101 +29,40 @@ namespace platform {
/**************************** Version Management **************************/
//! Get the version of XPU Driver
int GetDriverVersion() {
uint32_t driver_version_major = 0;
uint32_t driver_version_minor = 0;
PADDLE_ENFORCE_XPU_SUCCESS(
xpu_get_driver_version(&driver_version_major, &driver_version_minor));
int driver_version = driver_version_major * 10 + driver_version_minor;
return driver_version;
}
int GetDriverVersion() { return pten::backends::xpu::GetDriverVersion(); }
//! Get the version of XPU Runtime
int GetRuntimeVersion() {
uint32_t rumtime_version_major = 0;
uint32_t rumtime_version_minor = 0;
PADDLE_ENFORCE_XPU_SUCCESS(
xpu_get_runtime_version(&rumtime_version_major, &rumtime_version_minor));
int runtime_version = rumtime_version_major * 10 + rumtime_version_minor;
return runtime_version;
}
int GetRuntimeVersion() { return pten::backends::xpu::GetRuntimeVersion(); }
/**************************** Device Management **************************/
static int GetDeviceCountImpl() {
const auto* xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES");
if (xpu_visible_devices != nullptr) {
std::string xpu_visible_devices_str(xpu_visible_devices);
if (std::all_of(xpu_visible_devices_str.begin(),
xpu_visible_devices_str.end(),
[](char ch) { return ch == ' '; })) {
VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected.";
return 0;
}
}
int count = 0;
PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_count(&count));
return count;
}
int GetXPUDeviceCount() {
static auto dev_cnt = GetDeviceCountImpl();
return dev_cnt;
}
int GetXPUDeviceCount() { return pten::backends::xpu::GetXPUDeviceCount(); }
int GetXPUCurrentDeviceId() {
int dev_id;
PADDLE_ENFORCE_XPU_SUCCESS(xpu_current_device(&dev_id));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
return dev_id;
return pten::backends::xpu::GetXPUCurrentDeviceId();
}
void SetXPUDeviceId(int id) {
PADDLE_ENFORCE_LT(
id, GetXPUDeviceCount(),
platform::errors::InvalidArgument("id must less than XPU count"));
PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id));
}
void SetXPUDeviceId(int id) { pten::backends::xpu::SetXPUDeviceId(id); }
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetXPUSelectedDevices() {
// use user specified XPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_xpus.empty()) {
auto devices_str = paddle::string::Split(FLAGS_selected_xpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetXPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
return pten::backends::xpu::GetXPUSelectedDevices();
}
/**************************** Memory Management **************************/
void MemcpySyncH2D(void* dst, const void* src, size_t count,
const platform::XPUPlace& dst_place) {
platform::XPUDeviceGuard guard(dst_place.device);
PADDLE_ENFORCE_XPU_SUCCESS(
xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE));
pten::backends::xpu::MemcpySyncH2D(dst, src, count, dst_place);
}
void MemcpySyncD2H(void* dst, const void* src, size_t count,
const platform::XPUPlace& src_place) {
platform::XPUDeviceGuard guard(src_place.device);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetByPlace(src_place);
dev_ctx->Wait();
PADDLE_ENFORCE_XPU_SUCCESS(
xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST));
pten::backends::xpu::MemcpySyncD2H(dst, src, count, src_place, *dev_ctx);
}
// if src.device == dst.device and you need sync , after call this function,
......@@ -139,33 +70,16 @@ void MemcpySyncD2H(void* dst, const void* src, size_t count,
void MemcpySyncD2D(void* dst, const platform::XPUPlace& dst_place,
const void* src, const platform::XPUPlace& src_place,
size_t count) {
int dev_id = GetXPUCurrentDeviceId();
if (dst_place.device == dev_id && src_place.device == dev_id) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetByPlace(src_place);
PADDLE_ENFORCE_XDNN_SUCCESS(
xpu::copy(dev_ctx->x_context(), static_cast<const int8_t*>(src),
static_cast<int8_t*>(dst), count),
"copy ");
} else {
PADDLE_ENFORCE_XPU_SUCCESS(
xpu_memcpy_peer(dst_place.device, dst, src_place.device, src, count));
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetByPlace(src_place);
pten::backends::xpu::MemcpySyncD2D(dst, dst_place, src, src_place, count,
*dev_ctx);
}
/**************************** Others **************************/
XPUVersion get_xpu_version(int dev_id) {
uint64_t v = 0;
PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id));
if (v == K100 || v == K200) {
VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n";
return XPU1;
} else {
VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n";
return XPU2;
}
pten::backends::xpu::XPUVersion get_xpu_version(int dev_id) {
return pten::backends::xpu::get_xpu_version(dev_id);
}
} // namespace platform
......
......@@ -13,6 +13,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <vector>
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/backends/xpu/xpu_info.h"
namespace paddle {
namespace platform {
......@@ -50,31 +51,9 @@ void MemcpySyncD2D(void *dst, const platform::XPUPlace &dst_place,
const void *src, const platform::XPUPlace &src_place,
size_t count);
class XPUDeviceGuard {
public:
explicit inline XPUDeviceGuard(int dev_id) {
int prev_id = platform::GetXPUCurrentDeviceId();
if (prev_id != dev_id) {
prev_id_ = prev_id;
platform::SetXPUDeviceId(dev_id);
}
}
using XPUDeviceGuard = pten::backends::xpu::XPUDeviceGuard;
inline ~XPUDeviceGuard() {
if (prev_id_ != -1) {
platform::SetXPUDeviceId(prev_id_);
}
}
XPUDeviceGuard(const XPUDeviceGuard &o) = delete;
XPUDeviceGuard &operator=(const XPUDeviceGuard &o) = delete;
private:
int prev_id_{-1};
};
enum XPUVersion { XPU1, XPU2 };
XPUVersion get_xpu_version(int dev_id);
pten::backends::xpu::XPUVersion get_xpu_version(int dev_id);
} // namespace platform
} // namespace paddle
......
......@@ -24,7 +24,7 @@ namespace platform {
bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) {
auto& ops = get_kl1_ops();
auto v = get_xpu_version(type.place_.device);
if (v == XPU2) {
if (v == pten::backends::xpu::XPUVersion::XPU2) {
ops = get_kl2_ops();
}
......@@ -74,10 +74,11 @@ bool is_in_xpu_black_list(const std::string& op_name) {
return false;
}
std::vector<vartype::Type> get_xpu_op_support_type(const std::string& op_name,
XPUVersion version) {
std::vector<vartype::Type> get_xpu_op_support_type(
const std::string& op_name, pten::backends::xpu::XPUVersion version) {
std::vector<vartype::Type> res;
auto& ops = version == XPU1 ? get_kl1_ops() : get_kl2_ops();
auto& ops = version == pten::backends::xpu::XPUVersion::XPU1 ? get_kl1_ops()
: get_kl2_ops();
if (ops.find(op_name) != ops.end()) {
XPUKernelSet& type_set = ops[op_name];
for (auto& item : type_set) {
......@@ -87,9 +88,10 @@ std::vector<vartype::Type> get_xpu_op_support_type(const std::string& op_name,
return res;
}
XPUOpListMap get_xpu_op_list(XPUVersion version) {
XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version) {
XPUOpListMap res;
auto& ops = version == XPU1 ? get_kl1_ops() : get_kl2_ops();
auto& ops = version == pten::backends::xpu::XPUVersion::XPU1 ? get_kl1_ops()
: get_kl2_ops();
for (auto& op : ops) {
std::vector<vartype::Type> op_vartypes;
for (auto& item : op.second) {
......
......@@ -27,9 +27,9 @@ using XPUOpListMap =
bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type);
bool is_in_xpu_black_list(const std::string& op_name);
std::vector<vartype::Type> get_xpu_op_support_type(const std::string& op_name,
XPUVersion version);
XPUOpListMap get_xpu_op_list(XPUVersion version);
std::vector<vartype::Type> get_xpu_op_support_type(
const std::string& op_name, pten::backends::xpu::XPUVersion version);
XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version);
} // namespace platform
} // namespace paddle
......
......@@ -246,52 +246,14 @@ IPUDeviceContext::~IPUDeviceContext() {}
#endif
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() {
context_ = xpu::create_context();
xpu_version_ = get_xpu_version(place_.device);
}
XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {}
XPUDeviceContext::~XPUDeviceContext() {}
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
platform::XPUDeviceGuard guard(place.device);
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) {
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
<< static_cast<int>(place_.device);
context_ = xpu::create_context();
const int MAX_XPU_NUM = 16;
static void* l3ptrs[MAX_XPU_NUM] = {nullptr};
int l3_size = 13.5 * 1024 * 1024;
if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) {
l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE"));
}
auto selected_xpus = GetXPUSelectedDevices();
for (unsigned int i = 0; i < selected_xpus.size(); i++) {
if (place.device == selected_xpus[i]) {
if (l3ptrs[place.device] == nullptr) {
xpu_malloc(static_cast<void**>(&l3ptrs[place.device]), l3_size,
XPU_MEM_L3);
}
if (l3ptrs[place.device] != nullptr) {
context_->_l3_mgr.set(l3ptrs[place.device], l3_size);
VLOG(3) << "xpu place " << place.device << " set l3 size " << l3_size;
}
break;
}
}
<< static_cast<int>(place.device);
}
void XPUDeviceContext::Wait() const {
platform::SetXPUDeviceId(place_.device);
xpu_wait(context_->xpu_stream);
}
Place XPUDeviceContext::GetPlace() const { return place_; }
xpu::Context* XPUDeviceContext::x_context() const { return context_; }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
......
......@@ -78,6 +78,7 @@ struct GpuDevice;
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/pten/backends/xpu/xpu_context.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -171,39 +172,12 @@ struct DefaultDeviceContextType<platform::MLUPlace>;
#ifdef PADDLE_WITH_XPU
namespace xpu = baidu::xpu::api;
class XPUDeviceContext : public DeviceContext {
class XPUDeviceContext : public pten::XPUContext {
public:
XPUDeviceContext();
explicit XPUDeviceContext(XPUPlace place);
virtual ~XPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
XPUVersion xpu_version() const { return xpu_version_; }
Place GetPlace() const override;
xpu::Context* x_context() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
#ifdef PADDLE_WITH_XPU_BKCL
/*! \brief Return bkcl context. */
BKCLContext_t bkcl_context() const { return bkcl_context_; }
/*! \brief Set bkcl context. */
void set_bkcl_context(BKCLContext_t context) { bkcl_context_ = context; }
#endif
private:
XPUPlace place_;
XPUVersion xpu_version_;
xpu::Context* context_;
#ifdef PADDLE_WITH_XPU_BKCL
BKCLContext_t bkcl_context_;
#endif
// Need to be the same with other DeviceContext,
// Eventhough eigen_device_ is not used in XPU
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
DISABLE_COPY_AND_ASSIGN(XPUDeviceContext);
};
template <>
......
......@@ -1756,27 +1756,30 @@ All parameter, weight, gradient are variables in Paddle.
.def("__repr__", string::to_string<const platform::XPUPlace &>)
.def("__str__", string::to_string<const platform::XPUPlace &>);
#ifdef PADDLE_WITH_XPU
py::enum_<platform::XPUVersion>(m, "XPUVersion", py::arithmetic())
.value("XPU1", platform::XPUVersion::XPU1)
.value("XPU2", platform::XPUVersion::XPU2)
py::enum_<pten::backends::xpu::XPUVersion>(m, "XPUVersion", py::arithmetic())
.value("XPU1", pten::backends::xpu::XPUVersion::XPU1)
.value("XPU2", pten::backends::xpu::XPUVersion::XPU2)
.export_values();
m.def("get_xpu_device_count", platform::GetXPUDeviceCount);
m.def("get_xpu_device_version",
[](int device_id) { return platform::get_xpu_version(device_id); });
m.def("get_xpu_device_op_support_types",
[](const std::string &op_name, platform::XPUVersion version) {
return platform::get_xpu_op_support_type(op_name, version);
});
m.def("get_xpu_device_op_list", [](platform::XPUVersion version) {
m.def(
"get_xpu_device_op_support_types",
[](const std::string &op_name, pten::backends::xpu::XPUVersion version) {
return platform::get_xpu_op_support_type(op_name, version);
});
m.def("get_xpu_device_op_list", [](pten::backends::xpu::XPUVersion version) {
return platform::get_xpu_op_list(version);
});
m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool {
// XPUs with Compute Capability > xpu2 support float16 and bfloat16
return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1;
return platform::get_xpu_version(place.device) >
pten::backends::xpu::XPUVersion::XPU1;
});
m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool {
// XPUs with Compute Capability > xpu2 support float16 and bfloat16
return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1;
return platform::get_xpu_version(place.device) >
pten::backends::xpu::XPUVersion::XPU1;
});
#endif
......
......@@ -2,4 +2,12 @@ add_subdirectory(dynload)
add_subdirectory(cpu)
cc_library(pten_context SRCS all_context.cc DEPS device_context)
if(WITH_XPU)
add_subdirectory(xpu)
endif()
cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context)
if(WITH_XPU)
add_dependencies(pten_context xpu_context)
endif()
......@@ -18,16 +18,11 @@
// NOTE: The paddle framework should add WITH_EIGEN option to support compile
// without eigen.
#include "paddle/pten/core/device_context.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace pten {
struct CPUContext::CPUImpl {
Eigen::DefaultDevice* device_{nullptr};
CPUContextResource res_;
CPUPlace place_;
CPUImpl() { device_ = new Eigen::DefaultDevice(); }
// Users need to manage external resources.
......@@ -36,7 +31,7 @@ struct CPUContext::CPUImpl {
}
~CPUImpl() {
if (res_.device == nullptr) {
if (res_.device == nullptr && device_ != nullptr) {
delete device_;
device_ = nullptr;
}
......@@ -56,27 +51,28 @@ struct CPUContext::CPUImpl {
}
Place GetPlace() const { return place_; }
Eigen::DefaultDevice* device_{nullptr};
CPUContextResource res_;
CPUPlace place_;
};
CPUContext::CPUContext() : DeviceContext(), cpu_impl_(nullptr) {
CPUContext::CPUContext() : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>();
}
CPUContext::CPUContext(const CPUContext& other)
: DeviceContext(), cpu_impl_(nullptr) {
CPUContext::CPUContext(const CPUContext& other) : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>();
cpu_impl_->SetEigenDevice(other.eigen_device());
}
CPUContext::CPUContext(CPUContext&& other)
: DeviceContext(), cpu_impl_(nullptr) {
CPUContext::CPUContext(CPUContext&& other) : DeviceContext() {
cpu_impl_ = std::move(other.cpu_impl_);
}
CPUContext::~CPUContext() = default;
CPUContext::CPUContext(const CPUContextResource& ctx_res)
: DeviceContext(), cpu_impl_(nullptr) {
CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>(ctx_res);
}
......
cc_library(pten_xpu_info SRCS xpu_info.cc DEPS enforce xpulib pten_place)
cc_library(xpu_context SRCS xpu_context.cc DEPS pten_device_context pten_xpu_info)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/pten/backends/xpu/xpu_header.h"
#include "xpu/bkcl.h"
#include "paddle/fluid/platform/enforce.h"
namespace pten {
namespace backends {
namespace xpu {
// Note: XPU runtime api return int, not XPUError_t.
// Maps an XPU runtime status code to a human-readable description.
// Returns a pointer to a static string literal; the caller must not free it.
// Unrecognized codes fall through to a generic "unknown error" message.
inline const char* xpuGetErrorString(int stat) {
  switch (stat) {
    case XPU_SUCCESS:
      return "Success";
    case XPUERR_INVALID_DEVICE:
      return "Invalid XPU device";
    case XPUERR_UNINIT:
      return "XPU runtime not properly inited";
    case XPUERR_NOMEM:
      return "Device memory not enough";
    case XPUERR_NOCPUMEM:
      return "CPU memory not enough";
    case XPUERR_INVALID_PARAM:
      return "Invalid parameter";
    case XPUERR_NOXPUFUNC:
      return "Cannot get XPU Func";
    case XPUERR_LDSO:
      return "Error loading dynamic library";
    case XPUERR_LDSYM:
      return "Error loading func from dynamic library";
    case XPUERR_SIMULATOR:
      return "Error from XPU Simulator";
    case XPUERR_NOSUPPORT:
      return "Operation not supported";
    case XPUERR_ABNORMAL:
      return "Device abnormal due to previous error";
    case XPUERR_KEXCEPTION:
      return "Exception in kernel execution";
    case XPUERR_TIMEOUT:
      return "Kernel execution timed out";
    case XPUERR_BUSY:
      return "Resource busy";
    case XPUERR_USEAFCLOSE:
      return "Use a stream after closed";
    case XPUERR_UCECC:
      return "Uncorrectable ECC";
    case XPUERR_OVERHEAT:
      return "Overheat";
    case XPUERR_UNEXPECT:
      return "Execution error, reach unexpected control flow";
    case XPUERR_DEVRESET:
      return "Device is being reset, try again later";
    case XPUERR_HWEXCEPTION:
      return "Hardware module exception";
    case XPUERR_HBM_INIT:
      return "Error init HBM";
    case XPUERR_DEVINIT:
      return "Error init device";
    case XPUERR_PEERRESET:
      return "Device is being reset, try again later";
    case XPUERR_MAXDEV:
      return "Device count exceed limit";
    case XPUERR_NOIOC:
      return "Unknown IOCTL command";
    case XPUERR_DMATIMEOUT:
      return "DMA timed out, a reboot maybe needed";
    case XPUERR_DMAABORT:
      return "DMA aborted due to error, possibly wrong address or hardware "
             "state";
    case XPUERR_MCUUNINIT:
      return "Firmware not initialized";
    case XPUERR_OLDFW:
      return "Firmware version too old (<15), please update.";
    case XPUERR_PCIE:
      return "Error in PCIE";
    case XPUERR_FAULT:
      return "Error copy between kernel and user space";
    case XPUERR_INTERRUPTED:
      return "Execution interrupted by user";
    default:
      // Fixed typo in the fallback message: was "unkonwn error".
      return "unknown error";
  }
}
// Maps a BKCL status code to its printable enumerator name.
// Returns a pointer to a static string literal; never null.
inline const char* bkclGetErrorString(BKCLResult_t stat) {
  if (stat == BKCL_SUCCESS) return "BKCL_SUCCESS";
  if (stat == BKCL_INVALID_ARGUMENT) return "BKCL_INVALID_ARGUMENT";
  if (stat == BKCL_RUNTIME_ERROR) return "BKCL_RUNTIME_ERROR";
  if (stat == BKCL_SYSTEM_ERROR) return "BKCL_SYSTEM_ERROR";
  if (stat == BKCL_INTERNAL_ERROR) return "BKCL_INTERNAL_ERROR";
  return "Unknown BKCL status";
}
inline const char* xdnnGetErrorString(int stat) {
switch (stat) {
case baidu::xpu::api::Error_t::SUCCESS:
return "XDNN_SUCCESS";
case baidu::xpu::api::Error_t::INVALID_PARAM:
return "XDNN_INVALID_PARAM";
case baidu::xpu::api::Error_t::RUNTIME_ERROR:
return "XDNN_RUNTIME_ERROR";
case baidu::xpu::api::Error_t::NO_ENOUGH_WORKSPACE:
return "XDNN_NO_ENOUGH_WORKSPACE";
case baidu::xpu::api::Error_t::NOT_IMPLEMENT:
return "XDNN_NOT_IMPLEMENT";
default:
return "Unknown XDNN status";
}
}
// Composes a full diagnostic of the form "XPU Error <code>, <description> ".
inline std::string build_xpu_error_msg(int stat) {
  return "XPU Error <" + std::to_string(stat) + ">, " +
         xpuGetErrorString(stat) + " ";
}
// Composes a full diagnostic of the form "BKCL Error, <status name> ".
inline std::string build_xpu_error_msg(BKCLResult_t stat) {
  return std::string("BKCL Error, ") + bkclGetErrorString(stat) + " ";
}
inline std::string build_xpu_xdnn_error_msg(int stat, std::string msg) {
return msg + " XDNN Error, " + xdnnGetErrorString(stat) + " ";
}
namespace details {
// Primary template; only the specializations generated below are usable.
// PADDLE_ENFORCE_XPU_SUCCESS uses ExternalApiType<T>::kSuccess to learn the
// "success" value for the status type returned by the checked call.
template <typename T>
struct ExternalApiType {};
// Generates an ExternalApiType specialization binding a status type to the
// value that means success for that API family.
#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \
  template <>                                         \
  struct ExternalApiType<type> {                      \
    using Type = type;                                \
    static constexpr Type kSuccess = success_value;   \
  }
DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS);            // XPU runtime APIs
DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS);  // BKCL collective APIs
#undef DEFINE_EXTERNAL_API_TYPE
}  // namespace details
#define PADDLE_ENFORCE_XPU_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
using __XPU_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::pten::backends::xpu::details::ExternalApiType< \
__XPU_STATUS_TYPE__>::kSuccess; \
if (UNLIKELY(__cond__ != __success_type__)) { \
auto __summary__ = paddle::platform::errors::External( \
::pten::backends::xpu::build_xpu_error_msg(__cond__)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
} while (0)
// Checks that COND (an XDNN kernel call) returned Error_t::SUCCESS and
// throws an External error combining the caller-provided MSG with the
// symbolic XDNN status name otherwise.
#define PADDLE_ENFORCE_XDNN_SUCCESS(COND, MSG)                             \
  do {                                                                     \
    auto __cond__ = (COND);                                                \
    if (UNLIKELY(__cond__ != baidu::xpu::api::Error_t::SUCCESS)) {         \
      auto __summary__ = paddle::platform::errors::External(               \
          ::pten::backends::xpu::build_xpu_xdnn_error_msg(__cond__, MSG)); \
      __THROW_ERROR_INTERNAL__(__summary__);                               \
    }                                                                      \
  } while (0)
} // namespace xpu
} // namespace backends
} // namespace pten
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Forward-declares.
#pragma once
// Forward declaration of xpu context.
namespace baidu {
namespace xpu {
namespace api {
// Opaque XDNN execution context; defined in the XPU SDK headers.
struct Context;
// Opaque handle to a BKCL communicator. NOTE(review): declared here as
// void* — confirm it stays in sync with the typedef in the BKCL headers.
typedef void* BKCLContext_t;
}  // namespace api
}  // namespace xpu
}  // namespace baidu
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/pten/backends/xpu/xpu_context.h"
#include <memory>
#include "paddle/pten/api/ext/exception.h"
#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
#include "xpu/xdnn.h"
namespace xpu = baidu::xpu::api;
namespace pten {
// Pimpl holding the XDNN context (possibly owned) and a non-owning BKCL
// communicator handle for XPUContext.
struct XPUContext::XPUImpl {
  // Attach (and lazily allocate) the per-device L3 cache buffer to the
  // XDNN context. Size defaults to 13.5 MB and can be overridden via the
  // XPU_PADDLE_L3_SIZE environment variable (bytes).
  void SetL3Cache() {
    const int MAX_XPU_NUM = 16;
    // One cached L3 buffer per physical device, shared across contexts.
    // NOTE(review): indexed by device id — assumes ids are < 16; confirm.
    static void* l3ptrs[MAX_XPU_NUM] = {nullptr};
    int l3_size = 13.5 * 1024 * 1024;
    if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) {
      l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE"));
    }
    auto selected_xpus = backends::xpu::GetXPUSelectedDevices();
    for (unsigned int i = 0; i < selected_xpus.size(); i++) {
      if (place_.GetDeviceId() == selected_xpus[i]) {
        if (l3ptrs[place_.GetDeviceId()] == nullptr) {
          // Allocation failure is tolerated: the nullptr check below
          // simply skips L3 and the context runs without it.
          xpu_malloc(static_cast<void**>(&l3ptrs[place_.GetDeviceId()]),
                     l3_size,
                     XPU_MEM_L3);
        }
        if (l3ptrs[place_.GetDeviceId()] != nullptr) {
          context_->_l3_mgr.set(l3ptrs[place_.GetDeviceId()], l3_size);
          VLOG(3) << "xpu place " << place_.GetDeviceId() << " set l3 size "
                  << l3_size;
        }
        break;
      }
    }
  }
  // Default: owns a freshly created XDNN context on the default place.
  // Unlike the place-taking constructor, no device guard or L3 setup here.
  XPUImpl() {
    context_ = xpu::create_context();
    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
  }
  // Owns a freshly created XDNN context bound to `place`, with L3 set up.
  explicit XPUImpl(XPUPlace place) : place_(place) {
    backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
    LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
                            << static_cast<int>(place_.device);
    context_ = xpu::create_context();
    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
    SetL3Cache();
  }
  // Users need to manage external resources.
  explicit XPUImpl(const XPUContextResource& ctx_res,
                   const XPUPlace& place = XPUPlace(0))
      : res_(ctx_res), place_(place) {
    context_ = res_.context;
    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
    SetL3Cache();
  }
  // Destroys the XDNN context only when this impl owns it (i.e. it was
  // not injected from outside via XPUContextResource or SetXContext).
  ~XPUImpl() {
    if (res_.context == nullptr && context_ != nullptr) {
      xpu::destroy_context(context_);
      context_ = nullptr;
    }
  }
  Place GetPlace() const { return place_; }
  backends::xpu::XPUVersion GetXpuVersion() const { return xpu_version_; }
  // Raw XDNN handle; throws (PD_CHECK) when the context is missing.
  xpu::Context* GetXContext() const {
    PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
    return context_;
  }
  xpu::BKCLContext_t GetBkclContext() const { return bkcl_context_; }
  // Block the host until all work queued on this context's stream is done.
  void Wait() const {
    backends::xpu::SetXPUDeviceId(place_.GetDeviceId());
    PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
    xpu_wait(context_->xpu_stream);
  }
  // Adopt an externally managed XDNN context (not owned afterwards).
  // Fix: destroy the context we currently own before adopting the new
  // one; previously the context created in the constructor leaked when
  // this setter was used (e.g. by the XPUContext copy constructor).
  void SetXContext(xpu::Context* context) {
    if (context == nullptr) {
      return;
    }
    if (res_.context == nullptr && context_ != nullptr &&
        context_ != context) {
      xpu::destroy_context(context_);
    }
    res_.context = context;
    context_ = context;
  }
  void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; }

  XPUContextResource res_;
  XPUPlace place_;
  backends::xpu::XPUVersion xpu_version_;
  xpu::Context* context_{nullptr};

  // NOTE: Distributed communicator, distributed framework manages its
  // resources, XPUContext only holds references.
  xpu::BKCLContext_t bkcl_context_{nullptr};
};
// Creates and owns a fresh XDNN context (default place, no L3 setup).
XPUContext::XPUContext() : DeviceContext() {
  impl_ = std::make_unique<XPUImpl>();
}
// Creates and owns an XDNN context bound to `place`, with L3 cache set up.
XPUContext::XPUContext(const XPUPlace& place) {
  impl_ = std::make_unique<XPUImpl>(place);
}
// Shares `other`'s XDNN/BKCL contexts without taking ownership.
// NOTE(review): the impl created here first allocates its own XDNN
// context, then SetXContext swaps in other's — verify that the initially
// created context is released and not leaked.
XPUContext::XPUContext(const XPUContext& other) : DeviceContext() {
  impl_ = std::make_unique<XPUImpl>();
  impl_->SetXContext(other.x_context());
  impl_->SetBkclContext(other.bkcl_context());
}
// Steals `other`'s impl; `other` has no usable impl afterwards.
// NOTE(review): the DeviceContext base is default-constructed rather than
// moved — confirm base state (allocators) does not need to transfer.
XPUContext::XPUContext(XPUContext&& other) : DeviceContext() {
  impl_ = std::move(other.impl_);
}
XPUContext::~XPUContext() = default;
// External-resource mode: the caller owns the XDNN context in `ctx_res`.
XPUContext::XPUContext(const XPUContextResource& ctx_res) : DeviceContext() {
  impl_ = std::make_unique<XPUImpl>(ctx_res);
}
Place XPUContext::GetPlace() const { return impl_->GetPlace(); }
backends::xpu::XPUVersion XPUContext::xpu_version() const {
  return impl_->GetXpuVersion();
}
xpu::Context* XPUContext::x_context() const { return impl_->GetXContext(); }
xpu::BKCLContext_t XPUContext::bkcl_context() const {
  return impl_->GetBkclContext();
}
// Blocks the host until all queued work on this context's stream finishes.
void XPUContext::Wait() const { impl_->Wait(); }
void XPUContext::set_x_context(xpu::Context* context) {
  impl_->SetXContext(context);
}
void XPUContext::set_bkcl_context(xpu::BKCLContext_t context) {
  impl_->SetBkclContext(context);
}
} // namespace pten
......@@ -14,13 +14,60 @@ limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_XPU
#include <memory>
#include "paddle/pten/backends/xpu/forwards.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/core/device_context.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/backends/xpu/xpu_header.h"
#include "paddle/pten/backends/xpu/xpu_info.h"
namespace xpu = baidu::xpu::api;
namespace pten {
using XPUContext = paddle::platform::XPUDeviceContext;
} // namespace pten
#endif // PADDLE_WITH_XPU
// Externally owned resources an XPUContext can be constructed around
// (inference scenario); the context pointer is not owned by XPUContext.
struct XPUContextResource {
  xpu::Context* context{nullptr};
};
// Device context for Baidu Kunlun XPU devices: wraps an XDNN context and,
// optionally, a BKCL communicator handle.
class XPUContext : public DeviceContext {
 public:
  // NOTE: DeviceContext hold resources. Used in training scenarios.
  XPUContext();

  explicit XPUContext(const XPUPlace&);

  // NOTE: Share the same underlying resources, please ensure that resources are
  // not released.
  XPUContext(const XPUContext&);

  XPUContext(XPUContext&&);

  virtual ~XPUContext();

  // Place (device) this context is bound to.
  Place GetPlace() const override;

  // Hardware generation (XPU1 / XPU2) of the bound device.
  backends::xpu::XPUVersion xpu_version() const;

  // Raw XDNN context handle.
  xpu::Context* x_context() const;

  // Return bkcl context.
  xpu::BKCLContext_t bkcl_context() const;

  // Wait for all operations completion in the stream.
  void Wait() const override;

 public:
  // NOTE: External users manage resources. Used in inference scenarios.
  explicit XPUContext(const XPUContextResource&);

  void set_x_context(xpu::Context*);

  void set_bkcl_context(xpu::BKCLContext_t context);

 private:
  struct XPUImpl;
  std::unique_ptr<XPUImpl> impl_;
};
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_XPU
#include <map>
#include <string>
#include <unordered_map>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/common/bfloat16.h"
#include "paddle/pten/common/float16.h"
#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
#include "xpu/xdnn.h"
namespace xpu = baidu::xpu::api;
// Human-readable descriptions for XDNN API error codes.
// NOTE(review): `static` at namespace scope in a header gives every
// translation unit its own copy of this map — consider `inline` (C++17)
// or moving the definition to a .cc file; confirm the project's standard.
static std::map<int, std::string> XPUAPIErrorMsg = {
    {xpu::Error_t::SUCCESS, "xpu api success"},
    {xpu::Error_t::INVALID_PARAM, "xpu api invalid param"},
    {xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"},
    {xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}};
// Maps a framework element type to the type the XPU SDK kernels expect.
// The primary template passes the type through unchanged; the
// specializations translate pten's half-precision wrappers into the XPU
// SDK's own float16 / bfloat16 types.
template <typename T>
struct XPUTypeTrait {
  using Type = T;
};

template <>
struct XPUTypeTrait<pten::dtype::float16> {
  using Type = float16;
};

template <>
struct XPUTypeTrait<pten::dtype::bfloat16> {
  using Type = bfloat16;
};
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/xpu/xpu_info.h"
#include <algorithm>
#include <cstdlib>
#include <string>
#include "paddle/pten/backends/xpu/enforce_xpu.h"
#include "paddle/pten/backends/xpu/xpu_context.h"
#include "paddle/pten/backends/xpu/xpu_header.h"
#include "paddle/pten/common/place.h"
// TODO(wilber): The pten computing library requires a component to manage
// flags.
#include "paddle/fluid/platform/flags.h"
// Comma-separated list of XPU device ids this process may use; empty means
// all visible devices.
// Fix: the adjacent string literals below concatenate with no separator —
// the help text previously rendered "communicationbetween" and
// "useshare-memory"; trailing spaces added.
PADDLE_DEFINE_EXPORTED_string(
    selected_xpus,
    "",
    "A list of device ids separated by comma, like: 0,1,2,3. "
    "This option is useful when doing multi process training and "
    "each process have only one device (XPU). If you want to use "
    "all visible devices, set this to empty string. NOTE: the "
    "reason of doing this is that we want to use P2P communication "
    "between XPU devices, use XPU_VISIBLE_DEVICES can only use "
    "share-memory only.");
namespace pten {
class XPUContext;
namespace backends {
namespace xpu {
/**************************** Version Management **************************/
//! Get the version of XPU Driver
int GetDriverVersion() {
  uint32_t major = 0;
  uint32_t minor = 0;
  PADDLE_ENFORCE_XPU_SUCCESS(xpu_get_driver_version(&major, &minor));
  // Encode "major.minor" as a single integer, e.g. 2.1 -> 21.
  return static_cast<int>(major * 10 + minor);
}
//! Get the version of XPU Runtime
int GetRuntimeVersion() {
  uint32_t major = 0;
  uint32_t minor = 0;
  PADDLE_ENFORCE_XPU_SUCCESS(xpu_get_runtime_version(&major, &minor));
  // Encode "major.minor" as a single integer, e.g. 2.1 -> 21.
  return static_cast<int>(major * 10 + minor);
}
/**************************** Device Management **************************/
// Probe the number of visible XPU devices, honoring XPU_VISIBLE_DEVICES:
// a value consisting only of blanks means "no devices".
static int GetDeviceCountImpl() {
  if (const char* visible = std::getenv("XPU_VISIBLE_DEVICES")) {
    std::string value(visible);
    const bool all_blank = std::all_of(
        value.begin(), value.end(), [](char ch) { return ch == ' '; });
    if (all_blank) {
      VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected.";
      return 0;
    }
  }
  int count = 0;
  PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_count(&count));
  return count;
}
int GetXPUDeviceCount() {
  // Probe once and cache: the device count is fixed for the process.
  static const int device_count = GetDeviceCountImpl();
  return device_count;
}
int GetXPUCurrentDeviceId() {
  int dev_id = 0;
  PADDLE_ENFORCE_XPU_SUCCESS(xpu_current_device(&dev_id));
  // Simulator devices report ids offset by 64; map back to the real id.
  return dev_id >= 64 ? dev_id - 64 : dev_id;
}
// Makes `id` the current XPU device for subsequent runtime calls.
// Throws InvalidArgument when `id` is not below the visible device count.
void SetXPUDeviceId(int id) {
  PADDLE_ENFORCE_LT(
      id,
      GetXPUDeviceCount(),
      paddle::platform::errors::InvalidArgument("id must less than XPU count"));
  PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id));
}
// Split `original` on `separator`, dropping empty tokens.
static inline std::vector<std::string> Split(std::string const& original,
                                             char separator) {
  std::vector<std::string> pieces;
  std::string::size_type begin = 0;
  while (begin <= original.size()) {
    std::string::size_type end = original.find(separator, begin);
    if (end == std::string::npos) {
      end = original.size();
    }
    if (end > begin) {
      pieces.push_back(original.substr(begin, end - begin));
    }
    begin = end + 1;
  }
  return pieces;
}
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetXPUSelectedDevices() {
  // use user specified XPUs in single-node multi-process mode.
  std::vector<int> devices;
  if (FLAGS_selected_xpus.empty()) {
    // No explicit selection: use every visible device.
    const int count = GetXPUDeviceCount();
    for (int dev = 0; dev < count; ++dev) {
      devices.push_back(dev);
    }
  } else {
    for (const auto& token : Split(FLAGS_selected_xpus, ',')) {
      devices.push_back(atoi(token.c_str()));
    }
  }
  return devices;
}
/**************************** Memory Management **************************/
// Synchronously copies `count` bytes from host memory `src` to device
// memory `dst` located on `dst_place`.
void MemcpySyncH2D(void* dst,
                   const void* src,
                   size_t count,
                   const pten::XPUPlace& dst_place) {
  // Make sure the runtime call targets the destination device.
  XPUDeviceGuard guard(dst_place.device);
  PADDLE_ENFORCE_XPU_SUCCESS(
      xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE));
}
// Synchronously copies `count` bytes from device memory `src` on
// `src_place` back to host memory `dst`.
void MemcpySyncD2H(void* dst,
                   const void* src,
                   size_t count,
                   const pten::XPUPlace& src_place,
                   const pten::XPUContext& dev_ctx) {
  XPUDeviceGuard guard(src_place.GetDeviceId());
  // Drain the context's stream so the source data is complete before
  // reading it back.
  dev_ctx.Wait();
  PADDLE_ENFORCE_XPU_SUCCESS(
      xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST));
}
// if src.device == dst.device and you need sync , after call this function,
// need to call xpu_wait()
void MemcpySyncD2D(void* dst,
                   const pten::XPUPlace& dst_place,
                   const void* src,
                   const pten::XPUPlace& src_place,
                   size_t count,
                   const pten::XPUContext& dev_ctx) {
  int dev_id = GetXPUCurrentDeviceId();
  if (dst_place.device == dev_id && src_place.device == dev_id) {
    // Same-device copy is issued via XDNN on the context's stream; the
    // caller must xpu_wait() if synchronization is needed (see note above).
    PADDLE_ENFORCE_XDNN_SUCCESS(
        baidu::xpu::api::copy(dev_ctx.x_context(),
                              static_cast<const int8_t*>(src),
                              static_cast<int8_t*>(dst),
                              count),
        "copy ");
  } else {
    // Cross-device copy goes through the runtime's peer-to-peer path.
    PADDLE_ENFORCE_XPU_SUCCESS(
        xpu_memcpy_peer(dst_place.device, dst, src_place.device, src, count));
  }
}
/**************************** Others **************************/
// Determines the hardware generation of device `dev_id` by reading its
// MODEL attribute: K100/K200 models are first-generation (XPU1),
// everything else is treated as second-generation (XPU2).
XPUVersion get_xpu_version(int dev_id) {
  uint64_t v = 0;
  PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id));
  if (v == K100 || v == K200) {
    VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n";
    return XPU1;
  } else {
    VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n";
    return XPU2;
  }
}
} // namespace xpu
} // namespace backends
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/pten/common/place.h"
namespace pten {
class XPUContext;
namespace backends {
namespace xpu {
/***** Version Management *****/
//! Get the version of XPU Driver
int GetDriverVersion();
//! Get the version of XPU Runtime
int GetRuntimeVersion();
/***** Device Management *****/
//! Get the total number of XPU devices in system.
int GetXPUDeviceCount();
//! Set the XPU device id for next execution.
void SetXPUDeviceId(int device_id);
//! Get the current XPU device id in system.
int GetXPUCurrentDeviceId();
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetXPUSelectedDevices();
/***** Memory Management *****/
//! Copy memory from address src to dst synchronously.
void MemcpySyncH2D(void *dst,
                   const void *src,
                   size_t count,
                   const pten::XPUPlace &dst_place);
//! Copy device memory to host synchronously (waits on dev_ctx first).
void MemcpySyncD2H(void *dst,
                   const void *src,
                   size_t count,
                   const pten::XPUPlace &src_place,
                   const pten::XPUContext &dev_ctx);
//! Copy between XPU device buffers; a same-device copy is issued on
//! dev_ctx's stream and may require an explicit xpu_wait() to synchronize.
void MemcpySyncD2D(void *dst,
                   const pten::XPUPlace &dst_place,
                   const void *src,
                   const pten::XPUPlace &src_place,
                   size_t count,
                   const pten::XPUContext &dev_ctx);
// RAII guard that switches the current XPU device to `dev_id` and restores
// the previous device when leaving scope. A no-op if `dev_id` is already
// current. Non-copyable.
class XPUDeviceGuard {
 public:
  explicit XPUDeviceGuard(int dev_id) {
    const int current = GetXPUCurrentDeviceId();
    if (current != dev_id) {
      prev_id_ = current;
      SetXPUDeviceId(dev_id);
    }
  }

  ~XPUDeviceGuard() {
    // prev_id_ stays -1 when no switch happened in the constructor.
    if (prev_id_ != -1) {
      SetXPUDeviceId(prev_id_);
    }
  }

  XPUDeviceGuard(const XPUDeviceGuard& o) = delete;
  XPUDeviceGuard& operator=(const XPUDeviceGuard& o) = delete;

 private:
  int prev_id_{-1};
};
// Hardware generations of Kunlun XPU devices.
enum XPUVersion { XPU1, XPU2 };
// Query which generation the device `dev_id` belongs to.
XPUVersion get_xpu_version(int dev_id);
} // namespace xpu
} // namespace backends
} // namespace pten
......@@ -13,28 +13,45 @@
// limitations under the License.
#include "paddle/pten/core/device_context.h"
#include "paddle/pten/api/ext/exception.h"
namespace pten {
// Pimpl holding the allocators a DeviceContext exposes. All pointers are
// non-owning; the allocators are managed by whoever installed them.
struct DeviceContext::Impl {
  Allocator* allocator_{nullptr};

  Impl() = default;
  ~Impl() = default;

  void SetAllocator(Allocator* allocator) { allocator_ = allocator; }

  void SetDeviceAllocator(Allocator* allocator) {
    device_allocator_ = allocator;
  }

  void SetHostAllocator(Allocator* allocator) { host_allocator_ = allocator; }

  const Allocator& GetDeviceAllocator() const {
    PD_CHECK(device_allocator_ != nullptr, "the device_allocator is nullptr.");
    return *device_allocator_;
  }

  // Fix: check before dereferencing, for consistency with the host/device
  // getters; previously a missing allocator was dereferenced unchecked.
  const Allocator& GetAllocator() const {
    PD_CHECK(allocator_ != nullptr, "the allocator is nullptr.");
    return *allocator_;
  }

  const Allocator& GetHostAllocator() const {
    PD_CHECK(host_allocator_ != nullptr, "the host_allocator is nullptr.");
    return *host_allocator_;
  }

  // TODO(Wilber): Add impl. It seems that tensorbase not have interface to
  // communicate with allocator.
  void Alloc(TensorBase* tensor) {}
  void HostAlloc(TensorBase* tensor) {}
  void DeviceAlloc(TensorBase* tensor) {}

  Allocator* device_allocator_{nullptr};
  Allocator* host_allocator_{nullptr};
};
// Default construction: create the pimpl with no allocators installed.
DeviceContext::DeviceContext() : impl_(std::make_unique<Impl>()) {}
// Copy construction: share `other`'s allocator pointers (non-owning).
DeviceContext::DeviceContext(const DeviceContext& other) {
  // Fix: impl_ must be created before it is dereferenced — the previous
  // version called SetAllocator through a default-initialized (null)
  // unique_ptr, which is undefined behavior.
  impl_ = std::make_unique<Impl>();
  impl_->SetAllocator(const_cast<Allocator*>(&other.GetAllocator()));
  impl_->SetDeviceAllocator(
      const_cast<Allocator*>(&other.GetDeviceAllocator()));
  impl_->SetHostAllocator(const_cast<Allocator*>(&other.GetHostAllocator()));
}
DeviceContext::DeviceContext(DeviceContext&& other) {
......@@ -43,14 +60,26 @@ DeviceContext::DeviceContext(DeviceContext&& other) {
DeviceContext::~DeviceContext() = default;
void DeviceContext::SetAllocator(Allocator* allocator) {
impl_->SetAllocator(allocator);
// Installs the host-memory allocator (non-owning pointer).
void DeviceContext::SetHostAllocator(Allocator* allocator) {
  impl_->SetHostAllocator(allocator);
}

// Installs the device-memory allocator (non-owning pointer).
void DeviceContext::SetDeviceAllocator(Allocator* allocator) {
  impl_->SetDeviceAllocator(allocator);
}

// Returns the host allocator; PD_CHECK fails if none was installed.
const Allocator& DeviceContext::GetHostAllocator() const {
  return impl_->GetHostAllocator();
}
const Allocator& DeviceContext::GetAllocator() const {
return impl_->GetAllocator();
// Returns the device allocator; PD_CHECK fails if none was installed.
const Allocator& DeviceContext::GetDeviceAllocator() const {
  return impl_->GetDeviceAllocator();
}

// Allocation entry points; currently no-ops (see the TODO in Impl).
void DeviceContext::Alloc(TensorBase* tensor) { impl_->Alloc(tensor); }
void DeviceContext::HostAlloc(TensorBase* tensor) { impl_->HostAlloc(tensor); }
void DeviceContext::DeviceAlloc(TensorBase* tensor) {
  impl_->DeviceAlloc(tensor);
}
} // namespace pten
......@@ -57,19 +57,38 @@ class DeviceContext {
*
* @param allocator
*/
void SetAllocator(Allocator*);
void SetDeviceAllocator(Allocator*);
/**
* @brief Get the const Allocator object.
* @brief Get the const device-related Allocator object.
*
* @return Allocator
*/
const Allocator& GetAllocator() const;
const Allocator& GetDeviceAllocator() const;
/**
* @brief Allocate memory for tensor.
* @brief Allocate device memory for tensor.
*/
void Alloc(pten::TensorBase*);
void DeviceAlloc(pten::TensorBase*);
/**
* @brief Set the host Allocator object.
*
* @param allocator
*/
void SetHostAllocator(Allocator*);
/**
* @brief Get the const host Allocator object.
*
* @return Allocator
*/
const Allocator& GetHostAllocator() const;
/**
* @brief Allocate host memory for tensor.
*/
void HostAlloc(pten::TensorBase*);
// TODO(wilber): Just for the convenience of migrating the code, it will be
// modified or removed later.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册