Unverified commit 138ecf24, authored by QingshuChen and committed by GitHub

support Baidu Kunlun AI Accelerator (#25959)

* support Baidu AI Accelerator
  * test=kunlun

* minor
  * test=kunlun

* support xpu op in separate file
  * test=kunlun

* update XPU error message and remove duplicated code
  * test=kunlun

* minor
  * test=kunlun

* minor
  * test=kunlun
Parent 17bcaef4
......@@ -28,7 +28,10 @@ include(generic) # simplify cmake module
# TODO(Shibo Tao): remove find_package(CUDA) completely.
find_package(CUDA QUIET)
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN" OFF)
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "WITH_GPU and WITH_XPU cannot be enabled at the same time")
endif()
# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
......
......@@ -63,6 +63,11 @@ if(WITH_BOX_PS)
add_definitions(-DPADDLE_WITH_BOX_PS)
endif()
if(WITH_XPU)
message(STATUS "Compile with XPU!")
add_definitions(-DPADDLE_WITH_XPU)
endif()
if(WITH_GPU)
add_definitions(-DPADDLE_WITH_CUDA)
add_definitions(-DEIGEN_USE_GPU)
......
if (NOT WITH_XPU)
return()
endif()
INCLUDE(ExternalProject)
SET(XPU_PROJECT "extern_xpu")
SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE)
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/api/include")
SET(XPU_RUNTIME_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/runtime/include")
SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
SET(XPU_API_LIB_NAME "libxpuapi.so")
SET(XPU_RT_LIB_NAME "libxpurt.so")
SET(XPU_SIM_LIB_NAME "libxpusim.so")
SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
SET(XPU_SIM_LIB "${XPU_LIB_DIR}/${XPU_SIM_LIB_NAME}")
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
INCLUDE_DIRECTORIES(${XPU_RUNTIME_INC_DIR})
FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(XPU)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY xpu/api xpu/runtime xpu/lib \n"
" DESTINATION ${XPU_INSTALL_DIR})\n")
ExternalProject_Add(
${XPU_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${XPU_SOURCE_DIR}
DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz
&& tar xvf xpu.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
)
ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL)
set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
# generate a static dummy target to track xpulib dependencies
# for cc_library(xxx SRCS xxx.c DEPS xpulib)
generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake")
TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_SIM_LIB})
ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
......@@ -8,6 +8,7 @@ function(op_library TARGET)
set(hip_cu_srcs)
set(miopen_hip_cc_srcs)
set(cu_cc_srcs)
set(xpu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(cudnn_cu_srcs)
set(CUDNN_FILE)
......@@ -60,6 +61,12 @@ function(op_library TARGET)
list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
endif()
endif()
if(WITH_XPU)
string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc)
list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.hip.cu$")
......@@ -76,6 +83,8 @@ function(op_library TARGET)
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$")
list(APPEND xpu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
......@@ -109,7 +118,7 @@ function(op_library TARGET)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
......@@ -150,10 +159,11 @@ function(op_library TARGET)
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
......@@ -179,6 +189,9 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
endif()
if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
......@@ -228,6 +241,7 @@ function(register_operators)
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" OPS "${OPS}")
string(REPLACE "_xpu" "" OPS "${OPS}")
string(REPLACE ".cc" "" OPS "${OPS}")
list(REMOVE_DUPLICATES OPS)
list(LENGTH register_operators_DEPS register_operators_DEPS_len)
......
......@@ -250,6 +250,11 @@ if(WITH_GPU)
file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
endif(WITH_GPU)
if(WITH_XPU)
include(external/xpu) # download, build, install xpu
list(APPEND third_party_deps extern_xpu)
endif(WITH_XPU)
if(WITH_PSLIB)
include(external/pslib) # download, build, install pslib
list(APPEND third_party_deps extern_pslib)
......
......@@ -70,6 +70,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
return ctx;
}
inline ::DLContext operator()(const platform::XPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA
::DLContext ctx;
......
......@@ -444,8 +444,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
int64_t max_memory_size = GetEagerDeletionThreshold();
std::unique_ptr<GarbageCollector> gc;
if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
#ifdef PADDLE_WITH_CUDA
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
......@@ -453,13 +453,22 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
gc.reset(new DefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
}
} else if (platform::is_cpu_place(place_)) {
#else
PADDLE_THROW(
platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle"));
#endif
} else if (platform::is_cpu_place(place_)) {
gc.reset(new CPUGarbageCollector(
BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size));
#ifdef PADDLE_WITH_CUDA
}
} else if (platform::is_xpu_place(place_)) {
#ifdef PADDLE_WITH_XPU
gc.reset(new XPUGarbageCollector(
BOOST_GET_CONST(platform::XPUPlace, place_), max_memory_size));
#else
PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
}
}
for (int64_t i = start_op_index; i < end_op_index; ++i) {
......
......@@ -50,6 +50,15 @@ void CPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
callback();
}
#ifdef PADDLE_WITH_XPU
XPUGarbageCollector::XPUGarbageCollector(const platform::XPUPlace &place,
size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void XPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
callback();
}
#endif
#ifdef PADDLE_WITH_CUDA
UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
const platform::CUDAPlace &place, size_t max_memory_size)
......
......@@ -59,6 +59,16 @@ class CPUGarbageCollector : public GarbageCollector {
void ClearCallback(const std::function<void()> &callback) override;
};
#ifdef PADDLE_WITH_XPU
class XPUGarbageCollector : public GarbageCollector {
public:
XPUGarbageCollector(const platform::XPUPlace &place, size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
#endif
#ifdef PADDLE_WITH_CUDA
class UnsafeFastGPUGarbageCollector : public GarbageCollector {
public:
......
......@@ -59,6 +59,8 @@ inline LibraryType StringToLibraryType(const char* ctype) {
// CPU, CUDA, PLAIN are same library type.
} else if (s == std::string("CPU")) {
return LibraryType::kPlain;
} else if (s == std::string("XPU")) {
return LibraryType::kPlain;
} else if (s == std::string("CUDA")) {
return LibraryType::kPlain;
} else {
......
......@@ -268,6 +268,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
customized_name, \
customized_type_value, \
......@@ -298,6 +301,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_XPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, XPU, ::paddle::platform::XPUPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
/**
* Macro to mark what Operator and Kernel
* we will use and tell the compiler to
......
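For illustration, a minimal sketch of how the new REGISTER_OP_XPU_KERNEL macro is meant to be used together with the xpu/*_xpu_op.cc naming convention added in cmake/operators.cmake above. The relu op and ReluXPUKernel below are hypothetical placeholders, not part of this commit:

// Hypothetical file: paddle/fluid/operators/xpu/relu_xpu_op.cc (sketch only)
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReluXPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Obtain the XPU device context and call the corresponding xpu::* API here.
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    (void)dev_ctx;
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
    relu, ops::ReluXPUKernel<paddle::platform::XPUDeviceContext, float>);
#endif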
......@@ -34,6 +34,9 @@ limitations under the License. */
#include "paddle/fluid/framework/unused_var_check.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
......@@ -165,6 +168,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::SetDeviceId(dev_id);
#endif
} else if (platform::is_xpu_place(place)) {
#ifndef PADDLE_WITH_XPU
PADDLE_THROW(platform::errors::Unimplemented(
"Cannot run operator on place %s", place));
#else
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
platform::SetXPUDeviceId(dev_id);
#endif
}
......@@ -1109,6 +1120,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_XPU
if (kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing XPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", type_,
......
......@@ -449,6 +449,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const BuildStrategy &build_strategy,
ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places, scope)) {
PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]),
platform::errors::Unavailable(
"XPU is not supported in ParallelExecutor"));
ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
member_->places_.size());
member_->use_cuda_ = exec_strategy.use_cuda_;
......
......@@ -54,18 +54,43 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
} else if (platform::is_cpu_place(src_place) &&
platform::is_xpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else if (platform::is_xpu_place(src_place) &&
platform::is_xpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_CUDA
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
size);
} else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
}
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place);
auto ctx_place = ctx.GetPlace();
......@@ -75,8 +100,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_gpu_place(dst_place)) {
auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place);
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
auto ctx_place = ctx.GetPlace();
......@@ -86,8 +112,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
}
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
auto dst_cuda_pinned_place =
BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place);
......@@ -108,8 +135,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
} else if (platform::is_cuda_pinned_place(src_place) &&
platform::is_gpu_place(dst_place)) {
}
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_gpu_place(dst_place)) {
auto src_cuda_pinned_place =
BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place);
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
......@@ -130,8 +158,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size,
stream);
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
}
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_gpu_place(dst_place)) {
auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
auto ctx_place = ctx.GetPlace();
......@@ -154,7 +183,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place.");
}
}
} else {
}
else { // NOLINT
PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place);
}
#endif
......@@ -200,44 +230,74 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
} else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
} else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_CUDA
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
size);
} else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
}
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CUDAPlace, src_place), src_ptr, size,
nullptr);
} else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
}
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place);
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_gpu_place(dst_place)) {
auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place);
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
}
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_gpu_place(dst_place)) {
auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cuda_pinned_place(src_place) &&
platform::is_gpu_place(dst_place)) {
}
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_gpu_place(dst_place)) {
auto src_pinned_place =
BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place);
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
nullptr);
} else {
}
else { // NOLINT
PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place);
}
#endif
......@@ -276,6 +336,19 @@ class AnyVisitor : public boost::static_visitor<bool> {
const framework::Tensor& tensor_;
Predicate predicate_;
bool GetResultHelper(const framework::Tensor& out,
const platform::Place& place) const {
platform::CPUPlace cpu;
framework::Tensor tmp;
tmp.Resize({1});
tmp.mutable_data<bool>(cpu);
auto ctx = platform::DeviceContextPool::Instance().Get(place);
ctx->Wait();
TensorCopy(out, cpu, *ctx, &tmp);
ctx->Wait();
return GetResult(tmp, cpu);
}
public:
AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
: tensor_(tensor), predicate_(std::move(predicate)) {}
......@@ -290,17 +363,14 @@ class AnyVisitor : public boost::static_visitor<bool> {
return this->GetResult(out, place);
}
bool GetResult(const framework::Tensor& out,
const platform::XPUPlace& xpu) const {
return GetResultHelper(out, xpu);
}
bool GetResult(const framework::Tensor& out,
const platform::CUDAPlace& gpu) const {
platform::CPUPlace cpu;
framework::Tensor tmp;
tmp.Resize({1});
tmp.mutable_data<bool>(cpu);
auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
gpuctx->Wait();
TensorCopy(out, cpu, *gpuctx, &tmp);
gpuctx->Wait();
return GetResult(tmp, cpu);
return GetResultHelper(out, gpu);
}
bool GetResult(const framework::Tensor& out,
......@@ -418,6 +488,10 @@ struct BothFalseVisitor : public boost::static_visitor<> {
VisitorImpl(place);
}
void VisitorImpl(const platform::XPUPlace& xpu) const {
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
void VisitorImpl(const platform::CUDAPlace& gpu) const {
#ifdef PADDLE_WITH_CUDA
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
......@@ -498,6 +572,28 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported when not compiled with CUDA"));
#endif
} else if (platform::is_xpu_place(tensor.place())) {
#ifdef PADDLE_WITH_XPU
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& xpu_dev_ctx =
static_cast<const platform::XPUDeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
BOOST_GET_CONST(platform::XPUPlace, tensor.place()),
reinterpret_cast<const void*>(data), size_to_write);
xpu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
#endif
} else {
os.write(static_cast<const char*>(data_ptr),
......@@ -552,8 +648,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
void* buf;
auto ctx = platform::CPUDeviceContext();
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace())) {
#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(shape));
framework::VisitDataType(
......@@ -563,8 +660,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported when not compiled with CUDA"));
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported when not compiled with CUDA"));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
}
#endif
} else {
framework::VisitDataType(
......@@ -603,8 +705,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
void* buf;
auto ctx = platform::CPUDeviceContext();
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace())) {
#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(dims));
framework::VisitDataType(
......@@ -614,8 +717,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported when not compiled with CUDA"));
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported when not compiled with CUDA"));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
}
#endif
} else {
framework::VisitDataType(
......@@ -700,6 +808,9 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) {
reinterpret_cast<const platform::CUDADeviceContext&>(*ctx).stream());
}
#endif
#ifdef PADDLE_WITH_XPU
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
#endif
}
template <typename T>
......
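As a usage note, a minimal sketch (illustrative only, not part of the diff above) of round-tripping a tensor through the new XPU branches of TensorCopySync; it assumes a build with PADDLE_WITH_XPU and that Kunlun device 0 is available:

// Hedged sketch: CPU -> XPU -> CPU copy via TensorCopySync.
#include "paddle/fluid/framework/tensor_util.h"

void XpuCopyRoundTrip() {
  paddle::platform::CPUPlace cpu;
  paddle::platform::XPUPlace xpu(0);

  paddle::framework::Tensor cpu_src, xpu_dst, cpu_back;
  cpu_src.Resize({8});
  float* p = cpu_src.mutable_data<float>(cpu);
  for (int i = 0; i < 8; ++i) p[i] = static_cast<float>(i);

  // Host -> device and device -> host go through the XPU cases added above.
  paddle::framework::TensorCopySync(cpu_src, xpu, &xpu_dst);
  paddle::framework::TensorCopySync(xpu_dst, cpu, &cpu_back);
}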
......@@ -76,6 +76,13 @@ class TensorAddFunctor : public boost::static_visitor<> {
blas.AXPY(numel_, 1., x_, y_);
}
void operator()(const platform::XPUPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#ifdef PADDLE_WITH_CUDA
void operator()(const platform::CUDAPlace& place) {
platform::CUDADeviceContext* ctx =
......
......@@ -100,6 +100,13 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU
if (kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key.place_)) {
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
// TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound(
......
......@@ -23,6 +23,8 @@ cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
if (WITH_GPU)
set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator)
elseif(WITH_XPU)
set(AllocatorFacadeDeps xpu_info)
else ()
set(AllocatorFacadeDeps)
endif()
......
......@@ -39,6 +39,9 @@
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
DEFINE_int64(
gpu_allocator_retry_time, 10000,
......@@ -62,6 +65,11 @@ class AllocatorFacadePrivate {
switch (strategy) {
case AllocatorStrategy::kNaiveBestFit: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
......@@ -74,6 +82,11 @@ class AllocatorFacadePrivate {
case AllocatorStrategy::kAutoGrowth: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
......@@ -86,6 +99,11 @@ class AllocatorFacadePrivate {
case AllocatorStrategy::kThreadLocal: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
......@@ -127,6 +145,13 @@ class AllocatorFacadePrivate {
private:
void InitSystemAllocators() {
system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
int device_count = platform::GetXPUDeviceCount();
for (int i = 0; i < device_count; ++i) {
platform::XPUPlace p(i);
system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
#ifdef PADDLE_WITH_CUDA
system_allocators_[platform::CUDAPinnedPlace()] =
std::make_shared<CPUPinnedAllocator>();
......@@ -164,6 +189,12 @@ class AllocatorFacadePrivate {
}
#endif
#ifdef PADDLE_WITH_XPU
void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
class ZeroSizeAllocator : public Allocator {
public:
explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
......@@ -191,6 +222,12 @@ class AllocatorFacadePrivate {
}
places.emplace_back(platform::CUDAPinnedPlace());
#endif
#ifdef PADDLE_WITH_XPU
int device_count = platform::GetXPUDeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) {
places.emplace_back(platform::XPUPlace(dev_id));
}
#endif
for (auto& p : places) {
zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
......
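A small usage sketch (illustrative only, not part of the diff): with the XPU allocators registered above, device memory can be requested by place through the allocator facade. The entry point is assumed here to be paddle::memory::Alloc from paddle/fluid/memory/malloc.h:

// Hedged sketch, assuming PADDLE_WITH_XPU and Kunlun device 0 is present.
#include "paddle/fluid/memory/malloc.h"

void AllocOnXpu() {
  paddle::platform::XPUPlace place(0);
  auto allocation = paddle::memory::Alloc(place, 4096);  // 4 KB on card 0
  void* ptr = allocation->ptr();  // raw device pointer, usable with memory::Copy
  (void)ptr;
}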
......@@ -29,6 +29,9 @@
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_header.h"
#endif
DEFINE_bool(init_allocated_mem, false,
"It is a mistake that the values of the memory allocated by "
......@@ -101,6 +104,100 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
template <>
void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void *p = nullptr;
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
ret = xpu_set_device(place.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (FLAGS_init_allocated_mem) {
PADDLE_THROW(platform::errors::Unimplemented(
"xpu memory FLAGS_init_allocated_mem is not implemented."));
}
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
VLOG(10) << " pointer=" << p;
return p;
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
return nullptr;
#endif
}
template <>
void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
ret = xpu_set_device(place.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
xpu_free(p);
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
#endif
}
template <>
size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#ifdef PADDLE_WITH_XPU
printf("Used func return 0 for XPUPlace\n");
return 0;
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
#endif
}
#ifdef PADDLE_WITH_CUDA
class GPUBuddyAllocatorList {
private:
......
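For reference, a stripped-down sketch (illustrative only, not part of the diff) of the raw Kunlun runtime calls the Alloc/Free specializations above are built on; it assumes PADDLE_WITH_XPU and omits the device save/restore and the detailed error messages shown above:

// Hedged sketch of xpu_malloc / xpu_free usage, error handling reduced.
#include "paddle/fluid/platform/xpu_header.h"

void* RawXpuAlloc(size_t size) {
  void* p = nullptr;
  if (xpu_set_device(0) != XPU_SUCCESS) return nullptr;   // select card 0
  if (xpu_malloc(&p, size) != XPU_SUCCESS) return nullptr; // device allocation
  return p;
}

void RawXpuFree(void* p) { xpu_free(p); }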
......@@ -18,6 +18,10 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_header.h"
#endif
namespace paddle {
namespace memory {
......@@ -29,6 +33,169 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
std::memcpy(dst, src, num);
}
#ifdef PADDLE_WITH_XPU
template <>
void Copy<platform::XPUPlace, platform::CPUPlace>(platform::XPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num) {
if (num <= 0) {
VLOG(0) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != dst_place.device) {
ret = xpu_set_device(dst_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id != dst_place.device) {
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
}
template <>
void Copy<platform::CPUPlace, platform::XPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::XPUPlace src_place,
const void* src, size_t num) {
if (num <= 0) {
VLOG(0) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != src_place.device) {
ret = xpu_set_device(src_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id != src_place.device) {
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
}
template <>
void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
void* dst,
platform::XPUPlace src_place,
const void* src, size_t num) {
if (num <= 0) {
VLOG(0) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != src_place.device || dev_id != dst_place.device) {
ret = xpu_set_device(src_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
void* tmp = malloc(num);
ret = xpu_memcpy(tmp, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_set_device(dst_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_memcpy(dst, tmp, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
free(tmp);
} else {
int ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
}
#endif
#ifdef PADDLE_WITH_CUDA
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K
......
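A usage sketch (illustrative only, not part of the diff) of the host/device copy specializations added above, assuming PADDLE_WITH_XPU, Kunlun device 0, and a device buffer of at least 1 KB obtained e.g. via xpu_malloc:

// Hedged sketch: synchronous host <-> device copies via memory::Copy.
#include <vector>
#include "paddle/fluid/memory/memcpy.h"

void RawXpuCopy(void* xpu_buffer) {  // xpu_buffer: device memory, >= 1 KB
  paddle::platform::CPUPlace cpu;
  paddle::platform::XPUPlace xpu(0);
  std::vector<float> host(256, 1.0f);  // 256 * 4 bytes = 1 KB

  // Host -> device (XPU_HOST_TO_DEVICE), then device -> host.
  paddle::memory::Copy(xpu, xpu_buffer, cpu, host.data(),
                       host.size() * sizeof(float));
  paddle::memory::Copy(cpu, host.data(), xpu, xpu_buffer,
                       host.size() * sizeof(float));
}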
......@@ -83,7 +83,6 @@ Return an identity tensor whose shape is [num_rows, num_columns].
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
using float16 = paddle::platform::float16;
REGISTER_OPERATOR(
eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference,
......@@ -93,4 +92,4 @@ REGISTER_OPERATOR(
REGISTER_OP_CPU_KERNEL(eye, ops::EyeKernel<CPU, float>,
ops::EyeKernel<CPU, double>,
ops::EyeKernel<CPU, int64_t>, ops::EyeKernel<CPU, int>,
ops::EyeKernel<CPU, float16>);
ops::EyeKernel<CPU, paddle::platform::float16>);
......@@ -73,6 +73,13 @@ struct TensorSetConstantCPU {
float value_;
};
template <>
void set_constant_with_place<platform::XPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/operators/mul_op.h"
namespace paddle {
namespace operators {
using framework::OpKernelType;
using framework::Tensor;
template <typename DeviceContext, typename T>
class MulXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* x = context.Input<Tensor>("X");
const Tensor* y = context.Input<Tensor>("Y");
Tensor* z = context.Output<Tensor>("Out");
const Tensor x_matrix =
x->dims().size() > 2
? framework::ReshapeToMatrix(
*x, context.template Attr<int>("x_num_col_dims"))
: *x;
const Tensor y_matrix =
y->dims().size() > 2
? framework::ReshapeToMatrix(
*y, context.template Attr<int>("y_num_col_dims"))
: *y;
z->mutable_data<T>(context.GetPlace());
auto z_dim = z->dims();
if (z_dim.size() != 2) {
z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
bool trans_a = false;
bool trans_b = false;
int m = x_matrix.dims()[0];
int k = x_matrix.dims()[1];
int k1 = y_matrix.dims()[0];
int n = y_matrix.dims()[1];
PADDLE_ENFORCE_EQ(
k, k1, platform::errors::InvalidArgument("Shape mismatch in mul_op"));
T alpha = static_cast<T>(1.0);
T beta = static_cast<T>(0.0);
const T* data_a = x_matrix.data<T>();
const T* data_b = y_matrix.data<T>();
T* data_c = z->data<T>();
auto& dev_ctx = context.template device_context<DeviceContext>();
int ret = xpu::fc_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k,
alpha, data_a, data_b, beta, data_c);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (z_dim.size() != 2) {
z->Resize(z_dim);
}
}
};
template <typename DeviceContext, typename T>
class MulGradXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto x_matrix = x->dims().size() > 2
? framework::ReshapeToMatrix(*x, x_num_col_dims)
: static_cast<const Tensor&>(*x);
auto y_matrix = y->dims().size() > 2
? framework::ReshapeToMatrix(*y, y_num_col_dims)
: static_cast<const Tensor&>(*y);
auto* dout = ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
Tensor dout_mat;
dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0],
framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]});
auto* dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
if (dx != nullptr) {
dx->set_lod(x->lod());
}
if (dy != nullptr) {
dy->set_lod(y->lod());
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
Tensor dx_matrix = dx->dims().size() > 2
? framework::ReshapeToMatrix(*dx, x_num_col_dims)
: *dx;
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
// blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
bool trans_a = false;
bool trans_b = true;
int m = dout_mat.dims()[0];
int k = dout_mat.dims()[1];
int n = y_matrix.dims()[0];
int k1 = y_matrix.dims()[1];
PADDLE_ENFORCE_EQ(
k, k1, platform::errors::InvalidArgument("Shape mismatch in mul_op"));
int lda = (!trans_a) ? k : m;
int ldb = (!trans_b) ? n : k;
int ldc = n;
T alpha = static_cast<T>(1.0);
T beta = static_cast<T>(0.0);
const T* data_a = dout->data<T>();
const T* data_b = y_matrix.data<T>();
T* data_c = dx_matrix.data<T>();
int ret =
xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha,
data_a, lda, data_b, ldb, beta, data_c, ldc);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check "
"where Baidu Kunlun Card is properly installed.",
ret));
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
Tensor dy_matrix = dy->dims().size() > 2
? framework::ReshapeToMatrix(*dy, y_num_col_dims)
: *dy;
// dy = x' * dout. dy K x N, dout : M x N, x : M x K
// blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
bool trans_a = true;
bool trans_b = false;
int k = x_matrix.dims()[0];
int m = x_matrix.dims()[1];
int k1 = dout_mat.dims()[0];
int n = dout_mat.dims()[1];
PADDLE_ENFORCE_EQ(
k, k1, platform::errors::InvalidArgument("Shape mismatch in mul_op"));
int lda = (!trans_a) ? k : m;
int ldb = (!trans_b) ? n : k;
int ldc = n;
T alpha = static_cast<T>(1.0);
T beta = static_cast<T>(0.0);
const T* data_a = x_matrix.data<T>();
const T* data_b = dout->data<T>();
T* data_c = dy_matrix.data<T>();
int ret =
xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha,
data_a, lda, data_b, ldb, beta, data_c, ldc);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check "
"where Baidu Kunlun Card is properly installed.",
ret));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
mul, ops::MulXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
mul_grad, ops::MulGradXPUKernel<paddle::platform::XPUDeviceContext, float>)
#endif
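To make the shape handling in the kernel above concrete, a small sketch (illustrative only, not part of the diff) of how x_num_col_dims flattens an input before the GEMM call; for dims [2, 3, 4], num_col_dims = 1 yields a [2, 12] matrix and num_col_dims = 2 yields [6, 4]:

// Hedged sketch of the flattening used by ReshapeToMatrix above.
#include "paddle/fluid/framework/ddim.h"

void FlattenExample() {
  auto dims = paddle::framework::make_ddim({2, 3, 4});
  auto m1 = paddle::framework::flatten_to_2d(dims, 1);  // [2, 12]
  auto m2 = paddle::framework::flatten_to_2d(dims, 2);  // [6, 4]
  (void)m1;
  (void)m2;
}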
......@@ -4,6 +4,12 @@ if(WITH_GPU)
proto_library(cuda_error_proto SRCS cuda_error.proto)
endif(WITH_GPU)
if(WITH_XPU)
set(XPU_CTX_DEPS xpulib)
else()
set(XPU_CTX_DEPS)
endif(WITH_XPU)
if (WITH_PYTHON)
py_proto_compile(profiler_py_proto SRCS profiler.proto)
add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
......@@ -50,6 +56,10 @@ nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cu
cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
if(WITH_XPU)
cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce)
endif()
add_subdirectory(dynload)
add_subdirectory(stream)
......@@ -84,7 +94,7 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
# avoiding cycle dependencies
cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} dlpack cudnn_workspace_helper)
${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS})
cc_library(collective_helper SRCS collective_helper.cc DEPS framework_proto device_context enforce)
......
......@@ -61,7 +61,8 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
if (it == device_contexts_.end()) {
PADDLE_THROW(platform::errors::Unimplemented(
"Place %s is not supported. Please check that your paddle compiles "
"with WITH_GPU option or check that your train process hold the "
"with WITH_GPU or WITH_XPU option or check that your train process "
"hold the "
"correct gpu_id if you use Executor.",
place));
}
......@@ -115,6 +116,14 @@ DeviceContextPool::DeviceContextPool(
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported. Please re-compile with WITH_GPU "
"option."));
#endif
} else if (platform::is_xpu_place(p)) {
#ifdef PADDLE_WITH_XPU
EmplaceDeviceContext<XPUDeviceContext, XPUPlace>(&device_contexts_, p);
#else
PADDLE_THROW(
platform::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
}
}
......@@ -134,6 +143,49 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
Place CPUDeviceContext::GetPlace() const { return place_; }
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() { context_ = xpu::create_context(); }
XPUDeviceContext::~XPUDeviceContext() { xpu::destroy_context(context_); }
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_set_device(place.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
context_ = xpu::create_context();
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
void XPUDeviceContext::Wait() const {
int ret = xpu_set_device(place_.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
xpu_wait();
}
Place XPUDeviceContext::GetPlace() const { return place_; }
xpu::Context* XPUDeviceContext::x_context() const { return context_; }
#endif
#ifdef PADDLE_WITH_CUDA
class EigenCudaStreamDevice : public Eigen::StreamInterface {
......
......@@ -43,6 +43,10 @@ limitations under the License. */
#endif
#include "unsupported/Eigen/CXX11/Tensor"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_header.h"
#endif
namespace paddle {
namespace platform {
......@@ -76,6 +80,35 @@ struct DefaultDeviceContextType<platform::CPUPlace> {
using TYPE = CPUDeviceContext;
};
#ifdef PADDLE_WITH_XPU
class XPUDeviceContext : public DeviceContext {
public:
XPUDeviceContext();
explicit XPUDeviceContext(XPUPlace place);
virtual ~XPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override;
xpu::Context* x_context() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
private:
XPUPlace place_;
xpu::Context* context_;
// Needs to be the same as in other DeviceContexts,
// even though eigen_device_ is not used on XPU.
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
DISABLE_COPY_AND_ASSIGN(XPUDeviceContext);
};
template <>
struct DefaultDeviceContextType<platform::XPUPlace> {
using TYPE = XPUDeviceContext;
};
#endif
#ifdef PADDLE_WITH_CUDA
class EigenCudaStreamDevice;
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include <vector>
#include "glog/logging.h"
#include "gtest/gtest.h"
TEST(Device, Init) {
using paddle::platform::DeviceContext;
using paddle::platform::XPUDeviceContext;
using paddle::platform::XPUPlace;
int count = paddle::platform::GetXPUDeviceCount();
for (int i = 0; i < count; i++) {
XPUDeviceContext* device_context = new XPUDeviceContext(XPUPlace(i));
xpu::Context* ctx = device_context->x_context();
ASSERT_NE(nullptr, ctx);
delete device_context;
}
}
TEST(Device, DeviceContextPool) {
using paddle::platform::DeviceContextPool;
using paddle::platform::XPUDeviceContext;
using paddle::platform::Place;
using paddle::platform::CPUPlace;
using paddle::platform::XPUPlace;
DeviceContextPool& pool = DeviceContextPool::Instance();
auto cpu_dev_ctx1 = pool.Get(CPUPlace());
auto cpu_dev_ctx2 = pool.Get(CPUPlace());
ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1);
std::vector<Place> xpu_places;
int count = paddle::platform::GetXPUDeviceCount();
for (int i = 0; i < count; ++i) {
auto dev_ctx = pool.Get(XPUPlace(i));
ASSERT_NE(dev_ctx, nullptr);
}
}
......@@ -33,6 +33,11 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/piece.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_header.h"
#include "paddle/fluid/platform/xpu_info.h"
#endif
DECLARE_int32(paddle_num_threads);
DEFINE_int32(multiple_of_cupti_buffer_size, 1,
"Multiple of the CUPTI device buffer size. If the timestamps have "
......@@ -151,6 +156,14 @@ void InitDevices(bool init_p2p) {
} catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
}
#endif
#ifdef PADDLE_WITH_XPU
try {
// use user specified XPUs in single-node multi-process mode.
devices = platform::GetXPUSelectedDevices();
} catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime.";
}
#endif
InitDevices(init_p2p, devices);
}
......@@ -165,7 +178,13 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
LOG(WARNING) << "Invalid devices id.";
continue;
}
#ifdef PADDLE_WITH_CUDA
places.emplace_back(platform::CUDAPlace(devices[i]));
#endif
#ifdef PADDLE_WITH_XPU
places.emplace_back(platform::XPUPlace(devices[i]));
#endif
}
if (init_p2p) {
InitP2P(devices);
......
......@@ -20,7 +20,7 @@ TEST(InitDevices, CPU) {
using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool;
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU)
InitDevices(true);
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U);
......@@ -39,6 +39,18 @@ TEST(InitDevices, CUDA) {
#endif
}
TEST(InitDevices, XPU) {
using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool;
#ifdef PADDLE_WITH_XPU
int count = paddle::platform::GetXPUDeviceCount();
InitDevices(true);
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
#endif
}
#ifndef _WIN32
TEST(SignalHandle, SignalHandle) {
std::string msg = "Signal raises";
......
......@@ -32,6 +32,7 @@ class PlacePrinter : public boost::static_visitor<> {
void operator()(const CUDAPlace &p) {
os_ << "CUDAPlace(" << p.device << ")";
}
void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
private:
......@@ -44,6 +45,10 @@ bool is_gpu_place(const Place &p) {
return boost::apply_visitor(IsCUDAPlace(), p);
}
bool is_xpu_place(const Place &p) {
return boost::apply_visitor(IsXPUPlace(), p);
}
bool is_cpu_place(const Place &p) {
return boost::apply_visitor(IsCPUPlace(), p);
}
......@@ -60,6 +65,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
if (places_are_same_class(p1, p2)) {
if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) {
return true;
} else if (is_xpu_place(p1)) {
return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2);
} else {
return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2);
}
......
......@@ -58,31 +58,58 @@ struct CUDAPinnedPlace {
inline bool operator<(const CUDAPinnedPlace &) const { return false; }
};
// Place for Baidu Kunlun Accelerator
struct XPUPlace {
XPUPlace() : XPUPlace(0) {}
explicit XPUPlace(int d) : device(d) {}
inline int GetDeviceId() const { return device; }
// needed for variant equality comparison
inline bool operator==(const XPUPlace &o) const { return device == o.device; }
inline bool operator!=(const XPUPlace &o) const { return !(*this == o); }
inline bool operator<(const XPUPlace &o) const { return device < o.device; }
int device;
};
struct IsCUDAPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const CUDAPlace &gpu) const { return true; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
struct IsCPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &cpu) const { return true; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
};
class Place : public boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace> {
struct IsXPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &xpu) const { return true; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
class Place
: public boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace> {
private:
using PlaceBase = boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace>;
using PlaceBase =
boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace>;
public:
Place() = default;
Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {} // NOLINT
Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {} // NOLINT
Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT
Place(const CUDAPinnedPlace &cuda_pinned_place) // NOLINT
: PlaceBase(cuda_pinned_place) {}
......@@ -98,6 +125,7 @@ class Place : public boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace> {
using PlaceList = std::vector<Place>;
bool is_gpu_place(const Place &);
bool is_xpu_place(const Place &);
bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &);
bool places_are_same_class(const Place &, const Place &);
......@@ -115,6 +143,16 @@ struct PlaceVisitorWrapper
return visitor_(cpu);
}
typename Visitor::result_type operator()(const XPUPlace &xpu) const {
#ifdef PADDLE_WITH_XPU
return visitor_(xpu);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with XPU. Cannot visit xpu device"));
return typename Visitor::result_type();
#endif
}
typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#ifdef PADDLE_WITH_CUDA
return visitor_(cuda);
......
......@@ -18,19 +18,32 @@
TEST(Place, Equality) {
paddle::platform::CPUPlace cpu;
paddle::platform::CUDAPlace g0(0), g1(1), gg0(0);
paddle::platform::XPUPlace x0(0), x1(1), xx0(0);
EXPECT_EQ(cpu, cpu);
EXPECT_EQ(g0, g0);
EXPECT_EQ(g1, g1);
EXPECT_EQ(g0, gg0);
EXPECT_EQ(x0, x0);
EXPECT_EQ(x1, x1);
EXPECT_EQ(x0, xx0);
EXPECT_NE(g0, g1);
EXPECT_NE(x0, x1);
EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0));
EXPECT_TRUE(paddle::platform::places_are_same_class(x0, xx0));
EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu));
EXPECT_FALSE(paddle::platform::places_are_same_class(x0, cpu));
EXPECT_FALSE(paddle::platform::places_are_same_class(g0, x0));
}
TEST(Place, Print) {
{
std::stringstream ss;
ss << paddle::platform::XPUPlace(1);
EXPECT_EQ("XPUPlace(1)", ss.str());
}
{
std::stringstream ss;
ss << paddle::platform::CUDAPlace(1);
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_XPU
#include "xpu/api.h"
#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
namespace xpu = baidu::xpu::api;
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/xpu_info.h"
#include <algorithm>
#include <cstdlib>
#include <string>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/xpu_header.h"
#include "paddle/fluid/string/split.h"
DEFINE_string(selected_xpus, "",
"A list of device ids separated by comma, like: 0,1,2,3. "
"This option is useful when doing multi process training and "
"each process have only one device (XPU). If you want to use "
"all visible devices, set this to empty string. NOTE: the "
"reason of doing this is that we want to use P2P communication"
"between XPU devices, use XPU_VISIBLE_DEVICES can only use"
"share-memory only.");
namespace paddle {
namespace platform {
static int GetXPUDeviceCountImpl() {
const auto *xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES");
if (xpu_visible_devices != nullptr) {
std::string xpu_visible_devices_str(xpu_visible_devices);
if (std::all_of(xpu_visible_devices_str.begin(),
xpu_visible_devices_str.end(),
[](char ch) { return ch == ' '; })) {
VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected.";
return 0;
}
}
int count = 0;
int ret = xpu_device_count(&count);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
return count;
}
int GetXPUDeviceCount() {
static auto dev_cnt = GetXPUDeviceCountImpl();
return dev_cnt;
}
int GetXPUCurrentDeviceId() {
int dev_id;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// Device ids >= 64 refer to the XPU simulator; subtract 64 to get the real device id.
dev_id -= 64;
}
return dev_id;
}
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetXPUSelectedDevices() {
// use user specified XPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_xpus.empty()) {
auto devices_str = paddle::string::Split(FLAGS_selected_xpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetXPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
void SetXPUDeviceId(int id) {
PADDLE_ENFORCE_LT(
id, GetXPUDeviceCount(),
platform::errors::InvalidArgument("id must less than XPU count"));
int ret = xpu_set_device(id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
} // namespace platform
} // namespace paddle
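Taken together, GetXPUDeviceCountImpl and GetXPUSelectedDevices mean that device visibility is controlled by the XPU_VISIBLE_DEVICES environment variable, while the per-process device choice comes from the selected_xpus gflag. A minimal usage sketch follows; the FLAGS_* environment passthrough for selected_xpus is an assumption for illustration, not something this diff shows:
import os
# Hide all but XPU 0 from this process; read via std::getenv in
# GetXPUDeviceCountImpl above. Set before paddle.fluid initializes devices.
os.environ["XPU_VISIBLE_DEVICES"] = "0"
# Assumed: selected_xpus is also reachable through the usual FLAGS_* environment
# passthrough; otherwise pass it as a gflag on the command line.
os.environ["FLAGS_selected_xpus"] = "0"
import paddle.fluid as fluid
print(fluid.is_compiled_with_xpu())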
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_XPU
#include <vector>
namespace paddle {
namespace platform {
//! Get the total number of XPU devices in system.
int GetXPUDeviceCount();
//! Get the current XPU device id in system.
int GetXPUCurrentDeviceId();
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetXPUSelectedDevices();
//! Set the XPU device id for next execution.
void SetXPUDeviceId(int device_id);
} // namespace platform
} // namespace paddle
#endif
......@@ -66,11 +66,13 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) {
return place_obj.cast<platform::CPUPlace>();
} else if (py::isinstance<platform::CUDAPlace>(place_obj)) {
return place_obj.cast<platform::CUDAPlace>();
} else if (py::isinstance<platform::XPUPlace>(place_obj)) {
return place_obj.cast<platform::XPUPlace>();
} else if (py::isinstance<platform::CUDAPinnedPlace>(place_obj)) {
return place_obj.cast<platform::CUDAPinnedPlace>();
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Place should be one of CPUPlace/CUDAPlace/CUDAPinnedPlace"));
"Place should be one of CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace"));
}
}
......@@ -92,6 +94,9 @@ static void InitTensorForVarBase(imperative::VarBase *self,
if (platform::is_cpu_place(place)) {
SetTensorFromPyArray<platform::CPUPlace>(
tensor, array, BOOST_GET_CONST(platform::CPUPlace, place), zero_copy);
} else if (platform::is_xpu_place(place)) {
SetTensorFromPyArray<platform::XPUPlace>(
tensor, array, BOOST_GET_CONST(platform::XPUPlace, place), zero_copy);
} else if (platform::is_gpu_place(place)) {
SetTensorFromPyArray<platform::CUDAPlace>(
tensor, array, BOOST_GET_CONST(platform::CUDAPlace, place), zero_copy);
......@@ -101,7 +106,7 @@ static void InitTensorForVarBase(imperative::VarBase *self,
zero_copy);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Place should be one of CPUPlace/CUDAPlace/CUDAPinnedPlace"));
"Place should be one of CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace"));
}
if (stop_gradient != -1) {
self->SetOverridedStopGradient(stop_gradient);
......@@ -588,6 +593,10 @@ void BindImperative(py::module *m_ptr) {
py::arg("value"), py::arg("place"), py::arg("persistable") = false,
py::arg("zero_copy") = false, py::arg("name") = "",
py::arg("stop_gradient") = -1)
.def("__init__", &InitVarBaseFromNumpyWithArg<platform::XPUPlace>,
py::arg("value"), py::arg("place"), py::arg("persistable") = false,
py::arg("zero_copy") = false, py::arg("name") = "",
py::arg("stop_gradient") = -1)
.def("__init__", &InitVarBaseFromNumpyWithArg<platform::CUDAPlace>,
py::arg("value"), py::arg("place"), py::arg("persistable") = false,
py::arg("zero_copy") = false, py::arg("name") = "",
......@@ -823,6 +832,10 @@ void BindImperative(py::module *m_ptr) {
const platform::CUDAPinnedPlace &place,
bool blocking) { return self.NewVarBase(place, blocking); },
py::return_value_policy::copy)
.def("_copy_to",
[](const imperative::VarBase &self, const platform::XPUPlace &place,
bool blocking) { return self.NewVarBase(place, blocking); },
py::return_value_policy::copy)
.def("_copy_to",
[](const imperative::VarBase &self, const platform::CUDAPlace &place,
bool blocking) { return self.NewVarBase(place, blocking); },
......@@ -890,6 +903,9 @@ void BindImperative(py::module *m_ptr) {
if (py::isinstance<platform::CUDAPlace>(obj)) {
auto p = obj.cast<platform::CUDAPlace *>();
self.SetExpectedPlace(*p);
} else if (py::isinstance<platform::XPUPlace>(obj)) {
auto p = obj.cast<platform::XPUPlace *>();
self.SetExpectedPlace(*p);
} else if (py::isinstance<platform::CPUPlace>(obj)) {
auto p = obj.cast<platform::CPUPlace *>();
self.SetExpectedPlace(*p);
......@@ -898,7 +914,8 @@ void BindImperative(py::module *m_ptr) {
self.SetExpectedPlace(*p);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Incompatible Place Type: supports CUDAPlace, CPUPlace, "
"Incompatible Place Type: supports XPUPlace, CUDAPlace, "
"CPUPlace, "
"and CUDAPinnedPlace, "
"but got Unknown Type!"));
}
......@@ -928,6 +945,19 @@ void BindImperative(py::module *m_ptr) {
*(imperative::AmpOperators::Instance().GetAllowOps()),
*(imperative::AmpOperators::Instance().GetBlockOps()));
})
.def("trace",
[](imperative::Tracer &self, const std::string &type,
const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
framework::AttributeMap attrs, const platform::XPUPlace &place,
bool trace_backward) {
auto ins_map = ConvertToNameVarBaseMap(ins);
auto outs_map = ConvertToNameVarBaseMap(outs);
{
py::gil_scoped_release release;
self.TraceOp(type, std::move(ins_map), std::move(outs_map),
std::move(attrs), place, trace_backward);
}
})
.def("trace",
[](imperative::Tracer &self, const std::string &type,
const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
......
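The imperative bindings above (InitVarBaseFromNumpyWithArg&lt;XPUPlace&gt;, the _copy_to overload, SetExpectedPlace for XPUPlace and the extra trace overload) are what make dygraph work on an XPUPlace. A rough sketch of the intended usage, assuming the mul XPU kernel added elsewhere in this change is available; illustrative only:
import numpy as np
import paddle.fluid as fluid
if fluid.is_compiled_with_xpu():
    # Tensors created under the guard live on XPU 0; ops are traced through
    # the XPUPlace overload of Tracer::TraceOp added above.
    with fluid.dygraph.guard(fluid.XPUPlace(0)):
        x = fluid.dygraph.to_variable(np.ones((2, 3), "float32"))
        w = fluid.dygraph.to_variable(np.ones((3, 4), "float32"))
        out = fluid.layers.mul(x, w)
        print(out.numpy())  # copied back to host for printing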
......@@ -90,6 +90,10 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/pybind/communicator_py.h"
#endif
......@@ -118,6 +122,14 @@ bool IsCompiledWithCUDA() {
#endif
}
bool IsCompiledWithXPU() {
#ifndef PADDLE_WITH_XPU
return false;
#else
return true;
#endif
}
bool IsCompiledWithMKLDNN() {
#ifndef PADDLE_WITH_MKLDNN
return false;
......@@ -471,6 +483,10 @@ PYBIND11_MODULE(core_noavx, m) {
[](Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](Tensor &self, paddle::platform::XPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<float>(place);
......@@ -483,6 +499,10 @@ PYBIND11_MODULE(core_noavx, m) {
[](Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](Tensor &self, paddle::platform::XPUPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<int>(place);
......@@ -500,6 +520,11 @@ PYBIND11_MODULE(core_noavx, m) {
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
})
.def("_mutable_data",
[](Tensor &self, paddle::platform::XPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
})
.def("_mutable_data",
[](Tensor &self, paddle::platform::CUDAPlace &place,
paddle::framework::proto::VarType::Type type) {
......@@ -513,6 +538,8 @@ PYBIND11_MODULE(core_noavx, m) {
.def("_clear", &Tensor::clear)
.def("set", SetTensorFromPyArray<paddle::platform::CPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::XPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
......@@ -522,7 +549,7 @@ PYBIND11_MODULE(core_noavx, m) {
Args:
lod (numpy.ndarray): The data to set.
place (CPUPlace|CUDAPlace|CUDAPinnedPlace): The place where the
place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace): The place where the
LoDTensor is to be set.
zero_copy (bool, optional): Whether to share memory with the input numpy array.
This parameter only works with CPUPlace. Default: False.
......@@ -1230,6 +1257,18 @@ All parameter, weight, gradient are variables in Paddle.
-> paddle::platform::DeviceContext* {
return new paddle::platform::CPUDeviceContext();
})
.def_static("create",
[](paddle::platform::XPUPlace& place)
-> paddle::platform::DeviceContext* {
#ifndef PADDLE_WITH_XPU
PADDLE_THROW(
platform::errors::PermissionDenied(
"Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support."));
#else
return new paddle::platform::XPUDeviceContext(place);
#endif
})
.def_static("create",
[](paddle::platform::CUDAPlace& place)
-> paddle::platform::DeviceContext* {
......@@ -1331,6 +1370,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
.def("_get_device_id",
......@@ -1338,6 +1378,60 @@ All parameter, weight, gradient are variables in Paddle.
#endif
.def("__str__", string::to_string<const platform::CUDAPlace &>);
py::class_<platform::XPUPlace>(m, "XPUPlace", R"DOC(
**Note**:
XPUPlace is a descriptor of a Baidu Kunlun XPU device. The argument is the XPU device id (counting from 0).
Examples:
.. code-block:: python
import paddle.fluid as fluid
xpu_place = fluid.XPUPlace(0)
)DOC")
.def("__init__",
[](platform::XPUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_XPU
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid XPUPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetXPUDeviceCount())) {
if (platform::GetXPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use XPU because there is no XPU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid XPUPlace(%d), must inside [0, %d), because XPU "
"number on your machine is %d",
dev_id, platform::GetXPUDeviceCount(),
platform::GetXPUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::XPUPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use XPU because you have installed CPU/GPU version "
"PaddlePaddle.\n"
"If you want to use XPU, please try to install XPU version "
"PaddlePaddle by: pip install paddlepaddle-xpu\n"
"If you only have CPU, please change XPUPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::XPUPlace>)
.def("_equals",
&IsSamePlace<platform::XPUPlace, platform::CUDAPinnedPlace>)
.def("__str__", string::to_string<const platform::XPUPlace &>);
py::class_<paddle::platform::CPUPlace>(m, "CPUPlace", R"DOC(
CPUPlace is a descriptor of a device.
It represents a CPU device allocated or to be allocated with Tensor or LoDTensor.
......@@ -1352,6 +1446,7 @@ All parameter, weight, gradient are variables in Paddle.
.def(py::init<>())
.def("_type", &PlaceIndex<platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
.def("_equals",
......@@ -1386,6 +1481,8 @@ All parameter, weight, gradient are variables in Paddle.
.def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::XPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
.def("_equals",
......@@ -1398,11 +1495,14 @@ All parameter, weight, gradient are variables in Paddle.
.def("_equals", &IsSamePlace<platform::Place, platform::Place>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
.def("is_gpu_place",
[](platform::Place &self) { return platform::is_gpu_place(self); })
.def("is_cpu_place",
[](platform::Place &self) { return platform::is_cpu_place(self); })
.def("is_xpu_place",
[](platform::Place &self) { return platform::is_xpu_place(self); })
.def("is_cuda_pinned_place",
[](platform::Place &self) {
return platform::is_cuda_pinned_place(self);
......@@ -1411,12 +1511,20 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self) {
return BOOST_GET_CONST(platform::CUDAPlace, self).device;
})
.def("xpu_device_id",
[](platform::Place &self) {
return BOOST_GET_CONST(platform::XPUPlace, self).device;
})
.def("set_place", [](platform::Place &self,
const platform::Place &other) { self = other; })
.def("set_place",
[](platform::Place &self, const platform::CPUPlace &cpu_place) {
self = cpu_place;
})
.def("set_place",
[](platform::Place &self, const platform::XPUPlace &xpu_place) {
self = xpu_place;
})
.def("set_place",
[](platform::Place &self, const platform::CUDAPlace &gpu_place) {
self = gpu_place;
......@@ -1444,6 +1552,9 @@ All parameter, weight, gradient are variables in Paddle.
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::CPUPlace &place) { self.Run(scope, place); })
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::XPUPlace &place) { self.Run(scope, place); })
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::CUDAPlace &place) { self.Run(scope, place); })
......@@ -1544,6 +1655,7 @@ All parameter, weight, gradient are variables in Paddle.
[](bool init_p2p) { framework::InitDevices(init_p2p); });
m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("is_compiled_with_xpu", IsCompiledWithXPU);
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
m.def("is_compiled_with_dist", IsCompiledWithDIST);
......
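On the Python side, the new bindings behave like the existing place types. A small sketch exercising XPUPlace, Place.set_place, is_xpu_place and xpu_device_id as bound above, guarded so it is a no-op on non-XPU builds:
import paddle.fluid as fluid
import paddle.fluid.core as core
if fluid.is_compiled_with_xpu():
    xpu0 = fluid.XPUPlace(0)
    print(xpu0)                            # XPUPlace(0)
    p = core.Place()
    p.set_place(xpu0)
    print(p.is_xpu_place())                # True
    print(p.xpu_device_id())               # 0
    print(xpu0._equals(fluid.CPUPlace()))  # False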
......@@ -145,8 +145,14 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
T b = static_cast<T>(0);
if (platform::is_cpu_place(self.place())) {
b = self.data<T>()[offset];
} else if (platform::is_xpu_place(self.place())) {
#ifdef PADDLE_WITH_XPU
const T *a = self.data<T>();
auto p = BOOST_GET_CONST(platform::XPUPlace, self.place());
paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T));
#endif
} else if (platform::is_gpu_place(self.place())) {
#ifdef PADDLE_WITH_CUDA
} else {
const T *a = self.data<T>();
auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place());
paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
......@@ -163,8 +169,14 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
"The offset exceeds the size of tensor."));
if (platform::is_cpu_place(self->place())) {
self->mutable_data<T>(self->place())[offset] = elem;
} else if (platform::is_xpu_place(self->place())) {
#ifdef PADDLE_WITH_XPU
auto p = BOOST_GET_CONST(platform::XPUPlace, self->place());
T *a = self->mutable_data<T>(p);
paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T));
#endif
} else if (platform::is_gpu_place(self->place())) {
#ifdef PADDLE_WITH_CUDA
} else {
auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place());
T *a = self->mutable_data<T>(p);
paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
......@@ -194,6 +206,16 @@ void SetTensorFromPyArrayT(
auto dst = self->mutable_data<T>(place);
std::memcpy(dst, array.data(), array.nbytes());
}
} else if (paddle::platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
auto dst = self->mutable_data<T>(place);
xpu_memcpy(dst, array.data(), array.nbytes(),
XPUMemcpyKind::XPU_HOST_TO_DEVICE);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else {
#ifdef PADDLE_WITH_CUDA
auto dst = self->mutable_data<T>(place);
......@@ -354,8 +376,13 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self,
if (platform::is_cpu_place(place)) {
output->mutable_data(BOOST_GET_CONST(platform::CPUPlace, place),
self.type());
#ifdef PADDLE_WITH_CUDA
} else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
output->mutable_data(BOOST_GET_CONST(platform::XPUPlace, place),
self.type());
#endif
} else {
#ifdef PADDLE_WITH_CUDA
if (platform::is_cuda_pinned_place(place)) {
output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place),
self.type());
......@@ -516,6 +543,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
return py::array();
}
bool is_gpu_tensor = platform::is_gpu_place(tensor.place());
bool is_xpu_tensor = platform::is_xpu_place(tensor.place());
const auto &tensor_dims = tensor.dims();
auto tensor_dtype = tensor.type();
size_t sizeof_dtype = framework::SizeOfType(tensor_dtype);
......@@ -534,7 +562,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type());
if (!is_gpu_tensor) {
if (!is_gpu_tensor && !is_xpu_tensor) {
if (!need_deep_copy) {
return py::array(py::buffer_info(
const_cast<void *>(tensor_buf_ptr), sizeof_dtype, py_dtype_str,
......@@ -557,28 +585,54 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
copy_bytes);
return py_arr;
}
}
} else if (is_xpu_tensor) {
#ifdef PADDLE_WITH_XPU
py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
platform::errors::InvalidArgument(
"PyArray is not writable, in which case memory leak "
"or double free would occur"));
PADDLE_ENFORCE_EQ(
py_arr.owndata(), true,
platform::errors::InvalidArgument(
"PyArray does not own data, in which case memory leak "
"or double free would occur"));
size_t copy_bytes = sizeof_dtype * numel;
auto p = BOOST_GET_CONST(platform::XPUPlace, tensor.place());
paddle::memory::Copy(platform::CPUPlace(), py_arr.mutable_data(), p,
tensor_buf_ptr, copy_bytes);
return py_arr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (is_gpu_tensor) {
#ifdef PADDLE_WITH_CUDA
py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
platform::errors::InvalidArgument(
"PyArray is not writable, in which case memory leak "
"or double free would occur"));
PADDLE_ENFORCE_EQ(py_arr.owndata(), true,
platform::errors::InvalidArgument(
"PyArray does not own data, in which case memory leak "
"or double free would occur"));
size_t copy_bytes = sizeof_dtype * numel;
paddle::platform::GpuMemcpySync(py_arr.mutable_data(), tensor_buf_ptr,
copy_bytes, cudaMemcpyDeviceToHost);
return py_arr;
py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
platform::errors::InvalidArgument(
"PyArray is not writable, in which case memory leak "
"or double free would occur"));
PADDLE_ENFORCE_EQ(
py_arr.owndata(), true,
platform::errors::InvalidArgument(
"PyArray does not own data, in which case memory leak "
"or double free would occur"));
size_t copy_bytes = sizeof_dtype * numel;
paddle::platform::GpuMemcpySync(py_arr.mutable_data(), tensor_buf_ptr,
copy_bytes, cudaMemcpyDeviceToHost);
return py_arr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use CUDAPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support."));
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use CUDAPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support."));
#endif
}
PADDLE_THROW(platform::errors::Unimplemented("Place is not supported"));
return py::array();
}
} // namespace pybind
......
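tensor_py.h now round-trips data between numpy and XPU memory: set() goes through xpu_memcpy, and TensorToPyArray copies device memory back into a fresh numpy array. A hedged sketch of that round trip; converting a LoDTensor with np.array() is assumed to take the TensorToPyArray path, as it does for CPU/GPU tensors:
import numpy as np
import paddle.fluid as fluid
if fluid.is_compiled_with_xpu():
    data = np.random.random((2, 3)).astype("float32")
    t = fluid.LoDTensor()
    t.set(data, fluid.XPUPlace(0))   # host -> XPU copy (xpu_memcpy above)
    back = np.array(t)               # XPU -> host copy via TensorToPyArray
    assert np.allclose(back, data)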
......@@ -68,7 +68,7 @@ from .input import embedding, one_hot
from . import distribute_lookup_table
from .param_attr import ParamAttr, WeightNormParamAttr
from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope
from .core import LoDTensor, LoDTensorArray, CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope
from .incubate import fleet
from .incubate import data_generator
from .transpiler import DistributeTranspiler, \
......@@ -119,6 +119,7 @@ __all__ = framework.__all__ + executor.__all__ + \
'LoDTensor',
'LoDTensorArray',
'CPUPlace',
'XPUPlace',
'CUDAPlace',
'CUDAPinnedPlace',
'Tensor',
......
......@@ -48,6 +48,7 @@ __all__ = [
'cuda_pinned_places',
'in_dygraph_mode',
'is_compiled_with_cuda',
'is_compiled_with_xpu',
'Variable',
'ComplexVariable',
'load_op_library',
......@@ -310,6 +311,21 @@ def _cuda_ids():
return device_ids
def is_compiled_with_xpu():
"""
Whether this whl package can be used to run the model on XPU.
Returns (bool): whether XPU is supported or not.
Examples:
.. code-block:: python
import paddle.fluid as fluid
support_xpu = fluid.is_compiled_with_xpu()
"""
return core.is_compiled_with_xpu()
def is_compiled_with_cuda():
"""
Whether this whl package can be used to run the model on GPU.
......
......@@ -175,5 +175,57 @@ class TestFP16MulOp2(TestMulOp2):
no_grad_set=set('Y'))
@unittest.skipIf(not core.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUMulOp1(TestMulOp):
def init_dtype_type(self):
self.dtype = np.float32
def test_check_output(self):
place = core.XPUPlace(0)
self.check_output_with_place(place, atol=1e-1)
def test_check_grad_normal(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=0.5)
def test_check_grad_ingore_x(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
@unittest.skipIf(not core.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUMulOp2(TestMulOp2):
def init_dtype_type(self):
self.dtype = np.float32
def test_check_output(self):
place = core.XPUPlace(0)
self.check_output_with_place(place, atol=2e-1)
def test_check_grad_normal(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=0.9)
def test_check_grad_ingore_x(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y'))
if __name__ == "__main__":
unittest.main()
......@@ -303,6 +303,23 @@ if '${WITH_MKLDNN}' == 'ON':
else:
package_data['paddle.libs']+=['mkldnn.dll']
if '${WITH_XPU}' == 'ON':
# only change rpath in Release mode.
if '${CMAKE_BUILD_TYPE}' == 'Release':
if os.name != 'nt':
if "@APPLE@" == "1":
command = "install_name_tool -id \"@loader_path/\" ${XPU_API_LIB}"
else:
command = "patchelf --set-rpath '$ORIGIN/' ${XPU_API_LIB}"
if os.system(command) != 0:
raise Exception("patch ${XPU_API_LIB} failed, command: %s" % command)
shutil.copy('${XPU_API_LIB}', libs_path)
shutil.copy('${XPU_RT_LIB}', libs_path)
shutil.copy('${XPU_SIM_LIB}', libs_path)
package_data['paddle.libs']+=['${XPU_API_LIB_NAME}',
'${XPU_RT_LIB_NAME}',
'${XPU_SIM_LIB_NAME}']
# copy libfuild_framework.so to libs
if os.name != 'nt' and sys.platform != 'darwin':
paddle_framework_lib='${FLUID_FRAMEWORK_SHARED_LIB}'
......