support xpu with analysis predictor, test=develop (#30832) (#30863)

d199edd8 · 石晓伟 · GitHub · d1ae7b98 · d199edd8 · d199edd8
19 changed files
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -5,36 +5,35 @@ endif()
 INCLUDE(ExternalProject)
 SET(XPU_PROJECT                 "extern_xpu")
-if (WITH_AARCH64)
+if(NOT XPU_SDK_ROOT)
+  if (WITH_AARCH64)
      SET(XPU_URL    "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/aarch64/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
-elseif(WITH_SUNWAY)
+  elseif(WITH_SUNWAY)
      SET(XPU_URL    "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
-else()
+  else()
      SET(XPU_URL    "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
-endif()
+  endif()
-SET(XPU_SOURCE_DIR              "${THIRD_PARTY_PATH}/xpu")
+  SET(XPU_SOURCE_DIR              "${THIRD_PARTY_PATH}/xpu")
-SET(XPU_DOWNLOAD_DIR            "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
+  SET(XPU_DOWNLOAD_DIR            "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
-SET(XPU_INSTALL_DIR             "${THIRD_PARTY_PATH}/install/xpu")
+  SET(XPU_INSTALL_DIR             "${THIRD_PARTY_PATH}/install/xpu")
-SET(XPU_API_INC_DIR             "${THIRD_PARTY_PATH}/install/xpu/include")
+  SET(XPU_API_INC_DIR             "${THIRD_PARTY_PATH}/install/xpu/include")
-SET(XPU_LIB_DIR                 "${THIRD_PARTY_PATH}/install/xpu/lib")
+  SET(XPU_LIB_DIR                 "${THIRD_PARTY_PATH}/install/xpu/lib")
-SET(XPU_API_LIB_NAME            "libxpuapi.so")
+  SET(XPU_API_LIB_NAME            "libxpuapi.so")
-SET(XPU_RT_LIB_NAME             "libxpurt.so")
+  SET(XPU_RT_LIB_NAME             "libxpurt.so")
-SET(XPU_API_LIB                 "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
+  SET(XPU_API_LIB                 "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
-SET(XPU_RT_LIB                  "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
+  SET(XPU_RT_LIB                  "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
+  SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
-INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
+  FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
-FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
    "PROJECT(XPU)\n"
    "cmake_minimum_required(VERSION 3.0)\n"
    "install(DIRECTORY xpu/include xpu/lib \n"
    "        DESTINATION ${XPU_INSTALL_DIR})\n")
-ExternalProject_Add(
+  ExternalProject_Add(
      ${XPU_PROJECT}
      ${EXTERNAL_PROJECT_LOG_ARGS}
      PREFIX                ${XPU_SOURCE_DIR}
@@ -45,8 +44,14 @@ ExternalProject_Add(
      UPDATE_COMMAND        ""
      CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
      CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
-)
+  )
+else()
+  # XPU_SDK_ROOT was provided: the download/ExternalProject branch above is
+  # skipped and the header directory and the two shared libraries are taken
+  # from the XTDK tree under that root.
+  SET(XPU_API_INC_DIR   "${XPU_SDK_ROOT}/XTDK/include/")
+  SET(XPU_API_LIB "${XPU_SDK_ROOT}/XTDK/shlib/libxpuapi.so")
+  SET(XPU_RT_LIB "${XPU_SDK_ROOT}/XTDK/runtime/shlib/libxpurt.so")
+endif()
+# Runs for both branches, using whichever XPU_API_INC_DIR was set above.
+INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
 ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL)
 set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
@@ -69,4 +74,14 @@ else(WITH_XPU_BKCL)
  TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} )
 endif(WITH_XPU_BKCL)
-ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
+if(NOT XPU_SDK_ROOT)
+  # The SDK is downloaded by the external project, so xpulib must wait for it.
+  ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
+else()
+  # A local SDK is used and nothing is downloaded. Still provide an
+  # `extern_xpu` target so that code depending on that name keeps working.
+  # Target-to-target ordering is expressed with ADD_DEPENDENCIES; the DEPENDS
+  # option of ADD_CUSTOM_TARGET is documented for files and custom-command
+  # outputs, not for other targets.
+  ADD_CUSTOM_TARGET(extern_xpu)
+  ADD_DEPENDENCIES(extern_xpu xpulib)
+endif()
+# Ensure that xpu/api.h can be included without dependency errors.
+# An empty source file is generated and compiled into the static library
+# `xpu_headers_dummy`, which is made to depend on `extern_xpu`. The
+# directory-scoped link_libraries() call then attaches that dummy library to
+# targets added after this point, presumably so that they are built only after
+# `extern_xpu` (and therefore the XPU headers) is available.
+file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "")
+add_library(xpu_headers_dummy STATIC ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc)
+add_dependencies(xpu_headers_dummy extern_xpu)
+link_libraries(xpu_headers_dummy)
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -33,6 +33,8 @@ PassStrategy *AnalysisConfig::pass_builder() const {
    if (use_gpu_) {
      LOG(INFO) << "Create GPU IR passes";
      pass_builder_.reset(new GpuPassStrategy);
+    } else if (use_xpu_) {
+      pass_builder_.reset(new XpuPassStrategy);
    } else {
      LOG(INFO) << "Create CPU IR passes";
      pass_builder_.reset(new CpuPassStrategy);
@@ -73,7 +75,7 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
  use_gpu_ = true;
  memory_pool_init_size_mb_ = memory_pool_init_size_mb;
  FLAGS_initial_gpu_memory_in_mb = memory_pool_init_size_mb_;
-  device_id_ = device_id;
+  gpu_device_id_ = device_id;
 #else
  LOG(ERROR) << "Please compile with gpu to EnableGpu()";
  use_gpu_ = false;
@@ -115,7 +117,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
  // GPU related.
  CP_MEMBER(use_gpu_);
  CP_MEMBER(use_cudnn_);
-  CP_MEMBER(device_id_);
+  CP_MEMBER(gpu_device_id_);
+  CP_MEMBER(xpu_device_id_);
  CP_MEMBER(memory_pool_init_size_mb_);
  CP_MEMBER(enable_memory_optim_);
@@ -174,8 +177,14 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
  CP_MEMBER(thread_local_stream_);
  if (use_gpu_) {
+    // GPU mode is selected here, so XPU must not be enabled as well.
+    PADDLE_ENFORCE_EQ(use_xpu_, false,
+                      platform::errors::InvalidArgument(
+                          "Only one choice can be made between GPU and XPU."));
    pass_builder_.reset(new GpuPassStrategy(
        *static_cast<GpuPassStrategy *>(other.pass_builder())));
+  } else if (use_xpu_) {
+    pass_builder_.reset(new XpuPassStrategy(
+        *static_cast<XpuPassStrategy *>(other.pass_builder())));
  } else {
    pass_builder_.reset(new CpuPassStrategy(
        *static_cast<CpuPassStrategy *>(other.pass_builder())));
@@ -333,6 +342,12 @@ void AnalysisConfig::Update() {
        // Append after the Affine_channel_conv_fuse pass.
        pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
      }
+    } else if (use_xpu()) {
+      // XPU mode is selected here, so GPU must not be enabled as well.
+      PADDLE_ENFORCE_EQ(
+          use_gpu(), false,
+          platform::errors::InvalidArgument(
+              "Only one choice can be made between GPU and XPU."));
+      pass_builder_.reset(new XpuPassStrategy);
    } else {
      pass_builder_.reset(new CpuPassStrategy);
    }
@@ -341,7 +356,13 @@ void AnalysisConfig::Update() {
    if (use_gpu()) {
      pass_builder_.reset(new GpuPassStrategy(
          *static_cast<GpuPassStrategy *>(pass_builder_.get())));
+    } else if (use_xpu()) {
+      // XPU mode is selected here, so GPU must not be enabled as well.
+      PADDLE_ENFORCE_EQ(
+          use_gpu(), false,
+          platform::errors::InvalidArgument(
+              "Only one choice can be made between GPU and XPU."));
+      pass_builder_.reset(new XpuPassStrategy(
+          *static_cast<XpuPassStrategy *>(pass_builder_.get())));
    } else {
      pass_builder_.reset(new CpuPassStrategy(
          *static_cast<CpuPassStrategy *>(pass_builder_.get())));
@@ -420,19 +441,16 @@ void AnalysisConfig::Update() {
  }
  if (use_xpu_) {
-#ifndef LITE_SUBGRAPH_WITH_XPU
+#if (defined LITE_SUBGRAPH_WITH_XPU) || (defined PADDLE_WITH_XPU)
-    PADDLE_THROW(platform::errors::Unavailable(
-        "You tried to use an XPU device, but Paddle was not compiled "
-        "with XPU-runtime."));
-#endif
-    if (!use_lite_) {
-      LOG(WARNING) << "Because XPU currently only works in Paddle-Lite "
-                      "subgraph mode, please make sure you have enabled it.";
-    }
    PADDLE_ENFORCE_EQ(use_gpu_, false,
                      platform::errors::Unavailable(
                          "Currently, XPU and GPU cannot be enabled in the "
                          "same analysis configuration."));
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "You tried to use an XPU device, but Paddle was not compiled "
+        "with XPU-runtime."));
+#endif
  }
  if (ir_debug_) {
@@ -448,7 +466,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
  ss << use_gpu_;
  ss << use_fc_padding_;
-  ss << device_id_;
+  ss << gpu_device_id_;
+  ss << xpu_device_id_;
  ss << memory_pool_init_size_mb_;
  ss << use_tensorrt_;
@@ -507,7 +526,7 @@ float AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
  // Get the GPU memory details and calculate the fraction of memory for the
  // GPU memory pool.
  size_t gpu_total, gpu_available;
-  platform::SetDeviceId(device_id_);
+  platform::SetDeviceId(gpu_device_id_);
  platform::GpuMemoryUsage(&gpu_available, &gpu_total);
  double total_gpu_memory = gpu_total / 1024. / 1024.;
  float fraction_of_gpu_memory =
@@ -548,7 +567,7 @@ NativeConfig AnalysisConfig::ToNativeConfig() const {
  config.prog_file = prog_file_;
  config.param_file = params_file_;
  config.use_gpu = use_gpu_;
-  config.device = device_id_;
+  config.device = gpu_device_id_;
  config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
  config.specify_input_name = specify_input_name_;
  return config;

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -103,7 +103,10 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
    std::memcpy(static_cast<void *>(input_ptr), pt.data.data(),
                pt.data.length());
-  } else {
+  } else if (platform::is_gpu_place(place)) {
+    // This is the GPU branch; the same place must not also be an XPU place.
+    PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), false,
+                      platform::errors::InvalidArgument(
+                          "Only one choice can be made between GPU and XPU."));
 #ifdef PADDLE_WITH_CUDA
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto *dev_ctx =
@@ -116,6 +119,18 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
    PADDLE_THROW(paddle::platform::errors::Fatal(
        "Not compile with CUDA, should not reach here."));
 #endif
+  } else if (platform::is_xpu_place(place)) {
+#ifdef PADDLE_WITH_XPU
+    auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place);
+    memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
+                 platform::CPUPlace(), pt.data.data(), pt.data.length());
+#else
+    PADDLE_THROW(paddle::platform::errors::Fatal(
+        "Not compile with XPU, should not reach here."));
+#endif
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "The analysis predictor supports CPU, GPU and XPU now."));
  }
  // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
  framework::LoD lod;
@@ -182,6 +197,12 @@ bool AnalysisPredictor::PrepareScope(
           ++dev_id) {
        memory::Release(platform::CUDAPlace(dev_id));
      }
+#endif
+#ifdef PADDLE_WITH_XPU
+      for (int dev_id = 0; dev_id < paddle::platform::GetXPUDeviceCount();
+           ++dev_id) {
+        memory::Release(platform::XPUPlace(dev_id));
+      }
 #endif
      memory::Release(platform::CPUPlace());
    });
@@ -219,7 +240,9 @@ bool AnalysisPredictor::PrepareProgram(
 }
 bool AnalysisPredictor::CreateExecutor() {
  if (config_.use_gpu()) {
-    status_use_gpu_ = true;
+    // GPU mode is selected here, so XPU must not be enabled as well.
+    PADDLE_ENFORCE_EQ(config_.use_xpu(), false,
+                      platform::errors::InvalidArgument(
+                          "Only one choice can be made between GPU and XPU."));
    place_ = paddle::platform::CUDAPlace(config_.gpu_device_id());
 #ifdef PADDLE_WITH_CUDA
    if (config_.thread_local_stream_enabled()) {
@@ -230,6 +253,8 @@ bool AnalysisPredictor::CreateExecutor() {
      ctx->ResetThreadContext(platform::stream::Priority::kNormal);
    }
 #endif
+  } else if (config_.use_xpu()) {
+    place_ = paddle::platform::XPUPlace(config_.xpu_device_id());
  } else {
    place_ = paddle::platform::CPUPlace();
  }
@@ -734,11 +759,16 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
  res->SetName(name);
  if (platform::is_cpu_place(place_)) {
    res->SetPlace(PaddlePlace::kCPU);
+  } else if (platform::is_xpu_place(place_)) {
+    // The predictor runs on an XPU place, so GPU must not be enabled as well.
+    PADDLE_ENFORCE_EQ(config_.use_gpu(), false,
+                      platform::errors::InvalidArgument(
+                          "Only one choice can be made between GPU and XPU."));
+    auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
+    res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
  } else {
    auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
  }
  return res;
 }
@@ -755,6 +785,9 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
  res->SetName(name);
  if (platform::is_cpu_place(place_)) {
    res->SetPlace(PaddlePlace::kCPU);
+  } else if (platform::is_xpu_place(place_)) {
+    auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
+    res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
  } else {
    auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());

--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -415,7 +415,6 @@ class AnalysisPredictor : public PaddlePredictor {
 private:
  // Some status here that help to determine the status inside the predictor.
  bool status_is_cloned_{false};
-  bool status_use_gpu_{false};
 };
 }  // namespace paddle
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -80,7 +80,12 @@ bool NativePaddlePredictor::Init(
  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
  if (config_.use_gpu) {
+    // GPU mode is selected here, so XPU must not be enabled as well.
+    PADDLE_ENFORCE_EQ(config_.use_xpu, false,
+                      platform::errors::InvalidArgument(
+                          "Only one choice can be made between GPU and XPU."));
    place_ = paddle::platform::CUDAPlace(config_.device);
+  } else if (config_.use_xpu) {
+    place_ = paddle::platform::XPUPlace(config_.device);
  } else {
    place_ = paddle::platform::CPUPlace();
  }
@@ -240,7 +245,11 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
                  inputs[i].data.length());
-    } else {
+    } else if (platform::is_gpu_place(place_)) {
+      // This is the GPU branch; the same place must not also be an XPU place.
+      PADDLE_ENFORCE_EQ(
+          platform::is_xpu_place(place_), false,
+          platform::errors::InvalidArgument(
+              "Only one choice can be made between GPU and XPU."));
 #ifdef PADDLE_WITH_CUDA
      platform::DeviceContextPool &pool =
          platform::DeviceContextPool::Instance();
@@ -253,6 +262,16 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
 #else
      PADDLE_THROW(platform::errors::Unavailable(
          "Not compile with CUDA, should not reach here."));
+#endif
+    } else {
+#ifdef PADDLE_WITH_XPU
+      auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
+      memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
+                   platform::CPUPlace(), inputs[i].data.data(),
+                   inputs[i].data.length());
+#else
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Not compile with XPU, should not reach here."));
 #endif
    }

--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -58,19 +58,15 @@ NativeConfig GetConfig() {
  config.model_dir = FLAGS_word2vec_dirname;
  LOG(INFO) << "dirname  " << config.model_dir;
  config.fraction_of_gpu_memory = 0.15;
-#ifdef PADDLE_WITH_CUDA
-  config.use_gpu = true;
-#else
-  config.use_gpu = false;
-#endif
  config.device = 0;
  return config;
 }
-void MainWord2Vec(bool use_gpu) {
+void MainWord2Vec(const paddle::PaddlePlace& place) {
  NativeConfig config = GetConfig();
-  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
-  config.use_gpu = use_gpu;
+  // Set the device flags before `config` is handed to CreatePaddlePredictor,
+  // as the other Main* helpers in this file do; otherwise the requested place
+  // is not applied to the predictor created here.
+  config.use_gpu = paddle::gpu_place_used(place);
+  config.use_xpu = paddle::xpu_place_used(place);
+  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
  framework::LoDTensor first_word, second_word, third_word, fourth_word;
  framework::LoD lod{{0, 1}};
@@ -117,11 +113,12 @@ void MainWord2Vec(bool use_gpu) {
  }
 }
-void MainImageClassification(bool use_gpu) {
+void MainImageClassification(const paddle::PaddlePlace& place) {
  int batch_size = 2;
  bool repeat = false;
  NativeConfig config = GetConfig();
-  config.use_gpu = use_gpu;
+  config.use_gpu = paddle::gpu_place_used(place);
+  config.use_xpu = paddle::xpu_place_used(place);
  config.model_dir =
      FLAGS_book_dirname + "/image_classification_resnet.inference.model";
@@ -162,9 +159,10 @@ void MainImageClassification(bool use_gpu) {
  }
 }
-void MainThreadsWord2Vec(bool use_gpu) {
+void MainThreadsWord2Vec(const paddle::PaddlePlace& place) {
  NativeConfig config = GetConfig();
-  config.use_gpu = use_gpu;
+  config.use_gpu = paddle::gpu_place_used(place);
+  config.use_xpu = paddle::xpu_place_used(place);
  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
  // prepare inputs data and reference results
@@ -223,11 +221,12 @@ void MainThreadsWord2Vec(bool use_gpu) {
  }
 }
-void MainThreadsImageClassification(bool use_gpu) {
+void MainThreadsImageClassification(const paddle::PaddlePlace& place) {
  constexpr int num_jobs = 4;  // each job run 1 batch
  constexpr int batch_size = 1;
  NativeConfig config = GetConfig();
-  config.use_gpu = use_gpu;
+  config.use_gpu = paddle::gpu_place_used(place);
+  config.use_xpu = paddle::xpu_place_used(place);
  config.model_dir =
      FLAGS_book_dirname + "/image_classification_resnet.inference.model";
@@ -276,29 +275,42 @@ void MainThreadsImageClassification(bool use_gpu) {
  }
 }
-TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); }
+TEST(inference_api_native, word2vec_cpu) {
+  MainWord2Vec(paddle::PaddlePlace::kCPU);
+}
 TEST(inference_api_native, word2vec_cpu_threads) {
-  MainThreadsWord2Vec(false /*use_gpu*/);
+  MainThreadsWord2Vec(paddle::PaddlePlace::kCPU);
 }
 TEST(inference_api_native, image_classification_cpu) {
-  MainImageClassification(false /*use_gpu*/);
+  MainImageClassification(paddle::PaddlePlace::kCPU);
 }
 TEST(inference_api_native, image_classification_cpu_threads) {
-  MainThreadsImageClassification(false /*use_gpu*/);
+  MainThreadsImageClassification(paddle::PaddlePlace::kCPU);
 }
+#ifdef PADDLE_WITH_XPU
+TEST(inference_api_native, word2vec_xpu) {
+  MainWord2Vec(paddle::PaddlePlace::kXPU);
+}
+TEST(inference_api_native, image_classification_xpu) {
+  MainImageClassification(paddle::PaddlePlace::kXPU);
+}
+#endif
 #ifdef PADDLE_WITH_CUDA
-TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); }
+TEST(inference_api_native, word2vec_gpu) {
+  MainWord2Vec(paddle::PaddlePlace::kGPU);
+}
 // Turn off temporarily for the unstable result.
 // TEST(inference_api_native, word2vec_gpu_threads) {
-//   MainThreadsWord2Vec(true /*use_gpu*/);
+//   MainThreadsWord2Vec(paddle::PaddlePlace::kGPU);
 // }
 TEST(inference_api_native, image_classification_gpu) {
-  MainImageClassification(true /*use_gpu*/);
+  MainImageClassification(paddle::PaddlePlace::kGPU);
 }
 // Turn off temporarily for the unstable result.
 // TEST(inference_api_native, image_classification_gpu_threads) {
-//   MainThreadsImageClassification(true /*use_gpu*/);
+//   MainThreadsImageClassification(paddle::PaddlePlace::kGPU);
 // }
 #endif

--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -185,11 +185,23 @@ struct PD_INFER_DECL AnalysisConfig {
  ///
  bool use_gpu() const { return use_gpu_; }
  ///
+  /// \brief A boolean state telling whether the XPU is turned on.
+  ///
+  /// \return bool Whether the XPU is turned on.
+  ///
+  bool use_xpu() const { return use_xpu_; }
+  ///
+  /// \brief Get the GPU device id.
+  ///
+  /// \return int The GPU device id.
+  ///
+  int gpu_device_id() const { return gpu_device_id_; }
+  ///
-  /// \brief Get the GPU device id.
+  /// \brief Get the XPU device id.
  ///
-  /// \return int The GPU device id.
+  /// \return int The XPU device id.
  ///
-  int gpu_device_id() const { return device_id_; }
+  int xpu_device_id() const { return xpu_device_id_; }
  ///
  /// \brief Get the initial size in MB of the GPU memory pool.
  ///
@@ -579,7 +591,8 @@ struct PD_INFER_DECL AnalysisConfig {
  // GPU related.
  bool use_gpu_{false};
-  int device_id_{0};
+  int gpu_device_id_{0};
+  int xpu_device_id_{0};
  uint64_t memory_pool_init_size_mb_{100};  // initial size is 100MB.
  bool use_cudnn_{false};

--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -161,7 +161,7 @@ struct PD_INFER_DECL PaddleTensor {
  std::vector<std::vector<size_t>> lod;  ///<  Tensor+LoD equals LoDTensor
 };
-enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
+enum class PaddlePlace { kUNK = -1, kCPU, kGPU, kXPU };
 /// \brief Represents an n-dimensional array of values.
 /// The ZeroCopyTensor is used to store the input or output of the network.
@@ -360,6 +360,7 @@ class PD_INFER_DECL PaddlePredictor {
 struct PD_INFER_DECL NativeConfig : public PaddlePredictor::Config {
  NativeConfig();
  /// GPU related fields.
+  bool use_xpu{false};
  bool use_gpu{false};
  int device{0};
  float fraction_of_gpu_memory{

--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -140,11 +140,16 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
  /// \return A bool variable implying whether we are in gpu mode.
  bool use_gpu() const { return use_gpu_; }
+  /// \brief Check if we are using xpu.
+  /// \return A bool variable implying whether we are in xpu mode.
+  bool use_xpu() const { return use_xpu_; }
  /// \brief Default destructor.
  virtual ~PassStrategy() = default;
 protected:
  /// \cond Protected
+  bool use_xpu_{false};
  bool use_gpu_{false};
  bool use_mkldnn_{false};
  /// \endcond
@@ -226,6 +231,14 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
  /// \endcond
 };
+/// \class XpuPassStrategy
+/// \brief The XPU passes controller, it is used in AnalysisPredictor with XPU
+/// mode.
+class PD_INFER_DECL XpuPassStrategy final : public PassStrategy {
+ public:
+  /// \brief Construct the strategy from an empty initializer and mark it as
+  /// XPU mode, so that PassStrategy::use_xpu() reports true for this strategy.
+  XpuPassStrategy() : PassStrategy({}) { use_xpu_ = true; }
+};
 /// \brief List of tensorRT subgraph passes.
 PD_INFER_DECL extern const std::vector<std::string> kTRTSubgraphPasses;

--- a/paddle/fluid/inference/capi/paddle_c_api.h
+++ b/paddle/fluid/inference/capi/paddle_c_api.h
@@ -165,12 +165,19 @@ PADDLE_CAPI_EXPORT extern void PD_EnableUseGpu(PD_AnalysisConfig* config,
                                               int memory_pool_init_size_mb,
                                               int device_id);
+PADDLE_CAPI_EXPORT extern void PD_EnableXpu(PD_AnalysisConfig* config,
+                                            int l3_workspace_size);
 PADDLE_CAPI_EXPORT extern void PD_DisableGpu(PD_AnalysisConfig* config);
 PADDLE_CAPI_EXPORT extern bool PD_UseGpu(const PD_AnalysisConfig* config);
+PADDLE_CAPI_EXPORT extern bool PD_UseXpu(const PD_AnalysisConfig* config);
 PADDLE_CAPI_EXPORT extern int PD_GpuDeviceId(const PD_AnalysisConfig* config);
+PADDLE_CAPI_EXPORT extern int PD_XpuDeviceId(const PD_AnalysisConfig* config);
 PADDLE_CAPI_EXPORT extern int PD_MemoryPoolInitSizeMb(
    const PD_AnalysisConfig* config);

--- a/paddle/fluid/inference/capi/pd_config.cc
+++ b/paddle/fluid/inference/capi/pd_config.cc
@@ -111,6 +111,14 @@ void PD_EnableUseGpu(PD_AnalysisConfig* config, int memory_pool_init_size_mb,
                              device_id);
 }
+// C API wrapper: enforces that `config` is non-null, then forwards
+// `l3_workspace_size` to AnalysisConfig::EnableXpu on the wrapped config.
+void PD_EnableXpu(PD_AnalysisConfig* config, int l3_workspace_size) {
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
+  config->config.EnableXpu(l3_workspace_size);
+}
 void PD_DisableGpu(PD_AnalysisConfig* config) {
  PADDLE_ENFORCE_NOT_NULL(
      config,
@@ -127,6 +135,14 @@ bool PD_UseGpu(const PD_AnalysisConfig* config) {
  return config->config.use_gpu();
 }
+// C API wrapper: enforces that `config` is non-null, then returns
+// AnalysisConfig::use_xpu() of the wrapped config.
+bool PD_UseXpu(const PD_AnalysisConfig* config) {
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
+  return config->config.use_xpu();
+}
 int PD_GpuDeviceId(const PD_AnalysisConfig* config) {
  PADDLE_ENFORCE_NOT_NULL(
      config,
@@ -135,6 +151,14 @@ int PD_GpuDeviceId(const PD_AnalysisConfig* config) {
  return config->config.gpu_device_id();
 }
+// C API wrapper: enforces that `config` is non-null, then returns
+// AnalysisConfig::xpu_device_id() of the wrapped config.
+int PD_XpuDeviceId(const PD_AnalysisConfig* config) {
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
+  return config->config.xpu_device_id();
+}
 int PD_MemoryPoolInitSizeMb(const PD_AnalysisConfig* config) {
  PADDLE_ENFORCE_NOT_NULL(
      config,

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -499,6 +499,9 @@ if(WITH_GPU AND TENSORRT_FOUND)
    inference_analysis_test(test_analyzer_capi_gpu SRCS analyzer_capi_gpu_tester.cc
            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
+    # NOTE(review): this XPU C-API test appears to be registered inside the
+    # WITH_GPU AND TENSORRT_FOUND branch and reuses the TensorRT model
+    # directory - confirm that this is intended for XPU builds.
+    inference_analysis_test(test_analyzer_capi_xpu SRCS analyzer_capi_xpu_tester.cc
+            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
+            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
    set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model")
    if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}/small_quant_model.tgz)

--- a/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string>
+#include <vector>
+#include "paddle/fluid/inference/capi/paddle_c_api.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+namespace paddle {
+namespace inference {
+namespace analysis {
+#ifdef PADDLE_WITH_XPU
+// Exercises the C API configuration setters/getters with XPU enabled. The test
+// only builds and inspects a PD_AnalysisConfig; no predictor is created.
+TEST(PD_AnalysisConfig, use_xpu) {
+  std::string model_dir = FLAGS_infer_model + "/mobilenet";
+  PD_AnalysisConfig *config = PD_NewAnalysisConfig();
+  PD_SwitchUseFeedFetchOps(config, false);
+  PD_SwitchSpecifyInputNames(config, true);
+  PD_SwitchIrDebug(config, true);
+  PD_SetModel(config, model_dir.c_str(), nullptr);
+  PD_SetOptimCacheDir(config, (FLAGS_infer_model + "/OptimCacheDir").c_str());
+  const char *model_dir_ = PD_ModelDir(config);
+  LOG(INFO) << model_dir_;
+  // The second argument is the l3_workspace_size parameter of PD_EnableXpu.
+  PD_EnableXpu(config, 0xfffc00);
+  bool use_xpu = PD_UseXpu(config);
+  CHECK(use_xpu) << "NO";
+  // No device id is set explicitly, so the reported XPU device id must be 0.
+  int device = PD_XpuDeviceId(config);
+  CHECK(0 == device) << "NO";
+  PD_SwitchIrOptim(config, true);
+  bool ir_optim = PD_IrOptim(config);
+  CHECK(ir_optim) << "NO";
+  PD_EnableMemoryOptim(config);
+  bool memory_optim_enable = PD_MemoryOptimEnabled(config);
+  CHECK(memory_optim_enable) << "NO";
+  PD_EnableProfile(config);
+  bool profiler_enable = PD_ProfileEnabled(config);
+  CHECK(profiler_enable) << "NO";
+  // After PD_SetInValid the configuration must report itself as not valid.
+  PD_SetInValid(config);
+  bool is_valid = PD_IsValid(config);
+  CHECK(!is_valid) << "NO";
+  PD_DeleteAnalysisConfig(config);
+}
+#endif
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
@@ -58,6 +58,24 @@ int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
  return 0;
 }
+#ifdef PADDLE_WITH_XPU
+// Runs mul_model through test_main with XPU enabled on the analysis config
+// (no Lite engine is enabled in this test).
+TEST(AnalysisPredictor, native_xpu) {
+  AnalysisConfig config;
+  config.EnableXpu();
+  config.SetModel(FLAGS_infer_model + "/" + "mul_model");
+  test_main(config);
+}
+#endif
+#ifdef LITE_SUBGRAPH_WITH_XPU
+// Builds a config with both XPU and the Lite engine (float32) enabled.
+// NOTE(review): unlike native_xpu, this test never calls test_main, so it only
+// exercises configuration - confirm whether running the model was intended.
+TEST(AnalysisPredictor, lite_xpu) {
+  AnalysisConfig config;
+  config.EnableXpu();
+  config.SetModel(FLAGS_infer_model + "/" + "mul_model");
+  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
+}
+#endif
 #ifdef PADDLE_WITH_CUDA
 TEST(AnalysisPredictor, thread_local_stream) {
  const size_t thread_num = 5;

--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -27,6 +27,18 @@ limitations under the License. */
 DECLARE_bool(use_mkldnn);
+namespace paddle {
+// These helpers are defined in a header, so they are declared `inline` to
+// avoid multiple-definition link errors when the header is included from
+// more than one translation unit of the same binary.
+
+// Returns true when `place` is PaddlePlace::kGPU.
+inline bool gpu_place_used(const paddle::PaddlePlace& place) {
+  return place == paddle::PaddlePlace::kGPU;
+}
+// Returns true when `place` is PaddlePlace::kXPU.
+inline bool xpu_place_used(const paddle::PaddlePlace& place) {
+  return place == paddle::PaddlePlace::kXPU;
+}
+// Returns true when `place` is PaddlePlace::kCPU.
+inline bool cpu_place_used(const paddle::PaddlePlace& place) {
+  return place == paddle::PaddlePlace::kCPU;
+}
+}  // namespace paddle
 template <typename T>
 void SetupTensor(paddle::framework::LoDTensor* input,
                 paddle::framework::DDim dims, T lower, T upper) {

--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -197,12 +197,12 @@ void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
 template <>
 uint64_t Release<platform::XPUPlace>(const platform::XPUPlace &place) {
 #ifdef PADDLE_WITH_XPU
-  PADDLE_THROW(
+  LOG(WARNING) << "Release XPU pool is not supported now, no action here.";
-      platform::errors::PermissionDenied("Release XPU pool is not supported."));
 #else
  PADDLE_THROW(
      platform::errors::PermissionDenied("'XPUPlace' is not supported."));
 #endif
+  // NOTE(review): the return type is uint64_t, so -1 converts to the largest
+  // 64-bit unsigned value - confirm callers do not read it as a byte count.
+  return -1;
 }
 template <>

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -58,7 +58,7 @@ cc_library(place SRCS place.cc DEPS enforce boost)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 if(WITH_XPU)
-cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce)
+cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib)
 endif()
 add_subdirectory(dynload)

--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -369,7 +369,8 @@ void BindPaddlePlace(py::module *m) {
  py::enum_<PaddlePlace>(*m, "PaddlePlace")
      .value("UNK", PaddlePlace::kUNK)
      .value("CPU", PaddlePlace::kCPU)
-      .value("GPU", PaddlePlace::kGPU);
+      .value("GPU", PaddlePlace::kGPU)
+      .value("XPU", PaddlePlace::kXPU);
 }
 void BindPaddlePredictor(py::module *m) {
@@ -398,6 +399,7 @@ void BindNativeConfig(py::module *m) {
  py::class_<NativeConfig, PaddlePredictor::Config>(*m, "NativeConfig")
      .def(py::init<>())
      .def_readwrite("use_gpu", &NativeConfig::use_gpu)
+      .def_readwrite("use_xpu", &NativeConfig::use_xpu)
      .def_readwrite("device", &NativeConfig::device)
      .def_readwrite("fraction_of_gpu_memory",
                     &NativeConfig::fraction_of_gpu_memory)
@@ -459,7 +461,9 @@ void BindAnalysisConfig(py::module *m) {
           py::arg("l3_workspace_size"))
      .def("disable_gpu", &AnalysisConfig::DisableGpu)
      .def("use_gpu", &AnalysisConfig::use_gpu)
+      .def("use_xpu", &AnalysisConfig::use_xpu)
      .def("gpu_device_id", &AnalysisConfig::gpu_device_id)
+      .def("xpu_device_id", &AnalysisConfig::xpu_device_id)
      .def("memory_pool_init_size_mb",
           &AnalysisConfig::memory_pool_init_size_mb)
      .def("fraction_of_gpu_memory_for_pool",

--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -26,7 +26,20 @@ import sys
 paddle.enable_static()
-def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
+def get_place(target):
+    """Translate a target name into the fluid place to execute on.
+
+    Args:
+        target: Name of the device kind; "cuda", "xpu" or "cpu".
+
+    Returns:
+        fluid.CUDAPlace(0) for "cuda", fluid.XPUPlace(0) for "xpu" and
+        fluid.CPUPlace() for "cpu".
+
+    Raises:
+        ValueError: If `target` is not one of the three supported names.
+    """
+    # Guard clauses: each supported name returns immediately.
+    if target == "cuda":
+        return fluid.CUDAPlace(0)
+    if target == "xpu":
+        return fluid.XPUPlace(0)
+    if target == "cpu":
+        return fluid.CPUPlace()
+    # Nothing matched, so report the offending name to the caller.
+    unsupported_msg = (
+        "Target `{0}` is not on the support list: `cuda`, `xpu` and `cpu`.")
+    raise ValueError(unsupported_msg.format(target))
+def train(target, is_sparse, is_parallel, save_dirname, is_local=True):
    PASS_NUM = 100
    EMBED_SIZE = 32
    HIDDEN_SIZE = 256
@@ -93,7 +106,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
    train_reader = paddle.batch(
        paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    place = get_place(target)
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(
        feed_list=[first_word, second_word, third_word, forth_word, next_word],
@@ -143,13 +156,12 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
            train_loop(t.get_trainer_program())
-def infer(use_cuda, save_dirname=None):
+def infer(target, save_dirname=None):
    if save_dirname is None:
        return
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    place = get_place(target)
    exe = fluid.Executor(place)
    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        # Use fluid.io.load_inference_model to obtain the inference program desc,
@@ -211,10 +223,12 @@ def infer(use_cuda, save_dirname=None):
        infer_config = fluid.core.NativeConfig()
        infer_config.model_dir = 'word2vec.inference.model'
-        infer_config.use_gpu = use_cuda
+        if target == "cuda":
-        if use_cuda:
+            infer_config.use_gpu = True
            infer_config.device = 0
            infer_config.fraction_of_gpu_memory = 0.15
+        elif target == "xpu":
+            infer_config.use_xpu = True
        compiled_program = fluid.compiler.CompiledProgram(inference_program)
        compiled_program._with_inference_optimize(infer_config)
        assert compiled_program._is_inference is True
@@ -222,11 +236,13 @@ def infer(use_cuda, save_dirname=None):
        np_data = np.array(results[0])
        infer_out = infer_outputs[0].data.float_data()
        for a, b in zip(np_data[0], infer_out):
-            assert np.isclose(a, b), "a: {}, b: {}".format(a, b)
+            assert np.isclose(a, b, rtol=5e-5), "a: {}, b: {}".format(a, b)
-def main(use_cuda, is_sparse, is_parallel):
+def main(target, is_sparse, is_parallel):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
+    if target == "cuda" and not fluid.core.is_compiled_with_cuda():
+        return
+    if target == "xpu" and not fluid.core.is_compiled_with_xpu():
        return
    if not is_parallel:
@@ -234,8 +250,13 @@ def main(use_cuda, is_sparse, is_parallel):
    else:
        save_dirname = None
-    train(use_cuda, is_sparse, is_parallel, save_dirname)
+    if target == "xpu":
-    infer(use_cuda, save_dirname)
+        # Training this model on XPU is not supported yet, so train on
+        # CPU and use the XPU target only for inference.
+        train("cpu", is_sparse, is_parallel, save_dirname)
+    else:
+        train(target, is_sparse, is_parallel, save_dirname)
+    infer(target, save_dirname)
 FULL_TEST = os.getenv('FULL_TEST',
@@ -247,8 +268,8 @@ class W2VTest(unittest.TestCase):
    pass
-def inject_test_method(use_cuda, is_sparse, is_parallel):
+def inject_test_method(target, is_sparse, is_parallel):
-    fn_name = "test_{0}_{1}_{2}".format("cuda" if use_cuda else "cpu", "sparse"
+    fn_name = "test_{0}_{1}_{2}".format(target, "sparse"
                                        if is_sparse else "dense", "parallel"
                                        if is_parallel else "normal")
@@ -259,11 +280,10 @@ def inject_test_method(use_cuda, is_sparse, is_parallel):
        with fluid.scope_guard(scope):
            with fluid.program_guard(prog, startup_prog):
                main(
-                    use_cuda=use_cuda,
+                    target=target, is_sparse=is_sparse, is_parallel=is_parallel)
-                    is_sparse=is_sparse,
-                    is_parallel=is_parallel)
-    if (not fluid.core.is_compiled_with_cuda() or use_cuda) and is_sparse:
+    if (not fluid.core.is_compiled_with_cuda() or
+            target == "cuda") and is_sparse:
        fn = __impl__
    else:
        # skip the other test when on CI server
@@ -273,10 +293,10 @@ def inject_test_method(use_cuda, is_sparse, is_parallel):
    setattr(W2VTest, fn_name, fn)
-for use_cuda in (False, True):
+for target in ("cuda", "cpu", "xpu"):
    for is_sparse in (False, True):
        for is_parallel in (False, ):
-            inject_test_method(use_cuda, is_sparse, is_parallel)
+            inject_test_method(target, is_sparse, is_parallel)
 if __name__ == '__main__':
    unittest.main()