bug fix of xpu lite engine, test=develop (#30918)

* bug fix of xpu lite engine, test=develop
* xpu zero copy tensor, test=develop
* revert paddle/fluid/inference/tests/api/CMakeLists.txt

bug fix of xpu lite engine, test=develop (#30918)
* bug fix of xpu lite engine, test=develop
* xpu zero copy tensor, test=develop
* revert paddle/fluid/inference/tests/api/CMakeLists.txt
99bd16eb · 石晓伟 · GitHub · 2e932338 · 99bd16eb · 99bd16eb
4 changed files
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -254,7 +254,29 @@ bool AnalysisPredictor::CreateExecutor() {
    }
 #endif
  } else if (config_.use_xpu()) {
+    if (config_.lite_engine_enabled()) {
+#ifdef LITE_SUBGRAPH_WITH_XPU
+      // Currently, Paddle-Lite's XPU user interface only supports the transfer
+      // of Host data pointers. If it is currently used as a subgraph, execution
+      // efficiency will be sacrificed, so it is temporarily set to cpu place.
+      // And, the current lite engine of xpu must execute all parts of the
+      // model.
+      place_ = paddle::platform::CPUPlace();
+#else
+      PADDLE_THROW(platform::errors::Unavailable(
+          "You tried to use an XPU lite engine, but Paddle was not compiled "
+          "with it."));
+#endif  // LITE_SUBGRAPH_WITH_XPU
+    } else {
+#ifdef PADDLE_WITH_XPU
      place_ = paddle::platform::XPUPlace(config_.xpu_device_id());
+#else
+      PADDLE_THROW(platform::errors::Unavailable(
+          "You tried to use XPU forward propagation (inference without lite "
+          "engine), but Paddle was not compiled "
+          "with WITH_XPU."));
+#endif  // PADDLE_WITH_XPU
+    }
  } else {
    place_ = paddle::platform::CPUPlace();
  }
@@ -760,11 +782,17 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
  if (platform::is_cpu_place(place_)) {
    res->SetPlace(PaddlePlace::kCPU);
  } else if (platform::is_xpu_place(place_)) {
-    PADDLE_ENFORCE_EQ(config_.use_gpu(), false,
+    if (config_.lite_engine_enabled()) {
-                      platform::errors::InvalidArgument(
+      // Currently, Paddle-Lite's XPU user interface only supports the transfer
-                          "Only one choice can be made between CPU and XPU."));
+      // of host data pointers. If it is currently used as a subgraph, execution
+      // efficiency will be sacrificed, so it is temporarily set to cpu place.
+      // And, the current lite engine of xpu must execute all parts of the
+      // model.
+      res->SetPlace(PaddlePlace::kCPU);
+    } else {
      auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
      res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
+    }
  } else {
    auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
@@ -786,8 +814,17 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
  if (platform::is_cpu_place(place_)) {
    res->SetPlace(PaddlePlace::kCPU);
  } else if (platform::is_xpu_place(place_)) {
+    if (config_.lite_engine_enabled()) {
+      // Currently, Paddle-Lite's XPU user interface only supports the transfer
+      // of host data pointers. If it is currently used as a subgraph, execution
+      // efficiency will be sacrificed, so it is temporarily set to cpu place.
+      // And, the current lite engine of xpu must execute all parts of the
+      // model.
+      res->SetPlace(PaddlePlace::kCPU);
+    } else {
      auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
      res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
+    }
  } else {
    auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());

--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -115,7 +115,7 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
  if (place_ == PaddlePlace::kCPU) {
    auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
    std::memcpy(static_cast<void *>(t_data), data, ele_size);
-  } else {
+  } else if (place_ == PaddlePlace::kGPU) {
 #ifdef PADDLE_WITH_CUDA
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    platform::CUDAPlace gpu_place(device_);
@@ -129,6 +129,19 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
    PADDLE_THROW(platform::errors::Unavailable(
        "Not compiled with CUDA, should not reach here."));
 #endif
+  } else if (place_ == PaddlePlace::kXPU) {
+#ifdef PADDLE_WITH_XPU
+    platform::XPUPlace xpu_place(device_);
+    auto *t_data = tensor->mutable_data<T>(xpu_place);
+    memory::Copy(xpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
+                 data, ele_size);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Not compiled with XPU, should not reach here."));
+#endif
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "The analysis predictor supports CPU, GPU and XPU now."));
  }
 }
@@ -141,7 +154,7 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
  if (platform::is_cpu_place(t_place)) {
    std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
-  } else {
+  } else if (place_ == PaddlePlace::kGPU) {
 #ifdef PADDLE_WITH_CUDA
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, t_place);
@@ -155,6 +168,18 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
    PADDLE_THROW(platform::errors::Unavailable(
        "Not compile with CUDA, should not reach here."));
 #endif
+  } else if (place_ == PaddlePlace::kXPU) {
+#ifdef PADDLE_WITH_XPU
+    auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, t_place);
+    memory::Copy(platform::CPUPlace(), static_cast<void *>(data), xpu_place,
+                 t_data, ele_num * sizeof(T));
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Not compile with XPU, should not reach here."));
+#endif
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "The analysis predictor supports CPU, GPU and XPU now."));
  }
 }
 template PD_INFER_DECL void ZeroCopyTensor::copy_from_cpu<float>(

--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -197,9 +197,9 @@ struct PD_INFER_DECL AnalysisConfig {
  ///
  int gpu_device_id() const { return gpu_device_id_; }
  ///
-  /// \brief Get the GPU device id.
+  /// \brief Get the XPU device id.
  ///
-  /// \return int The GPU device id.
+  /// \return int The XPU device id.
  ///
  int xpu_device_id() const { return xpu_device_id_; }
  ///

--- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
@@ -24,8 +24,10 @@ limitations under the License. */
 namespace paddle {
 namespace inference {
-int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
+int test_predictor(const AnalysisConfig& config_in,
+                   Barrier* barrier = nullptr) {
  static std::mutex mutex;
+  AnalysisConfig config{config_in};
  std::unique_ptr<PaddlePredictor> predictor;
  {
    std::unique_lock<std::mutex> lock(mutex);
@@ -58,12 +60,50 @@ int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
  return 0;
 }
+int test_predictor_zero_copy(const AnalysisConfig& config_in,
+                             Barrier* barrier = nullptr) {
+  static std::mutex mutex;
+  AnalysisConfig config{config_in};
+  config.SwitchUseFeedFetchOps(false);
+  std::unique_ptr<PaddlePredictor> predictor;
+  {
+    std::unique_lock<std::mutex> lock(mutex);
+    predictor = std::move(CreatePaddlePredictor(config));
+  }
+  if (barrier) {
+    barrier->Wait();
+  }
+  std::vector<float> input({1});
+  auto in_tensor{predictor->GetInputTensor(predictor->GetInputNames().front())};
+  in_tensor->Reshape({1, 1});
+  in_tensor->copy_from_cpu(input.data());
+  predictor->ZeroCopyRun();
+  auto out_tensor{
+      predictor->GetOutputTensor(predictor->GetOutputNames().front())};
+  std::vector<float> data_o(10);
+  out_tensor->copy_to_cpu(data_o.data());
+  const std::vector<float> truth_values = {
+      -0.00621776f, -0.00620937f, 0.00990623f,  -0.0039817f, -0.00074315f,
+      0.61229795f,  -0.00491806f, -0.00068755f, 0.18409646f, 0.30090684f};
+  const size_t expected_size = 1;
+  EXPECT_EQ(predictor->GetOutputNames().size(), expected_size);
+  for (size_t j = 0; j < truth_values.size(); ++j) {
+    EXPECT_LT(std::abs(data_o[j] - truth_values[j]), 10e-6);
+  }
+  return 0;
+}
 #ifdef PADDLE_WITH_XPU
 TEST(AnalysisPredictor, native_xpu) {
  AnalysisConfig config;
  config.EnableXpu();
  config.SetModel(FLAGS_infer_model + "/" + "mul_model");
-  test_main(config);
+  test_predictor(config);
+  test_predictor_zero_copy(config);
 }
 #endif
@@ -73,6 +113,8 @@ TEST(AnalysisPredictor, lite_xpu) {
  config.EnableXpu();
  config.SetModel(FLAGS_infer_model + "/" + "mul_model");
  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
+  test_predictor(config);
+  test_predictor_zero_copy(config);
 }
 #endif
@@ -87,7 +129,8 @@ TEST(AnalysisPredictor, thread_local_stream) {
      config.EnableUseGpu(100, 0);
      config.SetModel(FLAGS_infer_model + "/" + "mul_model");
      config.EnableGpuMultiStream();
-      test_main(config, &barrier);
+      test_predictor(config, &barrier);
+      test_predictor_zero_copy(config);
    });
  }
  for (auto& th : threads) {
@@ -100,7 +143,8 @@ TEST(AnalysisPredictor, lite_engine) {
  config.EnableUseGpu(100, 0);
  config.SetModel(FLAGS_infer_model + "/" + "mul_model");
  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
-  test_main(config);
+  test_predictor(config);
+  test_predictor_zero_copy(config);
 }
 #endif