From fa0c0fb26cd659d8e1f8296b1059d2b4e99fc4e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com>
Date: Sat, 20 Feb 2021 15:54:55 +0800
Subject: [PATCH] bug fix of xpu lite engine, test=develop (#30918) (#31046)

* bug fix of xpu lite engine, test=develop

* xpu zero copy tensor, test=develop

* revert paddle/fluid/inference/tests/api/CMakeLists.txt
---
 .../fluid/inference/api/analysis_predictor.cc | 53 ++++++++++++++++---
 .../inference/api/details/zero_copy_tensor.cc | 29 +++++++++-
 .../inference/api/paddle_analysis_config.h    |  4 +-
 .../tests/api/lite_mul_model_test.cc          | 52 ++++++++++++++++--
 4 files changed, 122 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 2ba7ee2694..536f0b7407 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -254,7 +254,29 @@ bool AnalysisPredictor::CreateExecutor() {
     }
 #endif
   } else if (config_.use_xpu()) {
-    place_ = paddle::platform::XPUPlace(config_.xpu_device_id());
+    if (config_.lite_engine_enabled()) {
+#ifdef LITE_SUBGRAPH_WITH_XPU
+      // Currently, Paddle-Lite's XPU user interface only supports the transfer
+      // of Host data pointers. If it is currently used as a subgraph, execution
+      // efficiency will be sacrificed, so it is temporarily set to cpu place.
+      // And, the current lite engine of xpu must execute all parts of the
+      // model.
+      place_ = paddle::platform::CPUPlace();
+#else
+      PADDLE_THROW(platform::errors::Unavailable(
+          "You tried to use an XPU lite engine, but Paddle was not compiled "
+          "with it."));
+#endif  // LITE_SUBGRAPH_WITH_XPU
+    } else {
+#ifdef PADDLE_WITH_XPU
+      place_ = paddle::platform::XPUPlace(config_.xpu_device_id());
+#else
+      PADDLE_THROW(platform::errors::Unavailable(
+          "You tried to use XPU forward propagation (inference without lite "
+          "engine), but Paddle was not compiled "
+          "with WITH_XPU."));
+#endif  // PADDLE_WITH_XPU
+    }
   } else {
     place_ = paddle::platform::CPUPlace();
   }
@@ -767,11 +789,17 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
   if (platform::is_cpu_place(place_)) {
     res->SetPlace(PaddlePlace::kCPU);
   } else if (platform::is_xpu_place(place_)) {
-    PADDLE_ENFORCE_EQ(config_.use_gpu(), false,
-                      platform::errors::InvalidArgument(
-                          "Only one choice can be made between CPU and XPU."));
-    auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
-    res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
+    if (config_.lite_engine_enabled()) {
+      // Currently, Paddle-Lite's XPU user interface only supports the transfer
+      // of host data pointers. If it is currently used as a subgraph, execution
+      // efficiency will be sacrificed, so it is temporarily set to cpu place.
+      // And, the current lite engine of xpu must execute all parts of the
+      // model.
+      res->SetPlace(PaddlePlace::kCPU);
+    } else {
+      auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
+      res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
+    }
   } else {
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
     res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
@@ -793,8 +821,17 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
   if (platform::is_cpu_place(place_)) {
     res->SetPlace(PaddlePlace::kCPU);
   } else if (platform::is_xpu_place(place_)) {
-    auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
-    res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
+    if (config_.lite_engine_enabled()) {
+      // Currently, Paddle-Lite's XPU user interface only supports the transfer
+      // of host data pointers. If it is currently used as a subgraph, execution
+      // efficiency will be sacrificed, so it is temporarily set to cpu place.
+      // And, the current lite engine of xpu must execute all parts of the
+      // model.
+      res->SetPlace(PaddlePlace::kCPU);
+    } else {
+      auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
+      res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
+    }
   } else {
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
     res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index 46755eeda6..f44530019e 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -115,7 +115,7 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
   if (place_ == PaddlePlace::kCPU) {
     auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
     std::memcpy(static_cast<void *>(t_data), data, ele_size);
-  } else {
+  } else if (place_ == PaddlePlace::kGPU) {
 #ifdef PADDLE_WITH_CUDA
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     platform::CUDAPlace gpu_place(device_);
@@ -129,6 +129,19 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
     PADDLE_THROW(platform::errors::Unavailable(
         "Not compiled with CUDA, should not reach here."));
 #endif
+  } else if (place_ == PaddlePlace::kXPU) {
+#ifdef PADDLE_WITH_XPU
+    platform::XPUPlace xpu_place(device_);
+    auto *t_data = tensor->mutable_data<T>(xpu_place);
+    memory::Copy(xpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
+                 data, ele_size);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Not compiled with XPU, should not reach here."));
+#endif
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "The analysis predictor supports CPU, GPU and XPU now."));
   }
 }
 
@@ -141,7 +154,7 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
 
   if (platform::is_cpu_place(t_place)) {
     std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
-  } else {
+  } else if (place_ == PaddlePlace::kGPU) {
 #ifdef PADDLE_WITH_CUDA
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, t_place);
@@ -155,6 +168,18 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
     PADDLE_THROW(platform::errors::Unavailable(
         "Not compile with CUDA, should not reach here."));
 #endif
+  } else if (place_ == PaddlePlace::kXPU) {
+#ifdef PADDLE_WITH_XPU
+    auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, t_place);
+    memory::Copy(platform::CPUPlace(), static_cast<void *>(data), xpu_place,
+                 t_data, ele_num * sizeof(T));
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Not compile with XPU, should not reach here."));
+#endif
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "The analysis predictor supports CPU, GPU and XPU now."));
   }
 }
 template PD_INFER_DECL void ZeroCopyTensor::copy_from_cpu<float>(
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index c892284d91..e492b32cb6 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -197,9 +197,9 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   int gpu_device_id() const { return gpu_device_id_; }
   ///
-  /// \brief Get the GPU device id.
+  /// \brief Get the XPU device id.
   ///
-  /// \return int The GPU device id.
+  /// \return int The XPU device id.
   ///
   int xpu_device_id() const { return xpu_device_id_; }
   ///
diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
index 9c1ad79942..23bcd40af8 100644
--- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
@@ -24,8 +24,10 @@ limitations under the License. */
 namespace paddle {
 namespace inference {
 
-int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
+int test_predictor(const AnalysisConfig& config_in,
+                   Barrier* barrier = nullptr) {
   static std::mutex mutex;
+  AnalysisConfig config{config_in};
   std::unique_ptr<PaddlePredictor> predictor;
   {
     std::unique_lock<std::mutex> lock(mutex);
@@ -58,12 +60,50 @@ int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
   return 0;
 }
 
+int test_predictor_zero_copy(const AnalysisConfig& config_in,
+                             Barrier* barrier = nullptr) {
+  static std::mutex mutex;
+  AnalysisConfig config{config_in};
+  config.SwitchUseFeedFetchOps(false);
+  std::unique_ptr<PaddlePredictor> predictor;
+  {
+    std::unique_lock<std::mutex> lock(mutex);
+    predictor = std::move(CreatePaddlePredictor(config));
+  }
+  if (barrier) {
+    barrier->Wait();
+  }
+
+  std::vector<float> input({1});
+  auto in_tensor{predictor->GetInputTensor(predictor->GetInputNames().front())};
+  in_tensor->Reshape({1, 1});
+  in_tensor->copy_from_cpu(input.data());
+
+  predictor->ZeroCopyRun();
+
+  auto out_tensor{
+      predictor->GetOutputTensor(predictor->GetOutputNames().front())};
+  std::vector<float> data_o(10);
+  out_tensor->copy_to_cpu(data_o.data());
+
+  const std::vector<float> truth_values = {
+      -0.00621776f, -0.00620937f, 0.00990623f,  -0.0039817f, -0.00074315f,
+      0.61229795f,  -0.00491806f, -0.00068755f, 0.18409646f, 0.30090684f};
+  const size_t expected_size = 1;
+  EXPECT_EQ(predictor->GetOutputNames().size(), expected_size);
+  for (size_t j = 0; j < truth_values.size(); ++j) {
+    EXPECT_LT(std::abs(data_o[j] - truth_values[j]), 10e-6);
+  }
+  return 0;
+}
+
 #ifdef PADDLE_WITH_XPU
 TEST(AnalysisPredictor, native_xpu) {
   AnalysisConfig config;
   config.EnableXpu();
   config.SetModel(FLAGS_infer_model + "/" + "mul_model");
-  test_main(config);
+  test_predictor(config);
+  test_predictor_zero_copy(config);
 }
 #endif
 
@@ -73,6 +113,8 @@ TEST(AnalysisPredictor, lite_xpu) {
   config.EnableXpu();
   config.SetModel(FLAGS_infer_model + "/" + "mul_model");
   config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
+  test_predictor(config);
+  test_predictor_zero_copy(config);
 }
 #endif
 
@@ -87,7 +129,8 @@ TEST(AnalysisPredictor, thread_local_stream) {
       config.EnableUseGpu(100, 0);
      config.SetModel(FLAGS_infer_model + "/" + "mul_model");
      config.EnableGpuMultiStream();
-      test_main(config, &barrier);
+      test_predictor(config, &barrier);
+      test_predictor_zero_copy(config);
     });
   }
   for (auto& th : threads) {
@@ -100,7 +143,8 @@ TEST(AnalysisPredictor, lite_engine) {
   config.EnableUseGpu(100, 0);
   config.SetModel(FLAGS_infer_model + "/" + "mul_model");
   config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
-  test_main(config);
+  test_predictor(config);
+  test_predictor_zero_copy(config);
 }
 #endif
 
--
GitLab
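
For reference (not part of the patch): a minimal sketch of how the XPU + Lite zero-copy path exercised above can be driven from the C++ inference API. It mirrors the new test_predictor_zero_copy() test; the model directory "./mul_model" and the paddle_inference_api.h include path are assumptions that depend on how the inference library is built and installed.

#include <memory>
#include <vector>

#include "paddle/fluid/inference/api/paddle_inference_api.h"  // assumed include path

int main() {
  paddle::AnalysisConfig config;
  config.EnableXpu();  // target the XPU device
  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
  config.SetModel("./mul_model");       // hypothetical model directory
  config.SwitchUseFeedFetchOps(false);  // required for ZeroCopyTensor I/O

  auto predictor = paddle::CreatePaddlePredictor(config);

  // Feed a single float. With the lite engine enabled, the XPU tensors are
  // exposed through a CPU place, so copy_from_cpu() is a host-side copy.
  std::vector<float> input({1});
  auto in_tensor = predictor->GetInputTensor(predictor->GetInputNames().front());
  in_tensor->Reshape({1, 1});
  in_tensor->copy_from_cpu(input.data());

  predictor->ZeroCopyRun();

  // Fetch the ten output values back to host memory.
  std::vector<float> output(10);
  auto out_tensor = predictor->GetOutputTensor(predictor->GetOutputNames().front());
  out_tensor->copy_to_cpu(output.data());
  return 0;
}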