diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 2ba7ee26940c72257b1c9e35199f68b496dc8a87..536f0b7407fc3239d37a8df21a8820f2052478e6 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -254,7 +254,29 @@ bool AnalysisPredictor::CreateExecutor() {
     }
 #endif
   } else if (config_.use_xpu()) {
-    place_ = paddle::platform::XPUPlace(config_.xpu_device_id());
+    if (config_.lite_engine_enabled()) {
+#ifdef LITE_SUBGRAPH_WITH_XPU
+      // Currently, Paddle-Lite's XPU user interface only supports the
+      // transfer of host data pointers. If it is used as a subgraph,
+      // execution efficiency will be sacrificed, so the place is temporarily
+      // set to CPU. In addition, the current XPU Lite engine must execute
+      // all parts of the model.
+      place_ = paddle::platform::CPUPlace();
+#else
+      PADDLE_THROW(platform::errors::Unavailable(
+          "You tried to use an XPU lite engine, but Paddle was not compiled "
+          "with it."));
+#endif  // LITE_SUBGRAPH_WITH_XPU
+    } else {
+#ifdef PADDLE_WITH_XPU
+      place_ = paddle::platform::XPUPlace(config_.xpu_device_id());
+#else
+      PADDLE_THROW(platform::errors::Unavailable(
+          "You tried to use XPU forward propagation (inference without lite "
+          "engine), but Paddle was not compiled "
+          "with WITH_XPU."));
+#endif  // PADDLE_WITH_XPU
+    }
   } else {
     place_ = paddle::platform::CPUPlace();
   }
@@ -767,11 +789,17 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
   if (platform::is_cpu_place(place_)) {
     res->SetPlace(PaddlePlace::kCPU);
   } else if (platform::is_xpu_place(place_)) {
-    PADDLE_ENFORCE_EQ(config_.use_gpu(), false,
-                      platform::errors::InvalidArgument(
-                          "Only one choice can be made between CPU and XPU."));
-    auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
-    res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
+    if (config_.lite_engine_enabled()) {
+      // Currently, Paddle-Lite's XPU user interface only supports the
+      // transfer of host data pointers. If it is used as a subgraph,
+      // execution efficiency will be sacrificed, so the place is temporarily
+      // set to CPU. In addition, the current XPU Lite engine must execute
+      // all parts of the model.
+      res->SetPlace(PaddlePlace::kCPU);
+    } else {
+      auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
+      res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
+    }
   } else {
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
     res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
@@ -793,8 +821,17 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
   if (platform::is_cpu_place(place_)) {
     res->SetPlace(PaddlePlace::kCPU);
   } else if (platform::is_xpu_place(place_)) {
-    auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
-    res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
+    if (config_.lite_engine_enabled()) {
+      // Currently, Paddle-Lite's XPU user interface only supports the
+      // transfer of host data pointers. If it is used as a subgraph,
+      // execution efficiency will be sacrificed, so the place is temporarily
+      // set to CPU. In addition, the current XPU Lite engine must execute
+      // all parts of the model.
+      res->SetPlace(PaddlePlace::kCPU);
+    } else {
+      auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
+      res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
+    }
   } else {
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
     res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index 46755eeda660ae8f4c54d318f6450fbf1d48b1f7..f44530019ea0091100e3481e80bda2f801134d31 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -115,7 +115,7 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
   if (place_ == PaddlePlace::kCPU) {
     auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
     std::memcpy(static_cast<void *>(t_data), data, ele_size);
-  } else {
+  } else if (place_ == PaddlePlace::kGPU) {
 #ifdef PADDLE_WITH_CUDA
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     platform::CUDAPlace gpu_place(device_);
@@ -129,6 +129,19 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
     PADDLE_THROW(platform::errors::Unavailable(
         "Not compiled with CUDA, should not reach here."));
 #endif
+  } else if (place_ == PaddlePlace::kXPU) {
+#ifdef PADDLE_WITH_XPU
+    platform::XPUPlace xpu_place(device_);
+    auto *t_data = tensor->mutable_data<T>(xpu_place);
+    memory::Copy(xpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
+                 data, ele_size);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Not compiled with XPU, should not reach here."));
+#endif
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "The analysis predictor supports CPU, GPU and XPU now."));
   }
 }
 
@@ -141,7 +154,7 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
 
   if (platform::is_cpu_place(t_place)) {
     std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
-  } else {
+  } else if (place_ == PaddlePlace::kGPU) {
 #ifdef PADDLE_WITH_CUDA
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, t_place);
@@ -155,6 +168,18 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
     PADDLE_THROW(platform::errors::Unavailable(
         "Not compile with CUDA, should not reach here."));
 #endif
+  } else if (place_ == PaddlePlace::kXPU) {
+#ifdef PADDLE_WITH_XPU
+    auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, t_place);
+    memory::Copy(platform::CPUPlace(), static_cast<void *>(data), xpu_place,
+                 t_data, ele_num * sizeof(T));
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Not compiled with XPU, should not reach here."));
+#endif
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "The analysis predictor supports CPU, GPU and XPU now."));
   }
 }
 template PD_INFER_DECL void ZeroCopyTensor::copy_from_cpu<float>(
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index c892284d91fec211ac35d45662fe1cdc46d98d3e..e492b32cb6cbefcc121b616450170e5cc22bb913 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -197,9 +197,9 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   int gpu_device_id() const { return gpu_device_id_; }
   ///
-  /// \brief Get the GPU device id.
+  /// \brief Get the XPU device id.
   ///
-  /// \return int The GPU device id.
+  /// \return int The XPU device id.
   ///
   int xpu_device_id() const { return xpu_device_id_; }
   ///
diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
index 9c1ad79942d00d46b69f22bc297c5a6f17b4ff6e..23bcd40af80184475f28a15770b3f1a40c97cb32 100644
--- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
@@ -24,8 +24,10 @@ limitations under the License. */
 namespace paddle {
 namespace inference {
 
-int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
+int test_predictor(const AnalysisConfig& config_in,
+                   Barrier* barrier = nullptr) {
   static std::mutex mutex;
+  AnalysisConfig config{config_in};
   std::unique_ptr<PaddlePredictor> predictor;
   {
     std::unique_lock<std::mutex> lock(mutex);
@@ -58,12 +60,50 @@
   return 0;
 }
 
+int test_predictor_zero_copy(const AnalysisConfig& config_in,
+                             Barrier* barrier = nullptr) {
+  static std::mutex mutex;
+  AnalysisConfig config{config_in};
+  config.SwitchUseFeedFetchOps(false);
+  std::unique_ptr<PaddlePredictor> predictor;
+  {
+    std::unique_lock<std::mutex> lock(mutex);
+    predictor = std::move(CreatePaddlePredictor(config));
+  }
+  if (barrier) {
+    barrier->Wait();
+  }
+
+  std::vector<float> input({1});
+  auto in_tensor{predictor->GetInputTensor(predictor->GetInputNames().front())};
+  in_tensor->Reshape({1, 1});
+  in_tensor->copy_from_cpu(input.data());
+
+  predictor->ZeroCopyRun();
+
+  auto out_tensor{
+      predictor->GetOutputTensor(predictor->GetOutputNames().front())};
+  std::vector<float> data_o(10);
+  out_tensor->copy_to_cpu(data_o.data());
+
+  const std::vector<float> truth_values = {
+      -0.00621776f, -0.00620937f, 0.00990623f,  -0.0039817f, -0.00074315f,
+      0.61229795f,  -0.00491806f, -0.00068755f, 0.18409646f, 0.30090684f};
+  const size_t expected_size = 1;
+  EXPECT_EQ(predictor->GetOutputNames().size(), expected_size);
+  for (size_t j = 0; j < truth_values.size(); ++j) {
+    EXPECT_LT(std::abs(data_o[j] - truth_values[j]), 10e-6);
+  }
+  return 0;
+}
+
 #ifdef PADDLE_WITH_XPU
 TEST(AnalysisPredictor, native_xpu) {
   AnalysisConfig config;
   config.EnableXpu();
   config.SetModel(FLAGS_infer_model + "/" + "mul_model");
-  test_main(config);
+  test_predictor(config);
+  test_predictor_zero_copy(config);
 }
 #endif
 
@@ -73,6 +113,8 @@ TEST(AnalysisPredictor, lite_xpu) {
   config.EnableXpu();
   config.SetModel(FLAGS_infer_model + "/" + "mul_model");
   config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
+  test_predictor(config);
+  test_predictor_zero_copy(config);
 }
 #endif
 
@@ -87,7 +129,8 @@ TEST(AnalysisPredictor, thread_local_stream) {
       config.EnableUseGpu(100, 0);
       config.SetModel(FLAGS_infer_model + "/" + "mul_model");
       config.EnableGpuMultiStream();
-      test_main(config, &barrier);
+      test_predictor(config, &barrier);
+      test_predictor_zero_copy(config);
     });
   }
   for (auto& th : threads) {
@@ -100,7 +143,8 @@ TEST(AnalysisPredictor, lite_engine) {
   config.EnableUseGpu(100, 0);
   config.SetModel(FLAGS_infer_model + "/" + "mul_model");
   config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
-  test_main(config);
+  test_predictor(config);
+  test_predictor_zero_copy(config);
 }
 #endif
 
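
For reviewers, the zero-copy XPU path exercised by `test_predictor_zero_copy` above can be driven from application code roughly as follows. This is a minimal sketch based on the test in this patch, not part of the change itself; the model path, input shape, and output size are placeholders borrowed from the `mul_model` test, and error handling is omitted.

```cpp
#include <vector>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  // Configure an XPU predictor; SwitchUseFeedFetchOps(false) is required
  // for the zero-copy tensor API used below.
  paddle::AnalysisConfig config;
  config.EnableXpu();
  config.SetModel("path/to/mul_model");  // placeholder model directory
  config.SwitchUseFeedFetchOps(false);

  auto predictor = paddle::CreatePaddlePredictor(config);

  // Feed one float through the zero-copy input tensor. copy_from_cpu()
  // performs the host -> XPU copy added in zero_copy_tensor.cc.
  std::vector<float> input{1.0f};
  auto in_tensor =
      predictor->GetInputTensor(predictor->GetInputNames().front());
  in_tensor->Reshape({1, 1});
  in_tensor->copy_from_cpu(input.data());

  predictor->ZeroCopyRun();

  // Fetch the result back to host memory via the new XPU -> host copy.
  auto out_tensor =
      predictor->GetOutputTensor(predictor->GetOutputNames().front());
  std::vector<float> output(10);
  out_tensor->copy_to_cpu(output.data());
  return 0;
}
```

Note that when the Lite engine is enabled (`EnableLiteEngine`), the patch keeps the predictor's place on CPU, so the same zero-copy calls work unchanged while Paddle-Lite manages the XPU execution internally.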