未验证 提交 99bd16eb 编写于 作者: 石晓伟 提交者: GitHub

bug fix of xpu lite engine, test=develop (#30918)

* bug fix of xpu lite engine, test=develop

* xpu zero copy tensor, test=develop

* revert paddle/fluid/inference/tests/api/CMakeLists.txt
上级 2e932338
...@@ -254,7 +254,29 @@ bool AnalysisPredictor::CreateExecutor() { ...@@ -254,7 +254,29 @@ bool AnalysisPredictor::CreateExecutor() {
} }
#endif #endif
} else if (config_.use_xpu()) { } else if (config_.use_xpu()) {
place_ = paddle::platform::XPUPlace(config_.xpu_device_id()); if (config_.lite_engine_enabled()) {
#ifdef LITE_SUBGRAPH_WITH_XPU
// Currently, Paddle-Lite's XPU user interface only supports the transfer
// of Host data pointers. If it is currently used as a subgraph, execution
// efficiency will be sacrificed, so it is temporarily set to cpu place.
// And, the current lite engine of xpu must execute all parts of the
// model.
place_ = paddle::platform::CPUPlace();
#else
PADDLE_THROW(platform::errors::Unavailable(
"You tried to use an XPU lite engine, but Paddle was not compiled "
"with it."));
#endif // LITE_SUBGRAPH_WITH_XPU
} else {
#ifdef PADDLE_WITH_XPU
place_ = paddle::platform::XPUPlace(config_.xpu_device_id());
#else
PADDLE_THROW(platform::errors::Unavailable(
"You tried to use XPU forward propagation (inference without lite "
"engine), but Paddle was not compiled "
"with WITH_XPU."));
#endif // PADDLE_WITH_XPU
}
} else { } else {
place_ = paddle::platform::CPUPlace(); place_ = paddle::platform::CPUPlace();
} }
...@@ -760,11 +782,17 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor( ...@@ -760,11 +782,17 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
if (platform::is_cpu_place(place_)) { if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU); res->SetPlace(PaddlePlace::kCPU);
} else if (platform::is_xpu_place(place_)) { } else if (platform::is_xpu_place(place_)) {
PADDLE_ENFORCE_EQ(config_.use_gpu(), false, if (config_.lite_engine_enabled()) {
platform::errors::InvalidArgument( // Currently, Paddle-Lite's XPU user interface only supports the transfer
"Only one choice can be made between CPU and XPU.")); // of host data pointers. If it is currently used as a subgraph, execution
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_); // efficiency will be sacrificed, so it is temporarily set to cpu place.
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId()); // And, the current lite engine of xpu must execute all parts of the
// model.
res->SetPlace(PaddlePlace::kCPU);
} else {
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
}
} else { } else {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_); auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
...@@ -786,8 +814,17 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor( ...@@ -786,8 +814,17 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
if (platform::is_cpu_place(place_)) { if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU); res->SetPlace(PaddlePlace::kCPU);
} else if (platform::is_xpu_place(place_)) { } else if (platform::is_xpu_place(place_)) {
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_); if (config_.lite_engine_enabled()) {
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId()); // Currently, Paddle-Lite's XPU user interface only supports the transfer
// of host data pointers. If it is currently used as a subgraph, execution
// efficiency will be sacrificed, so it is temporarily set to cpu place.
// And, the current lite engine of xpu must execute all parts of the
// model.
res->SetPlace(PaddlePlace::kCPU);
} else {
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
}
} else { } else {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_); auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
......
...@@ -115,7 +115,7 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) { ...@@ -115,7 +115,7 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
if (place_ == PaddlePlace::kCPU) { if (place_ == PaddlePlace::kCPU) {
auto *t_data = tensor->mutable_data<T>(platform::CPUPlace()); auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
std::memcpy(static_cast<void *>(t_data), data, ele_size); std::memcpy(static_cast<void *>(t_data), data, ele_size);
} else { } else if (place_ == PaddlePlace::kGPU) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
platform::CUDAPlace gpu_place(device_); platform::CUDAPlace gpu_place(device_);
...@@ -129,6 +129,19 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) { ...@@ -129,6 +129,19 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
"Not compiled with CUDA, should not reach here.")); "Not compiled with CUDA, should not reach here."));
#endif #endif
} else if (place_ == PaddlePlace::kXPU) {
#ifdef PADDLE_WITH_XPU
platform::XPUPlace xpu_place(device_);
auto *t_data = tensor->mutable_data<T>(xpu_place);
memory::Copy(xpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
data, ele_size);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Not compiled with XPU, should not reach here."));
#endif
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"The analysis predictor supports CPU, GPU and XPU now."));
} }
} }
...@@ -141,7 +154,7 @@ void ZeroCopyTensor::copy_to_cpu(T *data) { ...@@ -141,7 +154,7 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
if (platform::is_cpu_place(t_place)) { if (platform::is_cpu_place(t_place)) {
std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T)); std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
} else { } else if (place_ == PaddlePlace::kGPU) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, t_place); auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, t_place);
...@@ -155,6 +168,18 @@ void ZeroCopyTensor::copy_to_cpu(T *data) { ...@@ -155,6 +168,18 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
"Not compile with CUDA, should not reach here.")); "Not compile with CUDA, should not reach here."));
#endif #endif
} else if (place_ == PaddlePlace::kXPU) {
#ifdef PADDLE_WITH_XPU
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, t_place);
memory::Copy(platform::CPUPlace(), static_cast<void *>(data), xpu_place,
t_data, ele_num * sizeof(T));
#else
PADDLE_THROW(platform::errors::Unavailable(
"Not compile with XPU, should not reach here."));
#endif
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"The analysis predictor supports CPU, GPU and XPU now."));
} }
} }
template PD_INFER_DECL void ZeroCopyTensor::copy_from_cpu<float>( template PD_INFER_DECL void ZeroCopyTensor::copy_from_cpu<float>(
......
...@@ -197,9 +197,9 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -197,9 +197,9 @@ struct PD_INFER_DECL AnalysisConfig {
/// ///
int gpu_device_id() const { return gpu_device_id_; } int gpu_device_id() const { return gpu_device_id_; }
/// ///
/// \brief Get the GPU device id. /// \brief Get the XPU device id.
/// ///
/// \return int The GPU device id. /// \return int The XPU device id.
/// ///
int xpu_device_id() const { return xpu_device_id_; } int xpu_device_id() const { return xpu_device_id_; }
/// ///
......
...@@ -24,8 +24,10 @@ limitations under the License. */ ...@@ -24,8 +24,10 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace inference { namespace inference {
int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) { int test_predictor(const AnalysisConfig& config_in,
Barrier* barrier = nullptr) {
static std::mutex mutex; static std::mutex mutex;
AnalysisConfig config{config_in};
std::unique_ptr<PaddlePredictor> predictor; std::unique_ptr<PaddlePredictor> predictor;
{ {
std::unique_lock<std::mutex> lock(mutex); std::unique_lock<std::mutex> lock(mutex);
...@@ -58,12 +60,50 @@ int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) { ...@@ -58,12 +60,50 @@ int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
return 0; return 0;
} }
// Runs one inference through the zero-copy tensor interface and compares the
// first output tensor with hard-coded reference values.
//
// config_in: predictor configuration. It is copied so that feed/fetch ops can
//            be switched off without modifying the caller's object.
// barrier:   optional; when non-null, Wait() is called on it after the
//            predictor has been created and before any tensor is touched.
// Returns 0 unconditionally; mismatches are reported via gtest EXPECT_* macros.
int test_predictor_zero_copy(const AnalysisConfig& config_in,
                             Barrier* barrier = nullptr) {
  static std::mutex mutex;
  AnalysisConfig config{config_in};
  config.SwitchUseFeedFetchOps(false);
  std::unique_ptr<PaddlePredictor> predictor;
  {
    // Predictor creation is serialized across callers of this function.
    std::unique_lock<std::mutex> lock(mutex);
    // The factory result is already a temporary; wrapping it in std::move is
    // redundant and only triggers pessimizing-move diagnostics.
    predictor = CreatePaddlePredictor(config);
  }
  if (barrier) {
    barrier->Wait();
  }

  // Feed a single scalar as a 1x1 input.
  std::vector<float> input({1});
  // Copy-initialization keeps `auto` deducing the tensor handle itself on
  // every compiler, instead of a std::initializer_list on pre-N3922 ones.
  auto in_tensor = predictor->GetInputTensor(predictor->GetInputNames().front());
  in_tensor->Reshape({1, 1});
  in_tensor->copy_from_cpu(input.data());
  predictor->ZeroCopyRun();

  const std::vector<float> truth_values = {
      -0.00621776f, -0.00620937f, 0.00990623f, -0.0039817f, -0.00074315f,
      0.61229795f,  -0.00491806f, -0.00068755f, 0.18409646f, 0.30090684f};

  auto out_tensor =
      predictor->GetOutputTensor(predictor->GetOutputNames().front());
  // The host buffer length is tied to the number of reference values (10)
  // rather than repeating the literal.
  // NOTE(review): assumes the output tensor holds no more elements than this;
  // confirm against the model's output shape.
  std::vector<float> data_o(truth_values.size());
  out_tensor->copy_to_cpu(data_o.data());

  const size_t expected_size = 1;
  EXPECT_EQ(predictor->GetOutputNames().size(), expected_size);
  for (size_t j = 0; j < truth_values.size(); ++j) {
    EXPECT_LT(std::abs(data_o[j] - truth_values[j]), 10e-6);
  }
  return 0;
}
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
TEST(AnalysisPredictor, native_xpu) { TEST(AnalysisPredictor, native_xpu) {
AnalysisConfig config; AnalysisConfig config;
config.EnableXpu(); config.EnableXpu();
config.SetModel(FLAGS_infer_model + "/" + "mul_model"); config.SetModel(FLAGS_infer_model + "/" + "mul_model");
test_main(config); test_predictor(config);
test_predictor_zero_copy(config);
} }
#endif #endif
...@@ -73,6 +113,8 @@ TEST(AnalysisPredictor, lite_xpu) { ...@@ -73,6 +113,8 @@ TEST(AnalysisPredictor, lite_xpu) {
config.EnableXpu(); config.EnableXpu();
config.SetModel(FLAGS_infer_model + "/" + "mul_model"); config.SetModel(FLAGS_infer_model + "/" + "mul_model");
config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32); config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
test_predictor(config);
test_predictor_zero_copy(config);
} }
#endif #endif
...@@ -87,7 +129,8 @@ TEST(AnalysisPredictor, thread_local_stream) { ...@@ -87,7 +129,8 @@ TEST(AnalysisPredictor, thread_local_stream) {
config.EnableUseGpu(100, 0); config.EnableUseGpu(100, 0);
config.SetModel(FLAGS_infer_model + "/" + "mul_model"); config.SetModel(FLAGS_infer_model + "/" + "mul_model");
config.EnableGpuMultiStream(); config.EnableGpuMultiStream();
test_main(config, &barrier); test_predictor(config, &barrier);
test_predictor_zero_copy(config);
}); });
} }
for (auto& th : threads) { for (auto& th : threads) {
...@@ -100,7 +143,8 @@ TEST(AnalysisPredictor, lite_engine) { ...@@ -100,7 +143,8 @@ TEST(AnalysisPredictor, lite_engine) {
config.EnableUseGpu(100, 0); config.EnableUseGpu(100, 0);
config.SetModel(FLAGS_infer_model + "/" + "mul_model"); config.SetModel(FLAGS_infer_model + "/" + "mul_model");
config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32); config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
test_main(config); test_predictor(config);
test_predictor_zero_copy(config);
} }
#endif #endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册