未验证 提交 fa0c0fb2 编写于 作者: 石晓伟 提交者: GitHub

bug fix of xpu lite engine, test=develop (#30918) (#31046)

* bug fix of xpu lite engine, test=develop

* xpu zero copy tensor, test=develop

* revert paddle/fluid/inference/tests/api/CMakeLists.txt
上级 2072c64d
......@@ -254,7 +254,29 @@ bool AnalysisPredictor::CreateExecutor() {
}
#endif
} else if (config_.use_xpu()) {
place_ = paddle::platform::XPUPlace(config_.xpu_device_id());
if (config_.lite_engine_enabled()) {
#ifdef LITE_SUBGRAPH_WITH_XPU
// Currently, Paddle-Lite's XPU user interface only supports the transfer
// of Host data pointers. If it is currently used as a subgraph, execution
// efficiency will be sacrificed, so it is temporarily set to cpu place.
// And, the current lite engine of xpu must execute all parts of the
// model.
place_ = paddle::platform::CPUPlace();
#else
PADDLE_THROW(platform::errors::Unavailable(
"You tried to use an XPU lite engine, but Paddle was not compiled "
"with it."));
#endif // LITE_SUBGRAPH_WITH_XPU
} else {
#ifdef PADDLE_WITH_XPU
place_ = paddle::platform::XPUPlace(config_.xpu_device_id());
#else
PADDLE_THROW(platform::errors::Unavailable(
"You tried to use XPU forward propagation (inference without lite "
"engine), but Paddle was not compiled "
"with WITH_XPU."));
#endif // PADDLE_WITH_XPU
}
} else {
place_ = paddle::platform::CPUPlace();
}
......@@ -767,11 +789,17 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else if (platform::is_xpu_place(place_)) {
PADDLE_ENFORCE_EQ(config_.use_gpu(), false,
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
if (config_.lite_engine_enabled()) {
// Currently, Paddle-Lite's XPU user interface only supports the transfer
// of host data pointers. If it is currently used as a subgraph, execution
// efficiency will be sacrificed, so it is temporarily set to cpu place.
// And, the current lite engine of xpu must execute all parts of the
// model.
res->SetPlace(PaddlePlace::kCPU);
} else {
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
}
} else {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
......@@ -793,8 +821,17 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else if (platform::is_xpu_place(place_)) {
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
if (config_.lite_engine_enabled()) {
// Currently, Paddle-Lite's XPU user interface only supports the transfer
// of host data pointers. If it is currently used as a subgraph, execution
// efficiency will be sacrificed, so it is temporarily set to cpu place.
// And, the current lite engine of xpu must execute all parts of the
// model.
res->SetPlace(PaddlePlace::kCPU);
} else {
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
}
} else {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
......
......@@ -115,7 +115,7 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
if (place_ == PaddlePlace::kCPU) {
auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
std::memcpy(static_cast<void *>(t_data), data, ele_size);
} else {
} else if (place_ == PaddlePlace::kGPU) {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
platform::CUDAPlace gpu_place(device_);
......@@ -129,6 +129,19 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
PADDLE_THROW(platform::errors::Unavailable(
"Not compiled with CUDA, should not reach here."));
#endif
} else if (place_ == PaddlePlace::kXPU) {
#ifdef PADDLE_WITH_XPU
platform::XPUPlace xpu_place(device_);
auto *t_data = tensor->mutable_data<T>(xpu_place);
memory::Copy(xpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
data, ele_size);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Not compiled with XPU, should not reach here."));
#endif
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"The analysis predictor supports CPU, GPU and XPU now."));
}
}
......@@ -141,7 +154,7 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
if (platform::is_cpu_place(t_place)) {
std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
} else {
} else if (place_ == PaddlePlace::kGPU) {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, t_place);
......@@ -155,6 +168,18 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
PADDLE_THROW(platform::errors::Unavailable(
"Not compile with CUDA, should not reach here."));
#endif
} else if (place_ == PaddlePlace::kXPU) {
#ifdef PADDLE_WITH_XPU
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, t_place);
memory::Copy(platform::CPUPlace(), static_cast<void *>(data), xpu_place,
t_data, ele_num * sizeof(T));
#else
PADDLE_THROW(platform::errors::Unavailable(
"Not compile with XPU, should not reach here."));
#endif
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"The analysis predictor supports CPU, GPU and XPU now."));
}
}
template PD_INFER_DECL void ZeroCopyTensor::copy_from_cpu<float>(
......
......@@ -197,9 +197,9 @@ struct PD_INFER_DECL AnalysisConfig {
///
int gpu_device_id() const { return gpu_device_id_; }
///
/// \brief Get the GPU device id.
/// \brief Get the XPU device id.
///
/// \return int The GPU device id.
/// \return int The XPU device id.
///
int xpu_device_id() const { return xpu_device_id_; }
///
......
......@@ -24,8 +24,10 @@ limitations under the License. */
namespace paddle {
namespace inference {
int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
int test_predictor(const AnalysisConfig& config_in,
Barrier* barrier = nullptr) {
static std::mutex mutex;
AnalysisConfig config{config_in};
std::unique_ptr<PaddlePredictor> predictor;
{
std::unique_lock<std::mutex> lock(mutex);
......@@ -58,12 +60,50 @@ int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
return 0;
}
/// \brief Runs the model through the zero-copy tensor interface and checks
/// the output against reference values.
///
/// A private copy of \p config_in is made so that feed/fetch ops can be
/// switched off without touching the caller's config. A single float (1) is
/// fed as a {1, 1} tensor, ZeroCopyRun() is invoked, and the first output is
/// copied back to the host and compared element-wise with the reference.
///
/// \param config_in Predictor configuration; copied, never modified.
/// \param barrier Optional barrier; when non-null it is waited on after the
///        predictor has been created.
/// \return Always 0; mismatches are reported through gtest EXPECT_* macros.
int test_predictor_zero_copy(const AnalysisConfig& config_in,
                             Barrier* barrier = nullptr) {
  static std::mutex mutex;
  AnalysisConfig config{config_in};
  config.SwitchUseFeedFetchOps(false);

  std::unique_ptr<PaddlePredictor> predictor;
  {
    // Predictor creation is done under a function-wide lock.
    std::lock_guard<std::mutex> lock(mutex);
    // The factory already yields a temporary; wrapping it in std::move is
    // redundant.
    predictor = CreatePaddlePredictor(config);
  }
  if (barrier) {
    barrier->Wait();
  }

  std::vector<float> input({1});
  auto in_tensor{predictor->GetInputTensor(predictor->GetInputNames().front())};
  in_tensor->Reshape({1, 1});
  in_tensor->copy_from_cpu(input.data());
  predictor->ZeroCopyRun();

  // Validate the number of outputs before the first one is fetched.
  const size_t expected_size = 1;
  EXPECT_EQ(predictor->GetOutputNames().size(), expected_size);

  auto out_tensor{
      predictor->GetOutputTensor(predictor->GetOutputNames().front())};
  const std::vector<float> truth_values = {
      -0.00621776f, -0.00620937f, 0.00990623f, -0.0039817f, -0.00074315f,
      0.61229795f,  -0.00491806f, -0.00068755f, 0.18409646f, 0.30090684f};
  // NOTE(review): the host buffer is sized to the reference vector; this
  // assumes the model output holds exactly that many floats — confirm.
  std::vector<float> data_o(truth_values.size());
  out_tensor->copy_to_cpu(data_o.data());

  for (size_t j = 0; j < truth_values.size(); ++j) {
    // Tolerance literal kept as written: 10e-6 equals 1e-5.
    EXPECT_LT(std::abs(data_o[j] - truth_values[j]), 10e-6);
  }
  return 0;
}
#ifdef PADDLE_WITH_XPU
// Runs the predictor helpers on mul_model with XPU enabled and without
// enabling the lite engine.
// NOTE(review): both test_main(config) and test_predictor(config) are listed
// below; this text looks like a flattened diff, so confirm which call is the
// current one before relying on it.
TEST(AnalysisPredictor, native_xpu) {
AnalysisConfig config;
config.EnableXpu();
config.SetModel(FLAGS_infer_model + "/" + "mul_model");
test_main(config);
test_predictor(config);
// Same config again, this time through the zero-copy tensor path.
test_predictor_zero_copy(config);
}
#endif
......@@ -73,6 +113,8 @@ TEST(AnalysisPredictor, lite_xpu) {
config.EnableXpu();
config.SetModel(FLAGS_infer_model + "/" + "mul_model");
config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
test_predictor(config);
test_predictor_zero_copy(config);
}
#endif
......@@ -87,7 +129,8 @@ TEST(AnalysisPredictor, thread_local_stream) {
config.EnableUseGpu(100, 0);
config.SetModel(FLAGS_infer_model + "/" + "mul_model");
config.EnableGpuMultiStream();
test_main(config, &barrier);
test_predictor(config, &barrier);
test_predictor_zero_copy(config);
});
}
for (auto& th : threads) {
......@@ -100,7 +143,8 @@ TEST(AnalysisPredictor, lite_engine) {
config.EnableUseGpu(100, 0);
config.SetModel(FLAGS_infer_model + "/" + "mul_model");
config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
test_main(config);
test_predictor(config);
test_predictor_zero_copy(config);
}
#endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册