From 3b18d96b576862d41c023f97461acc4f15614bb0 Mon Sep 17 00:00:00 2001 From: james Date: Fri, 18 Nov 2022 14:22:12 +0800 Subject: [PATCH] fix device id issue for xpu eager mode (#48076) * fix device id issue for xpu eager xpu device id is not correctly set in eager mode, thus vars are on dev0 unless XPUDeviceGuard is called, leading to this error message for all node rank != 0: "NotImplementedError: (Unimplemented) Place Place(xpu:0) is not supported." * fix typo * fix pybind error --- paddle/fluid/distributed/collective/ProcessGroupBKCL.cc | 1 + .../eager/auto_code_generator/generator/python_c_gen.py | 9 +++++++++ paddle/fluid/pybind/distributed_py.cc | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc index a5c80cb0410..8dfb65d9813 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc @@ -105,6 +105,7 @@ void ProcessGroupBKCL::BroadcastUniqueBKCLID(BKCLUniqueId* bkcl_id) { void ProcessGroupBKCL::CreateBKCLEnvCache(const Place& place, const std::string& place_key) { + platform::XPUDeviceGuard guard(place.GetDeviceId()); BKCLUniqueId bkcl_id; if (rank_ == 0) { PADDLE_ENFORCE_XPU_SUCCESS(bkcl_get_unique_id(&bkcl_id)); diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index 8e3944b79c3..aacde58fa7b 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -128,6 +128,15 @@ FUNCTION_SET_DEVICE_TEMPLATE = """{} if (paddle::platform::is_gpu_place(place #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( "PaddlePaddle should compile with CUSTOM_DEVICE if use CustomPlace.")); +#endif + }} + if (paddle::platform::is_xpu_place(place)) {{ +#if defined(PADDLE_WITH_XPU) 
phi::backends::xpu::SetXPUDeviceId(place.device); + VLOG(4) <<"CurrentDeviceId: " << phi::backends::xpu::GetXPUCurrentDeviceId() << " from " << (int)place.device; +#else + PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with XPU if use XPUPlace.")); #endif }} """ diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index dbc4c57c656..52160ea99a0 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -1284,7 +1284,7 @@ void BindDistributed(py::module *m) { auto processGroupBKCL = py::class_>( - *m, "ProcessGroupBKCL", ProcessGroup) + *m, "ProcessGroupBKCL", ProcessGroupStream) .def(py::init &, int, int, -- GitLab