From 3b18d96b576862d41c023f97461acc4f15614bb0 Mon Sep 17 00:00:00 2001 From: james Date: Fri, 18 Nov 2022 14:22:12 +0800 Subject: [PATCH] fix device id issue for xpu eager mode (#48076) * fix device id issue for xpu eager xpu device id is not correctly set in eager mode, thus vars are on dev0 unless XPUDeviceGurad is called, leading to this error message for all node rank != 0: "NotImplementedError: (Unimplemented) Place Place(xpu:0) is not supported." * fix typo * fix pybind error --- paddle/fluid/distributed/collective/ProcessGroupBKCL.cc | 1 + .../eager/auto_code_generator/generator/python_c_gen.py | 9 +++++++++ paddle/fluid/pybind/distributed_py.cc | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc index a5c80cb041..8dfb65d981 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc @@ -105,6 +105,7 @@ void ProcessGroupBKCL::BroadcastUniqueBKCLID(BKCLUniqueId* bkcl_id) { void ProcessGroupBKCL::CreateBKCLEnvCache(const Place& place, const std::string& place_key) { + platform::XPUDeviceGuard guard(place.GetDeviceId()); BKCLUniqueId bkcl_id; if (rank_ == 0) { PADDLE_ENFORCE_XPU_SUCCESS(bkcl_get_unique_id(&bkcl_id)); diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index 8e3944b79c..aacde58fa7 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -128,6 +128,15 @@ FUNCTION_SET_DEVICE_TEMPLATE = """{} if (paddle::platform::is_gpu_place(place #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( "PaddlePaddle should compile with CUSTOM_DEVICE if use CustomPlace.")); +#endif + }} + if (paddle::platform::is_xpu_place(place)) {{ +#if defined(PADDLE_WITH_XPU) + phi::backends::xpu::SetXPUDeviceId(place.device); + VLOG(4) <<"CurrentDeviceId: " << phi::backends::xpu::GetXPUCurrentDeviceId() << " from " << (int)place.device; +#else + PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with XPU if use XPUPlace.")); #endif }} """ diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index dbc4c57c65..52160ea99a 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -1284,7 +1284,7 @@ void BindDistributed(py::module *m) { auto processGroupBKCL = py::class_>( - *m, "ProcessGroupBKCL", ProcessGroup) + *m, "ProcessGroupBKCL", ProcessGroupStream) .def(py::init &, int, int, -- GitLab