From 7a705727fb1b6677cd47c697ea35507808867e8a Mon Sep 17 00:00:00 2001 From: ronnywang Date: Wed, 12 Jul 2023 13:07:36 +0800 Subject: [PATCH] [CustomDevice] fix release error in process_group_custom (#55293) * [CustomDevice] fix release error for process_group_custom * update --- .../distributed/collective/custom_ccl_tools.h | 2 +- paddle/fluid/pybind/pybind.cc | 2 +- paddle/phi/backends/custom/custom_context.cc | 2 +- paddle/phi/backends/device_manager.cc | 9 ++++--- paddle/phi/backends/device_manager.h | 2 +- paddle/phi/backends/event.cc | 26 ++++++++++++++++--- paddle/phi/backends/event.h | 2 ++ paddle/phi/backends/stream.cc | 25 +++++++++++++++--- paddle/phi/backends/stream.h | 2 ++ 9 files changed, 57 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/distributed/collective/custom_ccl_tools.h b/paddle/fluid/distributed/collective/custom_ccl_tools.h index d3ebc639a3c..4388c607548 100644 --- a/paddle/fluid/distributed/collective/custom_ccl_tools.h +++ b/paddle/fluid/distributed/collective/custom_ccl_tools.h @@ -148,7 +148,7 @@ class CustomCCLCommManager { ~CustomCCLCommManager() noexcept { std::unique_lock lock(mutex_); - if (ccl_comm_) { + if (phi::DeviceManager::HasDeviceType(device_type_) && ccl_comm_) { phi::DeviceManager::CCLDestroyComm(device_type_, ccl_comm_); } } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1efbf2a1a1e..c4036944bc1 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1021,7 +1021,7 @@ PYBIND11_MODULE(libpaddle, m) { platform::CustomTracer::Release(); platform::CustomDeviceEventResourcePool::Release(); platform::CustomDeviceStreamResourcePool::Release(); - phi::DeviceManager::Clear(); + phi::DeviceManager::Release(); #endif }); diff --git a/paddle/phi/backends/custom/custom_context.cc b/paddle/phi/backends/custom/custom_context.cc index ddba0baea7e..a4139f77e96 100644 --- a/paddle/phi/backends/custom/custom_context.cc +++ b/paddle/phi/backends/custom/custom_context.cc 
@@ -76,7 +76,7 @@ CustomContext::CustomContext(const CustomPlace& place) impl_->Init(); } -CustomContext::~CustomContext() { impl_->Init(); } +CustomContext::~CustomContext() { impl_.reset(); } phi::ccl::CCLComm CustomContext::xccl_comm() const { return impl_->xccl_comm(); diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc index c02ec991241..cf7b6ec0cbc 100644 --- a/paddle/phi/backends/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -343,8 +343,9 @@ std::vector DeviceManager::GetAllCustomDeviceList() { } bool DeviceManager::HasDeviceType(const std::string& device_type) { - auto dev_impl = GetDeviceInterfaceWithType(device_type); - return dev_impl != nullptr; + phi::AutoRDLock lock(&_global_device_manager_rw_lock); + auto& dev_impl_map = Instance().device_impl_map_; + return dev_impl_map.find(device_type) != dev_impl_map.end(); } bool DeviceManager::IsCustom(const std::string& device_type) { @@ -670,7 +671,9 @@ DeviceManager& DeviceManager::Instance() { return platform_manager; } -void DeviceManager::Clear() { +void DeviceManager::Release() { + stream::Stream::ReleaseAll(); + event::Event::ReleaseAll(); Instance().device_map_.clear(); Instance().device_impl_map_.clear(); } diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h index 52496f54647..327de2365ba 100644 --- a/paddle/phi/backends/device_manager.h +++ b/paddle/phi/backends/device_manager.h @@ -273,7 +273,7 @@ class DeviceManager { uint64_t start_ns, void* context); - static void Clear(); + static void Release(); private: DISABLE_COPY_AND_ASSIGN(DeviceManager); diff --git a/paddle/phi/backends/event.cc b/paddle/phi/backends/event.cc index 371e858a3fe..2a87865623a 100644 --- a/paddle/phi/backends/event.cc +++ b/paddle/phi/backends/event.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include <list> + #include "paddle/phi/backends/event.h" #include "glog/logging.h" @@ -22,6 +24,14 @@ namespace phi { namespace event { +std::list<Event*> g_events; + +void Event::ReleaseAll() { + for (auto* event : g_events) { + event->Destroy(); + } +} + event_t Event::raw_event() const { return event_; } void Event::set_event(event_t event) { event_ = event; } @@ -32,7 +42,10 @@ Event::Event(const Place& place, event_t event) event_(event), own_data_(false) {} -Event::~Event() { Destroy(); } +Event::~Event() { + g_events.remove(this); + Destroy(); +} bool Event::Init(const Place& place, Flag flags) { place_ = place; @@ -45,14 +58,19 @@ bool Event::Init(const Place& place, Flag flags) { VLOG(3) << "Init Event: " << event_ << ", place: " << place_ << ", flag:" << static_cast<int>(flags); own_data_ = true; + g_events.push_back(this); return true; } void Event::Destroy() { - if (own_data_) { - phi::DeviceManager::SetDevice(place_); - device_->DestroyEvent(this); + if (device_) { + if (own_data_) { + phi::DeviceManager::SetDevice(place_); + device_->DestroyEvent(this); + } own_data_ = false; + event_ = nullptr; + device_ = nullptr; } } diff --git a/paddle/phi/backends/event.h b/paddle/phi/backends/event.h index a58083ff289..1dac619c2ab 100644 --- a/paddle/phi/backends/event.h +++ b/paddle/phi/backends/event.h @@ -49,6 +49,8 @@ class Event { void Synchronize() const; const Place& GetPlace() const; + static void ReleaseAll(); + private: DISABLE_COPY_AND_ASSIGN(Event); Place place_; diff --git a/paddle/phi/backends/stream.cc b/paddle/phi/backends/stream.cc index 52bb1d2e549..b328a1e626c 100644 --- a/paddle/phi/backends/stream.cc +++ b/paddle/phi/backends/stream.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include <list> + #include "paddle/phi/backends/stream.h" #include "glog/logging.h" @@ -22,7 +24,18 @@ namespace phi { namespace stream { -Stream::~Stream() { Destroy(); } +std::list<Stream*> g_streams; + +void Stream::ReleaseAll() { + for (auto* stream : g_streams) { + stream->Destroy(); + } +} + +Stream::~Stream() { + g_streams.remove(this); + Destroy(); +} const stream_t& Stream::raw_stream() const { return stream_; } @@ -52,6 +65,7 @@ bool Stream::Init(const Place& place, << ", priority: " << static_cast<int>(priority) << ", flag:" << static_cast<int>(flag); own_data_ = true; + g_streams.push_back(this); return true; } @@ -83,11 +97,14 @@ void Stream::Wait() const { void Stream::WaitCallback() const { callback_manager_->Wait(); } void Stream::Destroy() { - if (own_data_ && stream_ != nullptr) { - phi::DeviceManager::SetDevice(place_); - device_->DestroyStream(this); + if (device_) { + if (own_data_) { + phi::DeviceManager::SetDevice(place_); + device_->DestroyStream(this); + } own_data_ = false; stream_ = nullptr; + device_ = nullptr; } } diff --git a/paddle/phi/backends/stream.h b/paddle/phi/backends/stream.h index 3e0099553f1..4219b1cec49 100644 --- a/paddle/phi/backends/stream.h +++ b/paddle/phi/backends/stream.h @@ -66,6 +66,8 @@ class Stream { void Synchronize() const; const Place& GetPlace() const; + static void ReleaseAll(); + private: DISABLE_COPY_AND_ASSIGN(Stream); Place place_; -- GitLab