From 5c6ae96b1a92ab0903880fd09e0b2c75f762ac98 Mon Sep 17 00:00:00 2001 From: ronnywang Date: Wed, 23 Aug 2023 16:35:13 +0800 Subject: [PATCH] [CustomDevice] Fix device occupancy (#56556) --- paddle/fluid/framework/custom_operator.cc | 4 +-- .../memory/allocation/allocator_facade.cc | 18 ++++-------- .../custom/custom_device_resource_pool.cc | 28 +++++++++--------- paddle/fluid/platform/profiler_helper.h | 5 ++-- paddle/phi/backends/device_manager.cc | 29 ++++++++++--------- 5 files changed, 39 insertions(+), 45 deletions(-) diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 633d48bf734..0a9573b42f8 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -946,9 +946,7 @@ static void RegisterOperatorKernel( #ifdef PADDLE_WITH_CUSTOM_DEVICE auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { - for (size_t dev_id = 0; - dev_id < phi::DeviceManager::GetDeviceCount(dev_type); - dev_id++) { + for (auto& dev_id : phi::DeviceManager::GetSelectedDeviceList(dev_type)) { RegisterOperatorKernelWithPlace(name, op_kernel_func, proto::VarType::RAW, diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 7887045e9dc..0800f091837 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -206,9 +206,8 @@ class AllocatorFacadePrivate { #ifdef PADDLE_WITH_CUSTOM_DEVICE auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { - for (size_t dev_id = 0; - dev_id < phi::DeviceManager::GetDeviceCount(dev_type); - ++dev_id) { + for (auto& dev_id : + phi::DeviceManager::GetSelectedDeviceList(dev_type)) { InitNaiveBestFitCustomDeviceAllocator( platform::CustomPlace(dev_type, dev_id)); } @@ -272,9 +271,8 @@ class AllocatorFacadePrivate { #ifdef PADDLE_WITH_CUSTOM_DEVICE auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { - for (size_t dev_id = 0; - dev_id < phi::DeviceManager::GetDeviceCount(dev_type); - ++dev_id) { + for (auto& dev_id : + phi::DeviceManager::GetSelectedDeviceList(dev_type)) { InitAutoGrowthCustomDeviceAllocator( platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk); } @@ -1212,9 +1210,7 @@ class AllocatorFacadePrivate { #ifdef PADDLE_WITH_CUSTOM_DEVICE auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { - for (size_t dev_id = 0; - dev_id < phi::DeviceManager::GetDeviceCount(dev_type); - dev_id++) { + for (auto& dev_id : phi::DeviceManager::GetSelectedDeviceList(dev_type)) { platform::CustomPlace p(dev_type, dev_id); system_allocators_[p] = std::make_shared(p); } @@ -1248,9 +1244,7 @@ class AllocatorFacadePrivate { #ifdef PADDLE_WITH_CUSTOM_DEVICE auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { - for (size_t dev_id = 0; - dev_id < phi::DeviceManager::GetDeviceCount(dev_type); - dev_id++) { + for (auto& dev_id : phi::DeviceManager::GetSelectedDeviceList(dev_type)) { places.emplace_back(platform::CustomPlace(dev_type, dev_id)); } } diff --git a/paddle/fluid/platform/device/custom/custom_device_resource_pool.cc b/paddle/fluid/platform/device/custom/custom_device_resource_pool.cc index c7afa716715..e8d006b5629 100644 --- a/paddle/fluid/platform/device/custom/custom_device_resource_pool.cc +++ b/paddle/fluid/platform/device/custom/custom_device_resource_pool.cc @@ -27,9 +27,10 @@ CustomDeviceStreamResourcePool::CustomDeviceStreamResourcePool( platform::errors::PreconditionNotMet( "Required device shall be CustomPlace, but received %d. ", place)); - int dev_cnt = phi::DeviceManager::GetDeviceCount(place.GetDeviceType()); - pool_.reserve(dev_cnt); - for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto selected_devices = + phi::DeviceManager::GetSelectedDeviceList(place.GetDeviceType()); + pool_.reserve(selected_devices.size()); + for (auto& dev_idx : selected_devices) { auto creator = [place, dev_idx, this] { auto place_ = phi::CustomPlace(place.GetDeviceType(), dev_idx); phi::DeviceManager::SetDevice(place_); @@ -82,12 +83,11 @@ CustomDeviceStreamResourcePool& CustomDeviceStreamResourcePool::Instance( if (pool.find(place.GetDeviceType()) == pool.end()) { pool.insert({place.GetDeviceType(), std::vector()}); - for (size_t i = 0; - i < phi::DeviceManager::GetDeviceCount(place.GetDeviceType()); - ++i) { + for (auto& dev_id : + phi::DeviceManager::GetSelectedDeviceList(place.GetDeviceType())) { pool[place.GetDeviceType()].emplace_back( new CustomDeviceStreamResourcePool( - paddle::platform::CustomPlace(place.GetDeviceType(), i))); + paddle::platform::CustomPlace(place.GetDeviceType(), dev_id))); } } PADDLE_ENFORCE_LT( @@ -125,9 +125,10 @@ CustomDeviceEventResourcePool::CustomDeviceEventResourcePool( platform::errors::PreconditionNotMet( "Required device shall be CustomPlace, but received %d. ", place)); - int dev_cnt = phi::DeviceManager::GetDeviceCount(place.GetDeviceType()); - pool_.reserve(dev_cnt); - for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto selected_devices = + phi::DeviceManager::GetSelectedDeviceList(place.GetDeviceType()); + pool_.reserve(selected_devices.size()); + for (auto& dev_idx : selected_devices) { auto creator = [place, dev_idx, this] { auto place_ = phi::CustomPlace(place.GetDeviceType(), dev_idx); phi::DeviceManager::SetDevice(place_); @@ -180,12 +181,11 @@ CustomDeviceEventResourcePool& CustomDeviceEventResourcePool::Instance( if (pool.find(place.GetDeviceType()) == pool.end()) { pool.insert( {place.GetDeviceType(), std::vector()}); - for (size_t i = 0; - i < phi::DeviceManager::GetDeviceCount(place.GetDeviceType()); - ++i) { + for (auto& dev_id : + phi::DeviceManager::GetSelectedDeviceList(place.GetDeviceType())) { pool[place.GetDeviceType()].emplace_back( new CustomDeviceEventResourcePool( - paddle::platform::CustomPlace(place.GetDeviceType(), i))); + paddle::platform::CustomPlace(place.GetDeviceType(), dev_id))); } } PADDLE_ENFORCE_LT( diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 1d34d5fd27b..9835e7525c5 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -116,9 +116,8 @@ void SynchronizeAllDevice() { auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto &dev_type : dev_types) { int pre_device_id = phi::DeviceManager::GetDevice(dev_type); - auto dev_cnt = phi::DeviceManager::GetDeviceCount(dev_type); - for (size_t i = 0; i < dev_cnt; i++) { - auto place = paddle::platform::CustomPlace(dev_type, i); + for (auto &dev_id : phi::DeviceManager::GetSelectedDeviceList(dev_type)) { + auto place = paddle::platform::CustomPlace(dev_type, dev_id); phi::DeviceManager::SetDevice(place); phi::DeviceManager::SynchronizeDevice(place); } diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc index b5412cbd130..0b0aa42c362 100644 --- a/paddle/phi/backends/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -465,21 +465,24 @@ std::vector DeviceManager::GetDeviceList( std::vector DeviceManager::GetSelectedDeviceList( const std::string& device_type) { - std::vector devices; - std::string FLAGS = "FLAGS_selected_" + device_type + "s"; - auto FLAGS_selected_devices = getenv(FLAGS.c_str()); - if (FLAGS_selected_devices) { - auto devices_str = paddle::string::Split(FLAGS_selected_devices, ','); - for (auto id : devices_str) { - devices.push_back(atoi(id.c_str())); - } - } else { - int count = DeviceManager::GetDeviceCount(device_type); - for (int i = 0; i < count; ++i) { - devices.push_back(i); + static std::unordered_map> device_list_map; + if (device_list_map.find(device_type) == device_list_map.end()) { + std::vector& device_list = device_list_map[device_type]; + std::string FLAGS = "FLAGS_selected_" + device_type + "s"; + auto FLAGS_selected_devices = getenv(FLAGS.c_str()); + if (FLAGS_selected_devices) { + auto devices_str = paddle::string::Split(FLAGS_selected_devices, ','); + for (auto id : devices_str) { + device_list.push_back(atoi(id.c_str())); + } + } else { + int count = DeviceManager::GetDeviceCount(device_type); + for (int i = 0; i < count; ++i) { + device_list.push_back(i); + } } } - return devices; + return device_list_map[device_type]; } void DeviceManager::CCLDestroyComm(const std::string& device_type, -- GitLab