From 907433a78028032b97a24e1e8ae72b7a4c818223 Mon Sep 17 00:00:00 2001
From: Huang Jiyi <43315610+huangjiyi@users.noreply.github.com>
Date: Thu, 16 Mar 2023 15:37:25 +0800
Subject: [PATCH] [phi decoupling] remove fluid gpu_info usage in phi (#51699)

* remove fluid thread_data_registry

* update

* fix bug
---
 .../workqueue/thread_data_registry.h          | 177 ------------------
 paddle/fluid/memory/stats.h                   |   4 +-
 paddle/fluid/platform/device/gpu/gpu_info.cc  |  19 +-
 paddle/fluid/platform/device/gpu/gpu_info.h   |   3 +-
 paddle/fluid/platform/init.cc                 |   4 +
 .../platform/profiler/host_event_recorder.h   |   1 -
 paddle/phi/backends/gpu/gpu_info.cc           |  27 +++
 paddle/phi/backends/gpu/gpu_info.h            |   9 +-
 paddle/phi/common/memory_utils.cc             |   7 +
 paddle/phi/common/memory_utils.h              |  28 +++
 paddle/phi/kernels/gpudnn/conv_gpudnn_base.h  |   3 +-
 paddle/phi/kernels/gpudnn/conv_grad_kernel.cu |   1 -
 12 files changed, 79 insertions(+), 204 deletions(-)
 delete mode 100644 paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h

diff --git a/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h b/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h
deleted file mode 100644
index 5d61d65be2d..00000000000
--- a/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h
+++ /dev/null
@@ -1,177 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <mutex>
-#include <shared_mutex>
-#include <thread>
-#include <type_traits>
-#include <unordered_map>
-
-namespace paddle {
-namespace framework {
-
-template <typename T>
-class ThreadDataRegistry {
- public:
-  // Singleton
-  static ThreadDataRegistry& GetInstance() {
-    static ThreadDataRegistry instance;
-    return instance;
-  }
-
-  T* GetMutableCurrentThreadData() { return &CurrentThreadData(); }
-
-  const T& GetCurrentThreadData() { return CurrentThreadData(); }
-
-  template <typename Alias = T,
-            typename = std::enable_if_t<std::is_copy_assignable<Alias>::value>>
-  void SetCurrentThreadData(const T& val) {
-    CurrentThreadData() = val;
-  }
-
-  // Returns current snapshot of all threads. Make sure there is no thread
-  // create/destory when using it.
-  template <
-      typename Alias = T,
-      typename = std::enable_if_t<std::is_copy_assignable<Alias>::value>>
-  std::unordered_map<uint64_t, T> GetAllThreadDataByValue() {
-    return impl_->GetAllThreadDataByValue();
-  }
-
-  // Returns current snapshot of all threads. Make sure there is no thread
-  // create/destory when using it.
-  std::unordered_map<uint64_t, std::reference_wrapper<T>>
-  GetAllThreadDataByRef() {
-    return impl_->GetAllThreadDataByRef();
-  }
-
- private:
-// types
-// Lock types
-#if defined(__clang__) || defined(__GNUC__)  // CLANG or GCC
-#ifndef __APPLE__
-#if __cplusplus >= 201703L
-  using LockType = std::shared_mutex;
-  using SharedLockGuardType = std::shared_lock<std::shared_mutex>;
-#elif __cplusplus >= 201402L
-  using LockType = std::shared_timed_mutex;
-  using SharedLockGuardType = std::shared_lock<std::shared_timed_mutex>;
-#else
-  using LockType = std::mutex;
-  using SharedLockGuardType = std::lock_guard<std::mutex>;
-#endif
-// Special case : mac. https://github.com/facebook/react-native/issues/31250
-#else
-  using LockType = std::mutex;
-  using SharedLockGuardType = std::lock_guard<std::mutex>;
-#endif
-#elif defined(_MSC_VER)  // MSVC
-#if _MSVC_LANG >= 201703L
-  using LockType = std::shared_mutex;
-  using SharedLockGuardType = std::shared_lock<std::shared_mutex>;
-#elif _MSVC_LANG >= 201402L
-  using LockType = std::shared_timed_mutex;
-  using SharedLockGuardType = std::shared_lock<std::shared_timed_mutex>;
-#else
-  using LockType = std::mutex;
-  using SharedLockGuardType = std::lock_guard<std::mutex>;
-#endif
-#else  // other compilers
-  using LockType = std::mutex;
-  using SharedLockGuardType = std::lock_guard<std::mutex>;
-#endif
-
-  class ThreadDataHolder;
-  class ThreadDataRegistryImpl {
-   public:
-    void RegisterData(uint64_t tid, ThreadDataHolder* tls_obj) {
-      std::lock_guard<LockType> guard(lock_);
-      tid_map_[tid] = tls_obj;
-    }
-
-    void UnregisterData(uint64_t tid) {
-      std::lock_guard<LockType> guard(lock_);
-      tid_map_.erase(tid);
-    }
-
-    template <
-        typename Alias = T,
-        typename = std::enable_if_t<std::is_copy_assignable<Alias>::value>>
-    std::unordered_map<uint64_t, T> GetAllThreadDataByValue() {
-      std::unordered_map<uint64_t, T> data_copy;
-      SharedLockGuardType guard(lock_);
-      data_copy.reserve(tid_map_.size());
-      for (auto& kv : tid_map_) {
-        data_copy.emplace(kv.first, kv.second->GetData());
-      }
-      return data_copy;
-    }
-
-    std::unordered_map<uint64_t, std::reference_wrapper<T>>
-    GetAllThreadDataByRef() {
-      std::unordered_map<uint64_t, std::reference_wrapper<T>> data_ref;
-      SharedLockGuardType guard(lock_);
-      data_ref.reserve(tid_map_.size());
-      for (auto& kv : tid_map_) {
-        data_ref.emplace(kv.first, std::ref(kv.second->GetData()));
-      }
-      return data_ref;
-    }
-
-   private:
-    LockType lock_;
-    std::unordered_map<uint64_t, ThreadDataHolder*> tid_map_;  // not owned
-  };
-
-  class ThreadDataHolder {
-   public:
-    explicit ThreadDataHolder(
-        std::shared_ptr<ThreadDataRegistryImpl> registry) {
-      registry_ = std::move(registry);
-      tid_ = std::hash<std::thread::id>()(std::this_thread::get_id());
-      registry_->RegisterData(tid_, this);
-    }
-
-    ~ThreadDataHolder() { registry_->UnregisterData(tid_); }
-
-    T& GetData() { return data_; }
-
-   private:
-    std::shared_ptr<ThreadDataRegistryImpl> registry_;
-    uint64_t tid_;
-    T data_;
-  };
-
-  // methods
-  ThreadDataRegistry() { impl_ = std::make_shared<ThreadDataRegistryImpl>(); }
-
-  ThreadDataRegistry(const ThreadDataRegistry&) = delete;
-
-  ThreadDataRegistry& operator=(const ThreadDataRegistry&) = delete;
-
-  T& CurrentThreadData() {
-    static thread_local ThreadDataHolder thread_data(impl_);
-    return thread_data.GetData();
-  }
-
-  // data
-  std::shared_ptr<ThreadDataRegistryImpl> impl_;
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h
index 2399e41d93a..d8cb7b812ad 100644
--- a/paddle/fluid/memory/stats.h
+++ b/paddle/fluid/memory/stats.h
@@ -18,15 +18,15 @@ limitations under the License. */
 
 #include <map>
 #include <string>
 
-#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/macros.h"
+#include "paddle/phi/common/thread_data_registry.h"
 
 namespace paddle {
 namespace memory {
-using framework::ThreadDataRegistry;
+using phi::ThreadDataRegistry;
 
 struct ThreadLocalStatBase {
   int64_t current{0};
diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc
index 8023403df07..3373ef51836 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.cc
+++ b/paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -61,8 +61,6 @@ PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb,
                             "Whether to print the message of gpu memory usage "
                             "MB as a unit of measurement.");
 
-constexpr static float fraction_reserve_gpu_memory = 0.05f;
-
 USE_GPU_MEM_STAT;
 namespace paddle {
 namespace platform {
@@ -77,20 +75,7 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
 }
 
 size_t GpuAvailableMemToAlloc() {
-  size_t total = 0;
-  size_t available = 0;
-  GpuMemoryUsage(&available, &total);
-  size_t reserving =
-      static_cast<size_t>(fraction_reserve_gpu_memory * available);
-  // If available size is less than minimum chunk size, no usable memory exists
-  size_t available_to_alloc = available - reserving;
-  size_t min_chunk_size = GpuMinChunkSize();
-  if (available_to_alloc < min_chunk_size) {
-    available_to_alloc = 0;
-  }
-  VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
-           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
-  return available_to_alloc;
+  return phi::backends::gpu::GpuAvailableMemToAlloc();
 }
 
 size_t GpuMaxAllocSize() {
@@ -124,6 +109,8 @@ size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }
 
 size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }
 
+size_t GpuMinChunkSize() { return phi::backends::gpu::GpuMinChunkSize(); }
+
 size_t GpuMaxChunkSize() {
   size_t max_chunk_size = GpuMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h
index bb876f5c526..925a150b60e 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.h
+++ b/paddle/fluid/platform/device/gpu/gpu_info.h
@@ -82,7 +82,8 @@ size_t GpuInitAllocSize();
 //! Get the re-allocation size of current GPU device.
 size_t GpuReallocSize();
 
-using phi::backends::gpu::GpuMinChunkSize;
+//! Get the minimum chunk size for GPU buddy allocator.
+size_t GpuMinChunkSize();
 
 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 7ceaeb8259a..cde64029d9c 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_info.h"
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #endif
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cupti.h"
@@ -468,6 +469,9 @@ void InitMemoryMethod() {
     memory_method->copy = paddle::memory::Copy;
     memory_method->device_memory_stat_current_value =
         paddle::memory::DeviceMemoryStatCurrentValue;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage;
+#endif
     memory_utils.Init(std::move(memory_method));
   });
 }
diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h
index 7b7152be743..b93457099ba 100644
--- a/paddle/fluid/platform/profiler/host_event_recorder.h
+++ b/paddle/fluid/platform/profiler/host_event_recorder.h
@@ -18,7 +18,6 @@
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/os_info.h"
 #include "paddle/phi/api/profiler/host_event_recorder.h"
diff --git a/paddle/phi/backends/gpu/gpu_info.cc b/paddle/phi/backends/gpu/gpu_info.cc
index a2399554ba8..417ff4c72e8 100644
--- a/paddle/phi/backends/gpu/gpu_info.cc
+++ b/paddle/phi/backends/gpu/gpu_info.cc
@@ -18,6 +18,9 @@ limitations under the License. */
 #include <vector>
 
 #include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "paddle/phi/common/memory_utils.h"
 
 DECLARE_string(selected_gpus);
@@ -56,6 +59,30 @@ std::vector<int> GetSelectedDevices() {
   return devices;
 }
 
+constexpr static float fraction_reserve_gpu_memory = 0.05f;
+
+size_t GpuAvailableMemToAlloc() {
+  size_t total = 0;
+  size_t available = 0;
+  memory_utils::GpuMemoryUsage(&available, &total);
+  size_t reserving =
+      static_cast<size_t>(fraction_reserve_gpu_memory * available);
+  // If available size is less than minimum chunk size, no usable memory exists
+  size_t available_to_alloc = available - reserving;
+  size_t min_chunk_size = GpuMinChunkSize();
+  if (available_to_alloc < min_chunk_size) {
+    available_to_alloc = 0;
+  }
+  VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
+           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
+  return available_to_alloc;
+}
+
+size_t GpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 256 bytes.
+  return 1 << 8;
+}
+
 }  // namespace gpu
 }  // namespace backends
 }  // namespace phi
diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h
index 6ba8863bc39..ebf57bd06eb 100644
--- a/paddle/phi/backends/gpu/gpu_info.h
+++ b/paddle/phi/backends/gpu/gpu_info.h
@@ -70,11 +70,12 @@ const gpuDeviceProp &GetDeviceProperties(int id);
 //! Set the GPU device id for next execution.
 void SetDeviceId(int device_id);
 
+//! Get the available memory to allocate, which is the size of available gpu
+//! minus reserving.
+size_t GpuAvailableMemToAlloc();
+
 //! Get the minimum chunk size for GPU buddy allocator.
-inline size_t GpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
+size_t GpuMinChunkSize();
 
 //! Copy memory from address src to dst asynchronously.
 void GpuMemcpyAsync(void *dst,
diff --git a/paddle/phi/common/memory_utils.cc b/paddle/phi/common/memory_utils.cc
index cc1e9d75b6f..4a8c8f776b2 100644
--- a/paddle/phi/common/memory_utils.cc
+++ b/paddle/phi/common/memory_utils.cc
@@ -68,6 +68,13 @@ int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
   return MemoryUtils::Instance().DeviceMemoryStatCurrentValue(stat_type,
                                                               dev_id);
 }
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+void GpuMemoryUsage(size_t* available, size_t* total) {
+  return MemoryUtils::Instance().GpuMemoryUsage(available, total);
+}
+#endif
+
 }  // namespace memory_utils
 }  // namespace phi
diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h
index 6dbf3d5d07a..180385c6c52 100644
--- a/paddle/phi/common/memory_utils.h
+++ b/paddle/phi/common/memory_utils.h
@@ -113,6 +113,16 @@ struct MemoryInterface {
    */
   int64_t (*device_memory_stat_current_value)(const std::string& stat_type,
                                               int dev_id);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  /**
+   * @brief get the memory usage of current GPU device.
+   *
+   * @param[size_t] available device available memory to alloc
+   * @param[size_t] total device total memory
+   */
+  void (*gpu_memory_usage)(size_t* available, size_t* total);
+#endif
 };
 
 class MemoryUtils {
@@ -234,6 +244,18 @@ class MemoryUtils {
     return memory_method_->device_memory_stat_current_value(stat_type, dev_id);
   }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void GpuMemoryUsage(size_t* available, size_t* total) {
+    CheckMemoryMethod();
+    PADDLE_ENFORCE_NOT_NULL(
+        memory_method_->gpu_memory_usage,
+        phi::errors::Unavailable(
+            "gpu_memory_usage method in memory_method_ is not initiazed "
+            "yet. You need init it first."));
+    return memory_method_->gpu_memory_usage(available, total);
+  }
+#endif
+
   void CheckMemoryMethod() {
     PADDLE_ENFORCE_NE(
         memory_method_.get(),
@@ -288,7 +310,13 @@ void Copy(const Place& dst_place,
           const Place& src_place,
           const void* src,
           size_t num);
+
 int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+void GpuMemoryUsage(size_t* available, size_t* total);
+#endif
+
 }  // namespace memory_utils
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
index 79da6d3b1b4..6e75d40d451 100644
--- a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
+++ b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
@@ -20,7 +20,6 @@ limitations under the License. */
 #include <memory>
 #include <vector>
 
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/kernels/autotune/cache.h"
@@ -53,7 +52,7 @@ static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
         memory_utils::DeviceMemoryStatCurrentValue("Allocated", device_id);
     int64_t reserved =
         memory_utils::DeviceMemoryStatCurrentValue("Reserved", device_id);
-    int64_t availble = paddle::platform::GpuAvailableMemToAlloc();
+    int64_t availble = phi::backends::gpu::GpuAvailableMemToAlloc();
     VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
             << " MB, reserved=" << ToMegaBytes(reserved)
             << " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB.";
diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
index 8570b4be577..d81c03ceabb 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
@@ -627,7 +627,6 @@ void ConvCudnnGradKernel(const Context& ctx,
       compute_format == phi::backends::gpu::DataLayout::kNHWC
           ? phi::backends::gpu::DataLayout::kNHWC
           : phi::backends::gpu::DataLayout::kNCHW;
-  // TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout
   if (transformed_input.dims().size() == 5) {
     layout = compute_format == phi::backends::gpu::DataLayout::kNHWC
                  ? phi::backends::gpu::DataLayout::kNDHWC
--
GitLab
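
The reservation arithmetic that this patch relocates into phi::backends::gpu::GpuAvailableMemToAlloc can be exercised in isolation. The sketch below is illustrative only, not Paddle code: QueryDeviceMemory is a hypothetical stand-in for the memory_utils::GpuMemoryUsage hook that InitMemoryMethod registers through MemoryInterface::gpu_memory_usage in the init.cc hunk above.

#include <cstddef>
#include <iostream>

// Hypothetical stand-in for memory_utils::GpuMemoryUsage. In Paddle the real
// callback is registered at startup via MemoryInterface::gpu_memory_usage;
// here it simply reports 2 GiB free out of 8 GiB total.
void QueryDeviceMemory(std::size_t* available, std::size_t* total) {
  *available = 2ULL << 30;
  *total = 8ULL << 30;
}

constexpr float kFractionReserveGpuMemory = 0.05f;  // same 5% as the patch
constexpr std::size_t kGpuMinChunkSize = 1 << 8;    // 256 bytes, as in phi

std::size_t GpuAvailableMemToAlloc() {
  std::size_t total = 0;
  std::size_t available = 0;
  QueryDeviceMemory(&available, &total);
  // Hold back 5% of the currently free memory as a safety margin.
  std::size_t reserving =
      static_cast<std::size_t>(kFractionReserveGpuMemory * available);
  std::size_t available_to_alloc = available - reserving;
  // Below the allocator's minimum chunk size, nothing is usable.
  if (available_to_alloc < kGpuMinChunkSize) {
    available_to_alloc = 0;
  }
  return available_to_alloc;
}

int main() {
  // With 2 GiB free, 5% (~102 MiB) is reserved, leaving roughly 1945 MiB.
  std::cout << (GpuAvailableMemToAlloc() >> 20) << " MiB available\n";
}

Routing the memory query through a function pointer set at initialization is what lets phi compute this figure without a compile-time dependency on fluid's gpu_info, which is the point of the decoupling series.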