diff --git a/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h b/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h
deleted file mode 100644
index 5d61d65be2d129c293c63a4ee62d6e26bd9201be..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h
+++ /dev/null
@@ -1,177 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <mutex>
-#include <shared_mutex>
-#include <thread>
-#include <type_traits>
-#include <unordered_map>
-
-namespace paddle {
-namespace framework {
-
-template <typename T>
-class ThreadDataRegistry {
- public:
-  // Singleton
-  static ThreadDataRegistry& GetInstance() {
-    static ThreadDataRegistry instance;
-    return instance;
-  }
-
-  T* GetMutableCurrentThreadData() { return &CurrentThreadData(); }
-
-  const T& GetCurrentThreadData() { return CurrentThreadData(); }
-
-  template <typename Alias = T,
-            typename = std::enable_if_t<std::is_copy_assignable<Alias>::value>>
-  void SetCurrentThreadData(const T& val) {
-    CurrentThreadData() = val;
-  }
-
-  // Returns current snapshot of all threads. Make sure there is no thread
-  // create/destory when using it.
-  template <
-      typename Alias = T,
-      typename = std::enable_if_t<std::is_copy_constructible<Alias>::value>>
-  std::unordered_map<uint64_t, T> GetAllThreadDataByValue() {
-    return impl_->GetAllThreadDataByValue();
-  }
-
-  // Returns current snapshot of all threads. Make sure there is no thread
-  // create/destory when using it.
-  std::unordered_map<uint64_t, std::reference_wrapper<T>>
-  GetAllThreadDataByRef() {
-    return impl_->GetAllThreadDataByRef();
-  }
-
- private:
-// types
-// Lock types
-#if defined(__clang__) || defined(__GNUC__)  // CLANG or GCC
-#ifndef __APPLE__
-#if __cplusplus >= 201703L
-  using LockType = std::shared_mutex;
-  using SharedLockGuardType = std::shared_lock<std::shared_mutex>;
-#elif __cplusplus >= 201402L
-  using LockType = std::shared_timed_mutex;
-  using SharedLockGuardType = std::shared_lock<std::shared_timed_mutex>;
-#else
-  using LockType = std::mutex;
-  using SharedLockGuardType = std::lock_guard<std::mutex>;
-#endif
-// Special case : mac. https://github.com/facebook/react-native/issues/31250
-#else
-  using LockType = std::mutex;
-  using SharedLockGuardType = std::lock_guard<std::mutex>;
-#endif
-#elif defined(_MSC_VER)  // MSVC
-#if _MSVC_LANG >= 201703L
-  using LockType = std::shared_mutex;
-  using SharedLockGuardType = std::shared_lock<std::shared_mutex>;
-#elif _MSVC_LANG >= 201402L
-  using LockType = std::shared_timed_mutex;
-  using SharedLockGuardType = std::shared_lock<std::shared_timed_mutex>;
-#else
-  using LockType = std::mutex;
-  using SharedLockGuardType = std::lock_guard<std::mutex>;
-#endif
-#else  // other compilers
-  using LockType = std::mutex;
-  using SharedLockGuardType = std::lock_guard<std::mutex>;
-#endif
-
-  class ThreadDataHolder;
-  class ThreadDataRegistryImpl {
-   public:
-    void RegisterData(uint64_t tid, ThreadDataHolder* tls_obj) {
-      std::lock_guard<LockType> guard(lock_);
-      tid_map_[tid] = tls_obj;
-    }
-
-    void UnregisterData(uint64_t tid) {
-      std::lock_guard<LockType> guard(lock_);
-      tid_map_.erase(tid);
-    }
-
-    template <
-        typename Alias = T,
-        typename = std::enable_if_t<std::is_copy_constructible<Alias>::value>>
-    std::unordered_map<uint64_t, T> GetAllThreadDataByValue() {
-      std::unordered_map<uint64_t, T> data_copy;
-      SharedLockGuardType guard(lock_);
-      data_copy.reserve(tid_map_.size());
-      for (auto& kv : tid_map_) {
-        data_copy.emplace(kv.first, kv.second->GetData());
-      }
-      return data_copy;
-    }
-
-    std::unordered_map<uint64_t, std::reference_wrapper<T>>
-    GetAllThreadDataByRef() {
-      std::unordered_map<uint64_t, std::reference_wrapper<T>> data_ref;
-      SharedLockGuardType guard(lock_);
-      data_ref.reserve(tid_map_.size());
-      for (auto& kv : tid_map_) {
-        data_ref.emplace(kv.first, std::ref(kv.second->GetData()));
-      }
-      return data_ref;
-    }
-
-   private:
-    LockType lock_;
-    std::unordered_map<uint64_t, ThreadDataHolder*> tid_map_;  // not owned
-  };
-
-  class ThreadDataHolder {
-   public:
-    explicit ThreadDataHolder(
-        std::shared_ptr<ThreadDataRegistryImpl> registry) {
-      registry_ = std::move(registry);
-      tid_ = std::hash<std::thread::id>()(std::this_thread::get_id());
-      registry_->RegisterData(tid_, this);
-    }
-
-    ~ThreadDataHolder() { registry_->UnregisterData(tid_); }
-
-    T& GetData() { return data_; }
-
-   private:
-    std::shared_ptr<ThreadDataRegistryImpl> registry_;
-    uint64_t tid_;
-    T data_;
-  };
-
-  // methods
-  ThreadDataRegistry() { impl_ = std::make_shared<ThreadDataRegistryImpl>(); }
-
-  ThreadDataRegistry(const ThreadDataRegistry&) = delete;
-
-  ThreadDataRegistry& operator=(const ThreadDataRegistry&) = delete;
-
-  T& CurrentThreadData() {
-    static thread_local ThreadDataHolder thread_data(impl_);
-    return thread_data.GetData();
-  }
-
-  // data
-  std::shared_ptr<ThreadDataRegistryImpl> impl_;
-};
-
-}  // namespace framework
-}  // namespace paddle
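The header deleted above resurfaces at `paddle/phi/common/thread_data_registry.h` (see the `stats.h` hunk below). The design: a singleton hands each thread a `thread_local` holder that registers itself in a shared map under an exclusive lock, and snapshot queries walk that map under a shared lock. A minimal usage sketch, assuming the relocated `phi::ThreadDataRegistry` keeps the interface shown above; `Counter` and both helper functions are illustrative:

```cpp
// Minimal usage sketch; assumes phi::ThreadDataRegistry keeps the interface
// of the deleted header. "Counter" is an illustrative per-thread payload.
#include <cstdint>

#include "paddle/phi/common/thread_data_registry.h"

struct Counter {
  int64_t value{0};
};

void RecordEvent() {
  // Lazily creates this thread's Counter and registers it in the impl map.
  auto& registry = phi::ThreadDataRegistry<Counter>::GetInstance();
  registry.GetMutableCurrentThreadData()->value += 1;
}

int64_t TotalEvents() {
  // Snapshot of every live thread's data, keyed by hashed thread id.
  // Only safe while no thread is being created or destroyed.
  int64_t total = 0;
  auto snapshot =
      phi::ThreadDataRegistry<Counter>::GetInstance().GetAllThreadDataByValue();
  for (auto& kv : snapshot) total += kv.second.value;
  return total;
}
```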
diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h
index 2399e41d93ade5f9467a227f70253f68d8b1cd0d..d8cb7b812ad27cc35d3eba9787d1826798b617b1 100644
--- a/paddle/fluid/memory/stats.h
+++ b/paddle/fluid/memory/stats.h
@@ -18,15 +18,15 @@ limitations under the License. */
 #include 
 #include 
 
-#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/macros.h"
+#include "paddle/phi/common/thread_data_registry.h"
 
 namespace paddle {
 namespace memory {
 
-using framework::ThreadDataRegistry;
+using phi::ThreadDataRegistry;
 
 struct ThreadLocalStatBase {
   int64_t current{0};
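This hunk only re-points `stats.h` at the relocated header and swaps the alias from `framework::ThreadDataRegistry` to `phi::ThreadDataRegistry`; the stats machinery itself is untouched. For illustration, a sketch of how a per-thread stat can be aggregated through that alias; only the `current` field is visible above, and the summing helper is modeled on, not copied from, the real `Stat` code:

```cpp
// Illustrative sketch of aggregating a per-thread stat via the relocated
// registry. Only the `current` field of ThreadLocalStatBase is visible in
// the hunk above; the stand-in struct and helper below are not stats.h code.
#include <cstdint>

#include "paddle/phi/common/thread_data_registry.h"

struct ThreadLocalStatBase {
  int64_t current{0};
};

int64_t CurrentValueAcrossThreads() {
  auto snapshot = phi::ThreadDataRegistry<ThreadLocalStatBase>::GetInstance()
                      .GetAllThreadDataByValue();
  int64_t sum = 0;
  for (auto& kv : snapshot) {
    sum += kv.second.current;
  }
  return sum;
}
```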
diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc
index 8023403df078d4878eec6687825c42ad5cb17d6a..3373ef51836eaf988b2dd4272564c713ea2931b6 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.cc
+++ b/paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -61,8 +61,6 @@ PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb,
                             "Whether to print the message of gpu memory usage "
                             "MB as a unit of measurement.");
 
-constexpr static float fraction_reserve_gpu_memory = 0.05f;
-
 USE_GPU_MEM_STAT;
 namespace paddle {
 namespace platform {
@@ -77,20 +75,7 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
 }
 
 size_t GpuAvailableMemToAlloc() {
-  size_t total = 0;
-  size_t available = 0;
-  GpuMemoryUsage(&available, &total);
-  size_t reserving =
-      static_cast<size_t>(fraction_reserve_gpu_memory * available);
-  // If available size is less than minimum chunk size, no usable memory exists
-  size_t available_to_alloc = available - reserving;
-  size_t min_chunk_size = GpuMinChunkSize();
-  if (available_to_alloc < min_chunk_size) {
-    available_to_alloc = 0;
-  }
-  VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
-           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
-  return available_to_alloc;
+  return phi::backends::gpu::GpuAvailableMemToAlloc();
 }
 
 size_t GpuMaxAllocSize() {
@@ -124,6 +109,8 @@ size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }
 
 size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }
 
+size_t GpuMinChunkSize() { return phi::backends::gpu::GpuMinChunkSize(); }
+
 size_t GpuMaxChunkSize() {
   size_t max_chunk_size = GpuMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h
index bb876f5c526d5147be296d9337bcd98fc1498b0a..925a150b60efb01911c113f540b2349eece873e7 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.h
+++ b/paddle/fluid/platform/device/gpu/gpu_info.h
@@ -82,7 +82,8 @@ size_t GpuInitAllocSize();
 //! Get the re-allocation size of current GPU device.
 size_t GpuReallocSize();
 
-using phi::backends::gpu::GpuMinChunkSize;
+//! Get the minimum chunk size for GPU buddy allocator.
+size_t GpuMinChunkSize();
 
 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
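Both fluid entry points become one-line wrappers, and the 5% reservation arithmetic moves wholesale into `phi::backends::gpu::GpuAvailableMemToAlloc` (see the phi hunk further down). A standalone sketch of that arithmetic, with a hypothetical 8 GiB of free device memory:

```cpp
// Standalone sketch of the reservation arithmetic being moved into
// phi::backends::gpu::GpuAvailableMemToAlloc. Input values are hypothetical.
#include <cstddef>
#include <iostream>

int main() {
  constexpr float fraction_reserve_gpu_memory = 0.05f;  // 5% head-room
  constexpr size_t min_chunk_size = 1 << 8;             // 256 bytes

  size_t available = 8ULL << 30;  // pretend the driver reports 8 GiB free
  size_t reserving =
      static_cast<size_t>(fraction_reserve_gpu_memory * available);
  size_t available_to_alloc = available - reserving;
  // Below the minimum chunk size, no usable memory exists.
  if (available_to_alloc < min_chunk_size) available_to_alloc = 0;

  // Prints 7782 (MiB): roughly 8 GiB minus the ~5% (about 410 MiB) reserve.
  std::cout << (available_to_alloc >> 20) << " MiB available to allocate\n";
  return 0;
}
```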
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 7ceaeb8259a8d6c776f88863a358ffcee1a22573..cde64029d9c0a36a36f46f7bad33e0d05bbec0eb 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_info.h"
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #endif
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cupti.h"
@@ -468,6 +469,9 @@ void InitMemoryMethod() {
     memory_method->copy = paddle::memory::Copy;
     memory_method->device_memory_stat_current_value =
        paddle::memory::DeviceMemoryStatCurrentValue;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage;
+#endif
     memory_utils.Init(std::move(memory_method));
   });
 }
diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h
index 7b7152be7437847dcfe12280632a523182a85aed..b93457099bae4100c3c007a4c45c5018989b35b4 100644
--- a/paddle/fluid/platform/profiler/host_event_recorder.h
+++ b/paddle/fluid/platform/profiler/host_event_recorder.h
@@ -18,7 +18,6 @@
 #include 
 #include 
 
-#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/os_info.h"
 #include "paddle/phi/api/profiler/host_event_recorder.h"
diff --git a/paddle/phi/backends/gpu/gpu_info.cc b/paddle/phi/backends/gpu/gpu_info.cc
index a2399554ba85311830d54a362e70173199c1a930..417ff4c72e86c907266e0b5d83c150cd64f87921 100644
--- a/paddle/phi/backends/gpu/gpu_info.cc
+++ b/paddle/phi/backends/gpu/gpu_info.cc
@@ -18,6 +18,9 @@ limitations under the License. */
 
 #include 
 
 #include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "paddle/phi/common/memory_utils.h"
 
 DECLARE_string(selected_gpus);
@@ -56,6 +59,30 @@ std::vector<int> GetSelectedDevices() {
   return devices;
 }
 
+constexpr static float fraction_reserve_gpu_memory = 0.05f;
+
+size_t GpuAvailableMemToAlloc() {
+  size_t total = 0;
+  size_t available = 0;
+  memory_utils::GpuMemoryUsage(&available, &total);
+  size_t reserving =
+      static_cast<size_t>(fraction_reserve_gpu_memory * available);
+  // If available size is less than minimum chunk size, no usable memory exists
+  size_t available_to_alloc = available - reserving;
+  size_t min_chunk_size = GpuMinChunkSize();
+  if (available_to_alloc < min_chunk_size) {
+    available_to_alloc = 0;
+  }
+  VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
+           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
+  return available_to_alloc;
+}
+
+size_t GpuMinChunkSize() {
+  // The minimum chunk size that can be allocated is 256 bytes.
+  return 1 << 8;
+}
+
 }  // namespace gpu
 }  // namespace backends
 }  // namespace phi
diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h
index 6ba8863bc39e4f5145a47e799eae06917118bb58..ebf57bd06eb19d10b1c0b49d2c40365b56086609 100644
--- a/paddle/phi/backends/gpu/gpu_info.h
+++ b/paddle/phi/backends/gpu/gpu_info.h
@@ -70,11 +70,12 @@ const gpuDeviceProp &GetDeviceProperties(int id);
 //! Set the GPU device id for next execution.
 void SetDeviceId(int device_id);
 
+//! Get the memory available to allocate: the device's available memory
+//! minus the reserved fraction.
+size_t GpuAvailableMemToAlloc();
+
 //! Get the minimum chunk size for GPU buddy allocator.
-inline size_t GpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
+size_t GpuMinChunkSize();
 
 //! Copy memory from address src to dst asynchronously.
 void GpuMemcpyAsync(void *dst,
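The `InitMemoryMethod` registration above is the other half of the move: phi cannot link against fluid, so fluid injects `paddle::platform::GpuMemoryUsage` into phi's `MemoryInterface` slot at startup, and the relocated `GpuAvailableMemToAlloc` reaches the driver only through that slot. A reduced, self-contained sketch of the function-pointer injection pattern; every name below is illustrative rather than Paddle's:

```cpp
// Reduced sketch of the dependency inversion used here: the lower layer
// declares a function-pointer slot, the upper layer fills it at startup.
// All names are illustrative, not Paddle's.
#include <cstddef>
#include <stdexcept>

// --- lower layer (plays the role of phi) ---
struct MemoryInterface {
  void (*gpu_memory_usage)(size_t* available, size_t* total) = nullptr;
};

MemoryInterface& Instance() {
  static MemoryInterface instance;
  return instance;
}

size_t AvailableMemToAlloc() {
  if (Instance().gpu_memory_usage == nullptr)
    throw std::runtime_error("gpu_memory_usage is not initialized yet");
  size_t available = 0, total = 0;
  Instance().gpu_memory_usage(&available, &total);
  return available - static_cast<size_t>(0.05f * available);
}

// --- upper layer (plays the role of fluid) ---
void DriverGpuMemoryUsage(size_t* available, size_t* total) {
  *available = 6ULL << 30;  // stand-in for a real driver query
  *total = 8ULL << 30;
}

void InitMemoryMethod() { Instance().gpu_memory_usage = DriverGpuMemoryUsage; }
```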
diff --git a/paddle/phi/common/memory_utils.cc b/paddle/phi/common/memory_utils.cc
index cc1e9d75b6f4a8c97807ab548d85938077b3b936..4a8c8f776b286be03987becc654a2de683b5665c 100644
--- a/paddle/phi/common/memory_utils.cc
+++ b/paddle/phi/common/memory_utils.cc
@@ -68,6 +68,13 @@ int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
   return MemoryUtils::Instance().DeviceMemoryStatCurrentValue(stat_type,
                                                               dev_id);
 }
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+void GpuMemoryUsage(size_t* available, size_t* total) {
+  return MemoryUtils::Instance().GpuMemoryUsage(available, total);
+}
+#endif
+
 }  // namespace memory_utils
 
 }  // namespace phi
diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h
index 6dbf3d5d07acd09b326c3ef5d036dec850c3adb9..180385c6c5255cc4dc6d1d9ffbb4a25d43eda98f 100644
--- a/paddle/phi/common/memory_utils.h
+++ b/paddle/phi/common/memory_utils.h
@@ -113,6 +113,16 @@ struct MemoryInterface {
    */
   int64_t (*device_memory_stat_current_value)(const std::string& stat_type,
                                               int dev_id);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  /**
+   * @brief get the memory usage of current GPU device.
+   *
+   * @param[out] available  device memory available to allocate
+   * @param[out] total      total device memory
+   */
+  void (*gpu_memory_usage)(size_t* available, size_t* total);
+#endif
 };
 
 class MemoryUtils {
@@ -234,6 +244,18 @@ class MemoryUtils {
     return memory_method_->device_memory_stat_current_value(stat_type, dev_id);
   }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void GpuMemoryUsage(size_t* available, size_t* total) {
+    CheckMemoryMethod();
+    PADDLE_ENFORCE_NOT_NULL(
+        memory_method_->gpu_memory_usage,
+        phi::errors::Unavailable(
+            "gpu_memory_usage method in memory_method_ is not initialized "
+            "yet. You need to init it first."));
+    return memory_method_->gpu_memory_usage(available, total);
+  }
+#endif
+
   void CheckMemoryMethod() {
     PADDLE_ENFORCE_NE(
         memory_method_.get(),
@@ -288,7 +310,13 @@ void Copy(const Place& dst_place,
           const Place& src_place,
           const void* src,
           size_t num);
+
 int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+void GpuMemoryUsage(size_t* available, size_t* total);
+#endif
+
 }  // namespace memory_utils
 }  // namespace phi
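With the slot declared and checked, phi-side code can query device memory without any fluid include. A usage sketch, assuming a CUDA/HIP build in which `InitMemoryMethod()` has already run (otherwise the `PADDLE_ENFORCE_NOT_NULL` above fires):

```cpp
// Usage sketch: assumes InitMemoryMethod() has already registered the
// fluid callback and the build defines PADDLE_WITH_CUDA or PADDLE_WITH_HIP.
#include <cstddef>

#include "paddle/phi/common/memory_utils.h"

size_t FreeDeviceMemoryMiB() {
  size_t available = 0;
  size_t total = 0;
  phi::memory_utils::GpuMemoryUsage(&available, &total);
  return available >> 20;  // bytes -> MiB
}
```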
diff --git a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
index 79da6d3b1b4a03b4508683c13c903486e8031973..6e75d40d45174900d7a0078e53b9af7474d2555a 100644
--- a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
+++ b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
@@ -20,7 +20,6 @@ limitations under the License. */
 #include 
 #include 
 
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/kernels/autotune/cache.h"
@@ -53,7 +52,7 @@ static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
         memory_utils::DeviceMemoryStatCurrentValue("Allocated", device_id);
     int64_t reserved =
         memory_utils::DeviceMemoryStatCurrentValue("Reserved", device_id);
-    int64_t availble = paddle::platform::GpuAvailableMemToAlloc();
+    int64_t availble = phi::backends::gpu::GpuAvailableMemToAlloc();
     VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
             << " MB, reserved=" << ToMegaBytes(reserved)
             << " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB.";
diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
index 8570b4be5777813a2d9abc44a4abf084194ed3b7..d81c03ceabb5c663c309670258324b6f0758bcf5 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
@@ -627,7 +627,6 @@ void ConvCudnnGradKernel(const Context& ctx,
       compute_format == phi::backends::gpu::DataLayout::kNHWC
           ? phi::backends::gpu::DataLayout::kNHWC
           : phi::backends::gpu::DataLayout::kNCHW;
-  // TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout
   if (transformed_input.dims().size() == 5) {
     layout = compute_format == phi::backends::gpu::DataLayout::kNHWC
                  ? phi::backends::gpu::DataLayout::kNDHWC
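With the fluid header dropped, the conv autotuning path reads the available-memory figure directly from phi. As a closing illustration of how such a figure can bound a workspace request, a hypothetical helper (not the body of `CalcWorkspaceLimitInBytes`):

```cpp
// Hypothetical helper, not Paddle's CalcWorkspaceLimitInBytes: caps a
// cuDNN workspace request by the queryable available-to-alloc figure.
#include <algorithm>
#include <cstddef>

#include "paddle/phi/backends/gpu/gpu_info.h"

size_t ClampWorkspace(size_t requested_bytes) {
  size_t available = phi::backends::gpu::GpuAvailableMemToAlloc();
  // Never hand cuDNN more scratch space than the allocator could satisfy.
  return std::min(requested_bytes, available);
}
```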