Unverified commit 907433a7, authored by Huang Jiyi, committed by GitHub

[phi decoupling] remove fluid gpu_info usage in phi (#51699)

* remove fluid thread_data_registry

* update

* fix bug
Parent 3f3372b6
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <thread>
#include <type_traits>
#include <unordered_map>
namespace paddle {
namespace framework {
template <typename T>
class ThreadDataRegistry {
public:
// Singleton
static ThreadDataRegistry& GetInstance() {
static ThreadDataRegistry instance;
return instance;
}
T* GetMutableCurrentThreadData() { return &CurrentThreadData(); }
const T& GetCurrentThreadData() { return CurrentThreadData(); }
template <typename Alias = T,
typename = std::enable_if_t<std::is_copy_assignable<Alias>::value>>
void SetCurrentThreadData(const T& val) {
CurrentThreadData() = val;
}
// Returns a snapshot of the data of all threads. Make sure no thread is
// created or destroyed while using it.
template <
typename Alias = T,
typename = std::enable_if_t<std::is_copy_constructible<Alias>::value>>
std::unordered_map<uint64_t, T> GetAllThreadDataByValue() {
return impl_->GetAllThreadDataByValue();
}
// Returns a snapshot of the data of all threads. Make sure no thread is
// created or destroyed while using it.
std::unordered_map<uint64_t, std::reference_wrapper<T>>
GetAllThreadDataByRef() {
return impl_->GetAllThreadDataByRef();
}
private:
// types
// Lock types
#if defined(__clang__) || defined(__GNUC__) // CLANG or GCC
#ifndef __APPLE__
#if __cplusplus >= 201703L
using LockType = std::shared_mutex;
using SharedLockGuardType = std::shared_lock<std::shared_mutex>;
#elif __cplusplus >= 201402L
using LockType = std::shared_timed_mutex;
using SharedLockGuardType = std::shared_lock<std::shared_timed_mutex>;
#else
using LockType = std::mutex;
using SharedLockGuardType = std::lock_guard<std::mutex>;
#endif
// Special case : mac. https://github.com/facebook/react-native/issues/31250
#else
using LockType = std::mutex;
using SharedLockGuardType = std::lock_guard<std::mutex>;
#endif
#elif defined(_MSC_VER) // MSVC
#if _MSVC_LANG >= 201703L
using LockType = std::shared_mutex;
using SharedLockGuardType = std::shared_lock<std::shared_mutex>;
#elif _MSVC_LANG >= 201402L
using LockType = std::shared_timed_mutex;
using SharedLockGuardType = std::shared_lock<std::shared_timed_mutex>;
#else
using LockType = std::mutex;
using SharedLockGuardType = std::lock_guard<std::mutex>;
#endif
#else // other compilers
using LockType = std::mutex;
using SharedLockGuardType = std::lock_guard<std::mutex>;
#endif
class ThreadDataHolder;
class ThreadDataRegistryImpl {
public:
void RegisterData(uint64_t tid, ThreadDataHolder* tls_obj) {
std::lock_guard<LockType> guard(lock_);
tid_map_[tid] = tls_obj;
}
void UnregisterData(uint64_t tid) {
std::lock_guard<LockType> guard(lock_);
tid_map_.erase(tid);
}
template <
typename Alias = T,
typename = std::enable_if_t<std::is_copy_constructible<Alias>::value>>
std::unordered_map<uint64_t, T> GetAllThreadDataByValue() {
std::unordered_map<uint64_t, T> data_copy;
SharedLockGuardType guard(lock_);
data_copy.reserve(tid_map_.size());
for (auto& kv : tid_map_) {
data_copy.emplace(kv.first, kv.second->GetData());
}
return data_copy;
}
std::unordered_map<uint64_t, std::reference_wrapper<T>>
GetAllThreadDataByRef() {
std::unordered_map<uint64_t, std::reference_wrapper<T>> data_ref;
SharedLockGuardType guard(lock_);
data_ref.reserve(tid_map_.size());
for (auto& kv : tid_map_) {
data_ref.emplace(kv.first, std::ref(kv.second->GetData()));
}
return data_ref;
}
private:
LockType lock_;
std::unordered_map<uint64_t, ThreadDataHolder*> tid_map_; // not owned
};
class ThreadDataHolder {
public:
explicit ThreadDataHolder(
std::shared_ptr<ThreadDataRegistryImpl> registry) {
registry_ = std::move(registry);
tid_ = std::hash<std::thread::id>()(std::this_thread::get_id());
registry_->RegisterData(tid_, this);
}
~ThreadDataHolder() { registry_->UnregisterData(tid_); }
T& GetData() { return data_; }
private:
std::shared_ptr<ThreadDataRegistryImpl> registry_;
uint64_t tid_;
T data_;
};
// methods
ThreadDataRegistry() { impl_ = std::make_shared<ThreadDataRegistryImpl>(); }
ThreadDataRegistry(const ThreadDataRegistry&) = delete;
ThreadDataRegistry& operator=(const ThreadDataRegistry&) = delete;
T& CurrentThreadData() {
static thread_local ThreadDataHolder thread_data(impl_);
return thread_data.GetData();
}
// data
std::shared_ptr<ThreadDataRegistryImpl> impl_;
};
} // namespace framework
} // namespace paddle
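For readers unfamiliar with this class, here is a minimal usage sketch of ThreadDataRegistry, assuming an int64_t per-thread payload. The example is illustrative and not part of the commit; after this change the same template is provided as phi::ThreadDataRegistry in paddle/phi/common/thread_data_registry.h.

// Illustrative usage sketch (not part of this commit): per-thread counters
// aggregated through the registry. The int64_t payload is an assumption.
#include <cstdint>

#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"

using paddle::framework::ThreadDataRegistry;

void BumpCounter() {
  // Each thread owns its slot; no cross-thread lock is taken on this path.
  auto* counter =
      ThreadDataRegistry<int64_t>::GetInstance().GetMutableCurrentThreadData();
  *counter += 1;
}

int64_t SumAllThreads() {
  // Snapshot by value; callers must ensure no thread is created or destroyed
  // while the snapshot is taken, as the comments in the header require.
  int64_t sum = 0;
  for (const auto& kv :
       ThreadDataRegistry<int64_t>::GetInstance().GetAllThreadDataByValue()) {
    sum += kv.second;
  }
  return sum;
}

int main() {
  BumpCounter();
  BumpCounter();
  // Only the main thread is registered here, so the snapshot sums to 2.
  return SumAllThreads() == 2 ? 0 : 1;
}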
@@ -18,15 +18,15 @@ limitations under the License. */
 #include <map>
 #include <string>

-#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/macros.h"
+#include "paddle/phi/common/thread_data_registry.h"

 namespace paddle {
 namespace memory {

-using framework::ThreadDataRegistry;
+using phi::ThreadDataRegistry;

 struct ThreadLocalStatBase {
   int64_t current{0};
......
@@ -61,8 +61,6 @@ PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb,
                             "Whether to print the message of gpu memory usage "
                             "MB as a unit of measurement.");

-constexpr static float fraction_reserve_gpu_memory = 0.05f;
-
 USE_GPU_MEM_STAT;
 namespace paddle {
 namespace platform {
@@ -77,20 +75,7 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
 }

 size_t GpuAvailableMemToAlloc() {
-  size_t total = 0;
-  size_t available = 0;
-  GpuMemoryUsage(&available, &total);
-  size_t reserving =
-      static_cast<size_t>(fraction_reserve_gpu_memory * available);
-  // If available size is less than minimum chunk size, no usable memory exists
-  size_t available_to_alloc = available - reserving;
-  size_t min_chunk_size = GpuMinChunkSize();
-  if (available_to_alloc < min_chunk_size) {
-    available_to_alloc = 0;
-  }
-  VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
-           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
-  return available_to_alloc;
+  return phi::backends::gpu::GpuAvailableMemToAlloc();
 }

 size_t GpuMaxAllocSize() {
@@ -124,6 +109,8 @@ size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }

 size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }

+size_t GpuMinChunkSize() { return phi::backends::gpu::GpuMinChunkSize(); }
+
 size_t GpuMaxChunkSize() {
   size_t max_chunk_size = GpuMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
......
@@ -82,7 +82,8 @@ size_t GpuInitAllocSize();
 //! Get the re-allocation size of current GPU device.
 size_t GpuReallocSize();

-using phi::backends::gpu::GpuMinChunkSize;
+//! Get the minimum chunk size for GPU buddy allocator.
+size_t GpuMinChunkSize();

 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
......
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_info.h"
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #endif
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cupti.h"
@@ -468,6 +469,9 @@ void InitMemoryMethod() {
     memory_method->copy = paddle::memory::Copy<phi::Place, phi::Place>;
     memory_method->device_memory_stat_current_value =
         paddle::memory::DeviceMemoryStatCurrentValue;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage;
+#endif
     memory_utils.Init(std::move(memory_method));
   });
 }
......
@@ -18,7 +18,6 @@
 #include <type_traits>
 #include <vector>

-#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/os_info.h"
 #include "paddle/phi/api/profiler/host_event_recorder.h"
......
@@ -18,6 +18,9 @@ limitations under the License. */
 #include <vector>

 #include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "paddle/phi/common/memory_utils.h"

 DECLARE_string(selected_gpus);
@@ -56,6 +59,30 @@ std::vector<int> GetSelectedDevices() {
   return devices;
 }

+constexpr static float fraction_reserve_gpu_memory = 0.05f;
+
+size_t GpuAvailableMemToAlloc() {
+  size_t total = 0;
+  size_t available = 0;
+  memory_utils::GpuMemoryUsage(&available, &total);
+  size_t reserving =
+      static_cast<size_t>(fraction_reserve_gpu_memory * available);
+  // If available size is less than minimum chunk size, no usable memory exists
+  size_t available_to_alloc = available - reserving;
+  size_t min_chunk_size = GpuMinChunkSize();
+  if (available_to_alloc < min_chunk_size) {
+    available_to_alloc = 0;
+  }
+  VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
+           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
+  return available_to_alloc;
+}
+
+size_t GpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 256 bytes.
+  return 1 << 8;
+}
+
 } // namespace gpu
 } // namespace backends
 } // namespace phi
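As a concrete illustration of the reservation logic moved into phi above (numbers are illustrative, not from the commit): if memory_utils::GpuMemoryUsage reports 8 GiB available, the function reserves 5% of it (about 410 MiB) and returns roughly 7.6 GiB as allocatable; the result only drops to 0 when what remains after the reservation is smaller than the 256-byte minimum chunk size.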
@@ -70,11 +70,12 @@ const gpuDeviceProp &GetDeviceProperties(int id);
 //! Set the GPU device id for next execution.
 void SetDeviceId(int device_id);

+//! Get the available memory to allocate, which is the size of available gpu
+//! minus reserving.
+size_t GpuAvailableMemToAlloc();
+
 //! Get the minimum chunk size for GPU buddy allocator.
-inline size_t GpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
+size_t GpuMinChunkSize();

 //! Copy memory from address src to dst asynchronously.
 void GpuMemcpyAsync(void *dst,
......
@@ -68,6 +68,13 @@ int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
   return MemoryUtils::Instance().DeviceMemoryStatCurrentValue(stat_type,
                                                               dev_id);
 }

+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+void GpuMemoryUsage(size_t* available, size_t* total) {
+  return MemoryUtils::Instance().GpuMemoryUsage(available, total);
+}
+#endif
+
 } // namespace memory_utils
 } // namespace phi
@@ -113,6 +113,16 @@ struct MemoryInterface {
    */
   int64_t (*device_memory_stat_current_value)(const std::string& stat_type,
                                               int dev_id);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  /**
+   * @brief get the memory usage of current GPU device.
+   *
+   * @param[size_t] available  device available memory to alloc
+   * @param[size_t] total      device total memory
+   */
+  void (*gpu_memory_usage)(size_t* available, size_t* total);
+#endif
 };

 class MemoryUtils {
@@ -234,6 +244,18 @@ class MemoryUtils {
     return memory_method_->device_memory_stat_current_value(stat_type, dev_id);
   }

+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void GpuMemoryUsage(size_t* available, size_t* total) {
+    CheckMemoryMethod();
+    PADDLE_ENFORCE_NOT_NULL(
+        memory_method_->gpu_memory_usage,
+        phi::errors::Unavailable(
+            "gpu_memory_usage method in memory_method_ is not initiazed "
+            "yet. You need init it first."));
+    return memory_method_->gpu_memory_usage(available, total);
+  }
+#endif
+
   void CheckMemoryMethod() {
     PADDLE_ENFORCE_NE(
         memory_method_.get(),
@@ -288,7 +310,13 @@ void Copy(const Place& dst_place,
           const Place& src_place,
           const void* src,
           size_t num);

 int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+void GpuMemoryUsage(size_t* available, size_t* total);
+#endif

 } // namespace memory_utils
 } // namespace phi
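The memory_utils changes above follow the function-pointer registration pattern phi already uses for its other memory hooks: phi declares a slot in MemoryInterface, fluid fills that slot inside InitMemoryMethod (see the init hunk earlier), and phi call sites go through the MemoryUtils wrapper instead of depending on fluid directly. A stripped-down, self-contained sketch of that pattern, with simplified names and error handling (not the actual Paddle code), looks like this:

// Minimal sketch of the phi/fluid decoupling pattern, assuming simplified
// names; the real code lives in paddle/phi/common/memory_utils.h and
// paddle/fluid/platform/init.cc.
#include <cstddef>
#include <cstdio>
#include <stdexcept>

// --- "phi" side: knows only an interface, no fluid dependency --------------
struct MemoryInterface {
  void (*gpu_memory_usage)(size_t* available, size_t* total) = nullptr;
};

class MemoryUtils {
 public:
  static MemoryUtils& Instance() {
    static MemoryUtils instance;
    return instance;
  }
  void Init(MemoryInterface iface) { iface_ = iface; }
  void GpuMemoryUsage(size_t* available, size_t* total) {
    // Mirrors the PADDLE_ENFORCE_NOT_NULL check in the hunk above.
    if (iface_.gpu_memory_usage == nullptr) {
      throw std::runtime_error("gpu_memory_usage is not initialized yet.");
    }
    iface_.gpu_memory_usage(available, total);
  }

 private:
  MemoryInterface iface_;
};

// --- "fluid" side: provides the real implementation and registers it -------
void FluidGpuMemoryUsage(size_t* available, size_t* total) {
  // Placeholder for the real driver query (e.g. cudaMemGetInfo).
  *available = 6ULL << 30;
  *total = 8ULL << 30;
}

void InitMemoryMethod() {
  MemoryInterface iface;
  iface.gpu_memory_usage = FluidGpuMemoryUsage;
  MemoryUtils::Instance().Init(iface);
}

// --- phi code can now query GPU memory without linking against fluid -------
int main() {
  InitMemoryMethod();
  size_t available = 0, total = 0;
  MemoryUtils::Instance().GpuMemoryUsage(&available, &total);
  std::printf("available=%zu total=%zu\n", available, total);
  return 0;
}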
@@ -20,7 +20,6 @@ limitations under the License. */
 #include <string>
 #include <vector>

-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/kernels/autotune/cache.h"
@@ -53,7 +52,7 @@ static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
       memory_utils::DeviceMemoryStatCurrentValue("Allocated", device_id);
   int64_t reserved =
       memory_utils::DeviceMemoryStatCurrentValue("Reserved", device_id);
-  int64_t availble = paddle::platform::GpuAvailableMemToAlloc();
+  int64_t availble = phi::backends::gpu::GpuAvailableMemToAlloc();
   VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
           << " MB, reserved=" << ToMegaBytes(reserved)
           << " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB.";
......
@@ -627,7 +627,6 @@ void ConvCudnnGradKernel(const Context& ctx,
       compute_format == phi::backends::gpu::DataLayout::kNHWC
           ? phi::backends::gpu::DataLayout::kNHWC
           : phi::backends::gpu::DataLayout::kNCHW;
-  // TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout
   if (transformed_input.dims().size() == 5) {
     layout = compute_format == phi::backends::gpu::DataLayout::kNHWC
                  ? phi::backends::gpu::DataLayout::kNDHWC
......