diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt
index bdf8325d15056527be5f8cbec09b2eb7b4c28838..827b039a109cf753170bb738cf08016abd9840f8 100644
--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_subdirectory(detail)
 add_subdirectory(allocation)
-cc_library(malloc SRCS malloc.cc DEPS allocator_facade)
+cc_library(malloc SRCS malloc.cc DEPS buddy_allocator place enforce allocator_facade)
 cc_library(memcpy SRCS memcpy.cc DEPS place)
 
 cc_library(memory
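Note on the CMakeLists.txt change: malloc.cc now constructs detail::BuddyAllocator and the detail system allocators directly instead of reaching memory only through allocator_facade, so buddy_allocator, place, and enforce become direct link dependencies. Each legacy allocator in the malloc.cc diff below is a lazily built singleton guarded by std::call_once. A minimal standalone sketch of that initialization pattern, with hypothetical names (FakeAllocator is not a Paddle type):

```cpp
#include <mutex>

// Hypothetical stand-in for detail::BuddyAllocator.
struct FakeAllocator {
  explicit FakeAllocator(int id) : id(id) {}
  int id;
};

// The same lazy, thread-safe singleton shape as GetCPUBuddyAllocator():
// the first caller runs the initializer exactly once; concurrent and
// later callers just read the pointer.
FakeAllocator* GetSingletonAllocator() {
  static std::once_flag init_flag;
  static FakeAllocator* instance = nullptr;
  std::call_once(init_flag, []() { instance = new FakeAllocator(0); });
  return instance;  // intentionally leaked: lives for the whole process
}

int main() { return GetSingletonAllocator()->id; }
```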
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
index 4f289f753798730f6eaa1a3150b1179d19872928..fd81a0a7c6e2fb6b25c9b94c60f75d4bbdde7441 100644
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -18,6 +18,10 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/malloc.h"
 
+#include "paddle/fluid/memory/detail/buddy_allocator.h"
+#include "paddle/fluid/memory/detail/system_allocator.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
 DEFINE_bool(init_allocated_mem, false,
             "It is a mistake that the values of the memory allocated by "
             "BuddyAllocator are always zeroed in some op's implementation. "
@@ -26,17 +30,262 @@ DEFINE_bool(init_allocated_mem, false,
             "during unit testing.");
 
 DECLARE_double(fraction_of_gpu_memory_to_use);
 
+DEFINE_bool(use_legacy_allocator, true,
+            "Whether to use the legacy allocator. If the new allocators have "
+            "been well tested, we should remove this flag.");
+
 namespace paddle {
 namespace memory {
+namespace legacy {
+
+using BuddyAllocator = detail::BuddyAllocator;
+
+BuddyAllocator* GetCPUBuddyAllocator() {
+  // We tried thread_local for the inference::RNN1 model, but it did not
+  // help much in multi-threaded tests.
+  static std::once_flag init_flag;
+  static detail::BuddyAllocator* a = nullptr;
+
+  std::call_once(init_flag, []() {
+    a = new detail::BuddyAllocator(
+        std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
+        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
+  });
+
+  return a;
+}
+
+// We compared NaiveAllocator with BuddyAllocator for CPU memory allocation;
+// they seem to have almost the same overhead.
+struct NaiveAllocator {
+  void* Alloc(size_t size) { return malloc(size); }
+
+  void Free(void* p) {
+    PADDLE_ENFORCE(p);
+    free(p);
+  }
+
+  static NaiveAllocator* Instance() {
+    static NaiveAllocator x;
+    return &x;
+  }
+
+ private:
+  std::mutex lock_;
+};
+
+template <>
+void* Alloc<platform::CPUPlace>(const platform::CPUPlace& place, size_t size) {
+  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+  void* p = GetCPUBuddyAllocator()->Alloc(size);
+  if (FLAGS_init_allocated_mem) {
+    memset(p, 0xEF, size);
+  }
+  VLOG(10) << "  pointer=" << p;
+  return p;
+}
+
+template <>
+void Free<platform::CPUPlace>(const platform::CPUPlace& place, void* p) {
+  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+  GetCPUBuddyAllocator()->Free(p);
+}
+
+template <>
+size_t Used<platform::CPUPlace>(const platform::CPUPlace& place) {
+  return GetCPUBuddyAllocator()->Used();
+}
+
+#ifdef PADDLE_WITH_CUDA
+
+BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+  static std::once_flag init_flag;
+  static detail::BuddyAllocator** a_arr = nullptr;
+
+  std::call_once(init_flag, [gpu_id]() {
+    int gpu_num = platform::GetCUDADeviceCount();
+    PADDLE_ENFORCE(gpu_id < gpu_num,
+                   "gpu_id:%d should be less than gpu_num:%d", gpu_id,
+                   gpu_num);
+
+    a_arr = new BuddyAllocator*[gpu_num];
+    for (int i = 0; i < gpu_num; i++) {
+      a_arr[i] = nullptr;
+      platform::SetDeviceId(i);
+      a_arr[i] = new BuddyAllocator(
+          std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
+          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+      VLOG(10) << "\n\nNOTE: each GPU device uses "
+               << FLAGS_fraction_of_gpu_memory_to_use * 100
+               << "% of GPU memory.\n"
+               << "You can set GFlags environment variable '"
+               << "FLAGS_fraction_of_gpu_memory_to_use"
+               << "' to change the fraction of GPU usage.\n\n";
+    }
+  });
+
+  platform::SetDeviceId(gpu_id);
+  return a_arr[gpu_id];
+}
+
+template <>
+size_t Used<platform::CUDAPlace>(const platform::CUDAPlace& place) {
+  return GetGPUBuddyAllocator(place.device)->Used();
+}
+
+template <>
+void* Alloc<platform::CUDAPlace>(const platform::CUDAPlace& place,
+                                 size_t size) {
+  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+  auto* ptr = buddy_allocator->Alloc(size);
+  if (ptr == nullptr) {
+    int cur_dev = platform::GetCurrentDeviceId();
+    platform::SetDeviceId(place.device);
+    size_t avail, total;
+    platform::GpuMemoryUsage(&avail, &total);
+    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
+                 << place.device << ", available " << avail << " bytes";
+    LOG(WARNING) << "total " << total;
+    LOG(WARNING) << "GpuMinChunkSize " << buddy_allocator->GetMinChunkSize();
+    LOG(WARNING) << "GpuMaxChunkSize " << buddy_allocator->GetMaxChunkSize();
+    LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
+    platform::SetDeviceId(cur_dev);
+  }
+  if (FLAGS_init_allocated_mem) {
+    cudaMemset(ptr, 0xEF, size);
+  }
+  return ptr;
+}
+
+template <>
+void Free<platform::CUDAPlace>(const platform::CUDAPlace& place, void* p) {
+  GetGPUBuddyAllocator(place.device)->Free(p);
+}
+
+BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
+  static std::once_flag init_flag;
+  static BuddyAllocator* ba = nullptr;
+
+  std::call_once(init_flag, []() {
+    ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
+                                new detail::CUDAPinnedAllocator),
+                            platform::CUDAPinnedMinChunkSize(),
+                            platform::CUDAPinnedMaxChunkSize());
+  });
+
+  return ba;
+}
+
+template <>
+size_t Used<platform::CUDAPinnedPlace>(
+    const platform::CUDAPinnedPlace& place) {
+  return GetCUDAPinnedBuddyAllocator()->Used();
+}
+
+template <>
+void* Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
+                                       size_t size) {
+  auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
+  void* ptr = buddy_allocator->Alloc(size);
+
+  if (ptr == nullptr) {
+    LOG(WARNING) << "cudaMallocHost cannot allocate " << size
+                 << " bytes in CUDAPinnedPlace";
+  }
+  if (FLAGS_init_allocated_mem) {
+    memset(ptr, 0xEF, size);
+  }
+  return ptr;
+}
+
+template <>
+void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
+                                     void* p) {
+  GetCUDAPinnedBuddyAllocator()->Free(p);
+}
+#endif
+
+struct AllocVisitor : public boost::static_visitor<void*> {
+  inline explicit AllocVisitor(size_t size) : size_(size) {}
+
+  template <typename Place>
+  inline void* operator()(const Place& place) const {
+    return Alloc<Place>(place, size_);
+  }
+
+ private:
+  size_t size_;
+};
+
+struct FreeVisitor : public boost::static_visitor<void> {
+  inline explicit FreeVisitor(void* ptr) : ptr_(ptr) {}
+
+  template <typename Place>
+  inline void operator()(const Place& place) const {
+    Free<Place>(place, ptr_);
+  }
+
+ private:
+  void* ptr_;
+};
+
+size_t Usage::operator()(const platform::CPUPlace& cpu) const {
+  return Used(cpu);
+}
+
+size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
+#ifdef PADDLE_WITH_CUDA
+  return Used(gpu);
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
+}
+
+size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
+#ifdef PADDLE_WITH_CUDA
+  return Used(cuda_pinned);
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
+}
+
+size_t memory_usage(const platform::Place& p) {
+  return boost::apply_visitor(Usage(), p);
+}
+
+class LegacyAllocation : public Allocation {
+ public:
+  using Allocation::Allocation;
+
+  ~LegacyAllocation() {
+    boost::apply_visitor(FreeVisitor(this->ptr()), this->place());
+  }
+};
+
+}  // namespace legacy
+
 std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                         size_t size, Allocator::Attr attr) {
-  return allocation::AllocatorFacade::Instance().AllocShared(place, size, attr);
+  if (FLAGS_use_legacy_allocator) {
+    void* p = boost::apply_visitor(legacy::AllocVisitor(size), place);
+    return std::shared_ptr<Allocation>(
+        new legacy::LegacyAllocation(p, size, place));
+  } else {
+    return allocation::AllocatorFacade::Instance().AllocShared(place, size,
+                                                               attr);
+  }
 }
 
 std::unique_ptr<Allocation> Alloc(const platform::Place& place, size_t size,
                                   Allocator::Attr attr) {
-  return allocation::AllocatorFacade::Instance().Alloc(place, size, attr);
+  if (FLAGS_use_legacy_allocator) {
+    void* p = boost::apply_visitor(legacy::AllocVisitor(size), place);
+    return std::unique_ptr<Allocation>(
+        new legacy::LegacyAllocation(p, size, place));
+  } else {
+    return allocation::AllocatorFacade::Instance().Alloc(place, size, attr);
+  }
 }
+
 }  // namespace memory
 }  // namespace paddle
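The key pieces of the malloc.cc change are the per-place Alloc/Free/Used specializations, the visitors that dispatch a runtime platform::Place to the right specialization, and LegacyAllocation, whose destructor re-applies a FreeVisitor to the stored place so memory is returned through the same path it was allocated from. Below is a self-contained analogue of that destructor-driven dispatch, using std::variant/std::visit (C++17) in place of boost::variant/boost::apply_visitor; every name here is hypothetical, not a Paddle API:

```cpp
#include <cstddef>
#include <cstdlib>
#include <variant>

struct CPUPlace {};
struct GPUPlace { int device; };
using Place = std::variant<CPUPlace, GPUPlace>;

// Stand-ins for the per-place legacy Alloc/Free specializations.
void* RawAlloc(const CPUPlace&, std::size_t n) { return std::malloc(n); }
void* RawAlloc(const GPUPlace&, std::size_t n) { return std::malloc(n); }
void RawFree(const CPUPlace&, void* p) { std::free(p); }
void RawFree(const GPUPlace&, void* p) { std::free(p); }

// Like LegacyAllocation: remembers where the memory came from and
// dispatches on that place again in the destructor to free it.
class ScopedAllocation {
 public:
  ScopedAllocation(const Place& place, std::size_t size)
      : place_(place),
        ptr_(std::visit(
            [size](const auto& p) { return RawAlloc(p, size); }, place)) {}

  ScopedAllocation(const ScopedAllocation&) = delete;
  ScopedAllocation& operator=(const ScopedAllocation&) = delete;

  ~ScopedAllocation() {
    std::visit([this](const auto& p) { RawFree(p, ptr_); }, place_);
  }

  void* ptr() const { return ptr_; }

 private:
  Place place_;
  void* ptr_;
};

int main() {
  ScopedAllocation a(CPUPlace{}, 256);   // freed via the CPU path
  ScopedAllocation b(GPUPlace{0}, 256);  // freed via the GPU path
  return (a.ptr() != nullptr && b.ptr() != nullptr) ? 0 : 1;
}
```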
diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h
index 061ca97dd8e557c7b54decda77cf0ea258483da3..d026bd4bcd5c007d65296e65df3931058ea554cf 100644
--- a/paddle/fluid/memory/malloc.h
+++ b/paddle/fluid/memory/malloc.h
@@ -30,5 +30,26 @@ extern std::unique_ptr<Allocation> Alloc(
     const platform::Place& place, size_t size,
     Allocator::Attr attr = Allocator::kDefault);
 
+namespace legacy {
+
+template <typename Place>
+void* Alloc(const Place& place, size_t size);
+
+template <typename Place>
+void Free(const Place& place, void* p);
+
+template <typename Place>
+size_t Used(const Place& place);
+
+struct Usage : public boost::static_visitor<size_t> {
+  size_t operator()(const platform::CPUPlace& cpu) const;
+  size_t operator()(const platform::CUDAPlace& gpu) const;
+  size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
+};
+
+size_t memory_usage(const platform::Place& p);
+
+}  // namespace legacy
+
 }  // namespace memory
 }  // namespace paddle
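The header declares the legacy entry points as function templates with one explicit specialization per place type (the specializations live in malloc.cc), so callers with a compile-time-known place skip the variant dispatch entirely. A toy sketch of that declared-but-only-specialized template pattern, with hypothetical place types:

```cpp
#include <cstddef>
#include <cstdlib>
#include <iostream>

struct CPUPlace {};
struct PinnedPlace {};

// Primary template: declared, never defined. Only places with an
// explicit specialization below can be allocated from, mirroring how
// legacy::Alloc<Place> is declared in malloc.h and specialized per
// place in malloc.cc.
template <typename Place>
void* Alloc(const Place& place, std::size_t size);

template <>
void* Alloc<CPUPlace>(const CPUPlace&, std::size_t size) {
  std::cout << "cpu alloc of " << size << " bytes\n";
  return std::malloc(size);
}

template <>
void* Alloc<PinnedPlace>(const PinnedPlace&, std::size_t size) {
  // Stand-in for a pinned-memory allocator such as cudaMallocHost.
  std::cout << "pinned alloc of " << size << " bytes\n";
  return std::malloc(size);
}

int main() {
  void* p = Alloc(CPUPlace{}, 64);     // resolves at compile time
  void* q = Alloc(PinnedPlace{}, 64);  // no runtime place dispatch
  std::free(p);
  std::free(q);
  return 0;
}
```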
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index f0032ab0fae78b78501f222011fb09f31446e93f..ea1086cd4d0b71c8075d6895a68935a4f098fbc3 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -113,7 +113,7 @@ def __bootstrap__():
         'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope',
         'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
         'paddle_num_threads', "dist_threadpool_size", 'cpu_deterministic',
-        'eager_delete_tensor_gb'
+        'eager_delete_tensor_gb', 'use_legacy_allocator'
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
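The Python change registers 'use_legacy_allocator' in read_env_flags, so __bootstrap__ can pick up a FLAGS_use_legacy_allocator environment variable and hand it to gflags, where it lands in the DEFINE_bool added in malloc.cc. A minimal sketch of that gflags round trip, assuming the standard gflags library (the flag is redefined here only for illustration):

```cpp
#include <gflags/gflags.h>
#include <iostream>

// Toy redefinition of the flag for illustration only; in Paddle it is
// defined in paddle/fluid/memory/malloc.cc.
DEFINE_bool(use_legacy_allocator, true,
            "Illustrative copy of the real flag.");

int main(int argc, char* argv[]) {
  // Paddle's __bootstrap__ builds argv-style strings such as
  // "--use_legacy_allocator=false" from FLAGS_* environment variables
  // before handing them to gflags; plain command-line parsing shown here.
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  std::cout << std::boolalpha << FLAGS_use_legacy_allocator << std::endl;
  return 0;
}
```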