From 236b7dd2bde254f83479ca632756b4dfaa1b8bdc Mon Sep 17 00:00:00 2001
From: chengduoZH
Date: Tue, 20 Mar 2018 14:28:07 +0800
Subject: [PATCH] add pinned memory

---
 .../fluid/memory/detail/system_allocator.cc   | 41 ++++++++++++++
 paddle/fluid/memory/detail/system_allocator.h | 12 +++++
 paddle/fluid/memory/memory.cc                 | 53 ++++++++++++++++---
 paddle/fluid/memory/memory.h                  | 12 +++--
 4 files changed, 107 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 8ac8978120..df9d28ede8 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -119,6 +119,47 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) {
 
 bool GPUAllocator::UseGpu() const { return true; }
 
+void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
+  if (size <= 0) return nullptr;
+  void* p;
+  // NOTE: here, we use GpuMaxAllocSize() as the maximum size of the
+  // host fallback allocation. Allocating too much pinned memory would
+  // reduce the amount available to the underlying system for paging.
+
+  size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
+
+  if (size > usable) return nullptr;
+
+  cudaError_t result = cudaMallocHost(&p, size);
+  if (result == cudaSuccess) {
+    index = 1;
+    fallback_alloc_size_ += size;
+    return p;
+  }
+
+  return nullptr;
+}
+
+void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
+  cudaError_t err;
+  PADDLE_ASSERT(index == 1);
+
+  PADDLE_ASSERT(fallback_alloc_size_ >= size);
+  fallback_alloc_size_ -= size;
+  err = cudaFreeHost(p);
+
+  // Purposefully allow cudaErrorCudartUnloading, because
+  // that is returned if you ever call cudaFreeHost after the
+  // driver has already shut down. This happens only if the
+  // process is terminating, in which case we don't care if
+  // cudaFreeHost succeeds.
+  if (err != cudaErrorCudartUnloading) {
+    PADDLE_ENFORCE(err, "cudaFreeHost failed in CUDAPinnedAllocator::Free.");
+  }
+}
+
+bool CUDAPinnedAllocator::UseGpu() const { return true; }
+
 #endif
 
 }  // namespace detail
diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h
index e93c2c1e32..3e024125fa 100644
--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
@@ -51,6 +51,18 @@ class GPUAllocator : public SystemAllocator {
   size_t gpu_alloc_size_ = 0;
   size_t fallback_alloc_size_ = 0;
 };
+
+class CUDAPinnedAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+
+ private:
+  // TODO(zcd): how to define the upper limit of CUDAPinnedMemory?
+  size_t gpu_alloc_size_ = 0;
+  size_t fallback_alloc_size_ = 0;
+};
 #endif
 
 }  // namespace detail
diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc
index d07f89439a..c5577587aa 100644
--- a/paddle/fluid/memory/memory.cc
+++ b/paddle/fluid/memory/memory.cc
@@ -38,7 +38,8 @@ BuddyAllocator* GetCPUBuddyAllocator() {
 }
 
 template <>
-void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
+void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
+                                bool use_pinned) {
   VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
   void* p = GetCPUBuddyAllocator()->Alloc(size);
   VLOG(10) << "  pointer=" << p;
@@ -46,7 +47,8 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
 }
 
 template <>
-void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
+void Free<platform::CPUPlace>(platform::CPUPlace place, void* p,
+                              bool use_pinned) {
   VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
   GetCPUBuddyAllocator()->Free(p);
 }
@@ -82,15 +84,47 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   return as[gpu_id];
 }
 
+BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
+  static BuddyAllocator** as = NULL;
+  if (as == NULL) {
+    int gpu_num = platform::GetCUDADeviceCount();
+    as = new BuddyAllocator*[gpu_num];
+    for (int gpu = 0; gpu < gpu_num; gpu++) {
+      as[gpu] = nullptr;
+    }
+  }
+  platform::SetDeviceId(gpu_id);
+  if (!as[gpu_id]) {
+    as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator,
+                                    platform::GpuMinChunkSize(),
+                                    platform::GpuMaxChunkSize());
+    VLOG(10) << "\n\nNOTE: each GPU device use "
+             << FLAGS_fraction_of_gpu_memory_to_use * 100
+             << "% of GPU memory.\n"
+             << "You can set GFlags environment variable '"
+             << "FLAGS_fraction_of_gpu_memory_to_use"
+             << "' to change the fraction of GPU usage.\n\n";
+  }
+  return as[gpu_id];
+}
+
 template <>
 size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
   return GetGPUBuddyAllocator(place.device)->Used();
 }
 
 template <>
-void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
-  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
-  auto* ptr = buddy_allocator->Alloc(size);
+void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
+                                 bool use_pinned) {
+  void* ptr;
+  if (use_pinned) {
+    auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
+    ptr = buddy_allocator->Alloc(size);
+  } else {
+    auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+    ptr = buddy_allocator->Alloc(size);
+  }
+
   if (ptr == nullptr) {
     int cur_dev = platform::GetCurrentDeviceId();
     platform::SetDeviceId(place.device);
@@ -108,8 +142,13 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
 }
 
 template <>
-void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
-  GetGPUBuddyAllocator(place.device)->Free(p);
+void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p,
+                               bool use_pinned) {
+  if (use_pinned) {
+    GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
+  } else {
+    GetGPUBuddyAllocator(place.device)->Free(p);
+  }
 }
 
 #endif
diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h
index 7c5db815d6..9bc48ac68f 100644
--- a/paddle/fluid/memory/memory.h
+++ b/paddle/fluid/memory/memory.h
@@ -33,7 +33,7 @@ namespace memory {
  * address is valid or not.
  */
 template <typename Place>
-void* Alloc(Place place, size_t size);
+void* Alloc(Place place, size_t size, bool use_pinned = false);
 
 /**
  * \brief   Free memory block in one place.
@@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size);
  *
  */
 template <typename Place>
-void Free(Place place, void* ptr);
+void Free(Place place, void* ptr, bool use_pinned = false);
 
 /**
  * \brief   Total size of used memory in one place.
@@ -74,11 +74,15 @@ class PODDeleter {
   static_assert(std::is_pod<T>::value, "T must be POD");
 
  public:
-  explicit PODDeleter(Place place) : place_(place) {}
-  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
+  explicit PODDeleter(Place place, bool use_pinned = false)
+      : place_(place), use_pinned_(use_pinned) {}
+  void operator()(T* ptr) {
+    Free(place_, static_cast<void*>(ptr), use_pinned_);
+  }
 
  private:
   Place place_;
+  bool use_pinned_;
 };
 
 /**
--
GitLab
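
Usage sketch (reviewer note, not part of the patch): with this change applied, a host
staging buffer can be requested in page-locked memory by passing use_pinned = true to
paddle::memory::Alloc and the matching Free. This is a minimal sketch assuming a CUDA
build of Paddle; the function name StagingBufferExample, the device id, and the elided
cudaMemcpyAsync step are illustrative assumptions, not code from this PR.

#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/place.h"

// Allocate a host staging buffer in pinned (page-locked) memory through the
// new use_pinned flag, then release it. Pinned buffers are what allow
// cudaMemcpyAsync to overlap host<->device copies with kernel execution.
void StagingBufferExample(size_t num_bytes) {
  paddle::platform::CUDAPlace place(0);  // device 0

  // use_pinned = true routes the request to GetCUDAPinnedBuddyAllocator,
  // which backs the buddy allocator with CUDAPinnedAllocator
  // (cudaMallocHost) instead of the device allocator (cudaMalloc).
  void* staging =
      paddle::memory::Alloc(place, num_bytes, /*use_pinned=*/true);

  // ... fill `staging` on the host and issue cudaMemcpyAsync here ...

  // The flag must match on Free so the block is returned to the pinned
  // buddy allocator and ultimately released with cudaFreeHost.
  paddle::memory::Free(place, staging, /*use_pinned=*/true);
}

Because Alloc and Free select between GetGPUBuddyAllocator and
GetCUDAPinnedBuddyAllocator based on this flag, the two calls must agree for any
given pointer; PODDeleter now stores the flag for the same reason, so smart
pointers built on it release pinned blocks through the correct allocator.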