add pinned memory

236b7dd2 · chengduoZH · 484cff6e · 236b7dd2 · 236b7dd2 · 236b7dd2
4 changed files
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -119,6 +119,47 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) {

 bool GPUAllocator::UseGpu() const { return true; }

+// Allocates `size` bytes of pinned host memory with cudaMallocHost. On
+// success sets `index` to 1 and returns the block; returns nullptr when
+// `size` is 0, the budget is exhausted, or cudaMallocHost fails.
+void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
+  if (size == 0) return nullptr;  // size_t is unsigned: `<= 0` is `== 0`

+  // NOTE: GpuMaxAllocSize() caps pinned host memory; pinning too much
+  // leaves the underlying system less memory available for paging.
+  const size_t limit = paddle::platform::GpuMaxAllocSize();
+  // Test the counter first: `limit - fallback_alloc_size_` is unsigned
+  // and would wrap to a huge budget if the counter exceeded the limit.
+  if (fallback_alloc_size_ > limit) return nullptr;
+  if (size > limit - fallback_alloc_size_) return nullptr;

+  void* p = nullptr;
+  if (cudaMallocHost(&p, size) != cudaSuccess) return nullptr;
+  index = 1;
+  fallback_alloc_size_ += size;
+  return p;
+}
+
+// Releases a block obtained from CUDAPinnedAllocator::Alloc with
+// cudaFreeHost and subtracts `size` from the outstanding-bytes counter.
+// `index` must be 1, the value Alloc reports for the blocks it returns.
+void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
+  PADDLE_ASSERT(index == 1);
+  PADDLE_ASSERT(fallback_alloc_size_ >= size);
+  fallback_alloc_size_ -= size;

+  const cudaError_t err = cudaFreeHost(p);
+  // Purposefully allow cudaErrorCudartUnloading: it is returned when
+  // cudaFreeHost is called after the driver has already shut down,
+  // which only happens while the process is terminating, in which
+  // case we don't care whether the free succeeds.
+  if (err != cudaErrorCudartUnloading) {
+    PADDLE_ENFORCE(err, "cudaFreeHost failed in CUDAPinnedAllocator::Free.");
+  }
+}
+
+// NOTE(review): returns true although Alloc hands out host memory; confirm.
+bool CUDAPinnedAllocator::UseGpu() const { return true; }
 #endif

 }  // namespace detail

--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
@@ -51,6 +51,18 @@ class GPUAllocator : public SystemAllocator {
  size_t gpu_alloc_size_ = 0;
  size_t fallback_alloc_size_ = 0;
 };
+
+// SystemAllocator that hands out pinned (page-locked) host memory.
+class CUDAPinnedAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+
+ private:
+  // Pinned bytes outstanding. TODO(zcd): how to define the upper limit
+  // of CUDAPinnedMemory?
+  size_t fallback_alloc_size_ = 0;
+};
 #endif

 }  // namespace detail

--- a/paddle/fluid/memory/memory.cc
+++ b/paddle/fluid/memory/memory.cc
@@ -38,7 +38,8 @@ BuddyAllocator* GetCPUBuddyAllocator() {
 }

 template <>
-void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
+void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
+                                bool use_pinned) {
  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
  void* p = GetCPUBuddyAllocator()->Alloc(size);
  VLOG(10) << "  pointer=" << p;
@@ -46,7 +47,8 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
 }

 template <>
-void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
+void Free<platform::CPUPlace>(platform::CPUPlace place, void* p,
+                              bool use_pinned) {  // use_pinned is unused
  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
  GetCPUBuddyAllocator()->Free(p);
 }
@@ -82,15 +84,47 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
  return as[gpu_id];
 }

+// Returns the pinned-memory BuddyAllocator for `gpu_id`, lazily creating
+// the per-device table and the allocator; also makes `gpu_id` current.
+// NOTE(review): init is unsynchronised and gpu_id is not range-checked.
+BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
+  static BuddyAllocator** as = nullptr;
+  if (as == nullptr) {
+    // The trailing () value-initialises every slot to nullptr.
+    as = new BuddyAllocator*[platform::GetCUDADeviceCount()]();
+  }
+  platform::SetDeviceId(gpu_id);
+  if (!as[gpu_id]) {
+    as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator,
+                                    platform::GpuMinChunkSize(),
+                                    platform::GpuMaxChunkSize());
+    VLOG(10) << "\n\nNOTE: each GPU device use "
+             << FLAGS_fraction_of_gpu_memory_to_use * 100
+             << "% of GPU memory.\n"
+             << "You can set GFlags environment variable '"
+             << "FLAGS_fraction_of_gpu_memory_to_use"
+             << "' to change the fraction of GPU usage.\n\n";
+  }
+  return as[gpu_id];
+}
+
 template <>
 size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
  return GetGPUBuddyAllocator(place.device)->Used();
 }

 template <>
-void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
-  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
-  auto* ptr = buddy_allocator->Alloc(size);
+void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
+                                 bool use_pinned) {
+  void* ptr;
+  if (use_pinned) {
+    auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
+    ptr = buddy_allocator->Alloc(size);
+  } else {
+    auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+    ptr = buddy_allocator->Alloc(size);
+  }
+
  if (ptr == nullptr) {
    int cur_dev = platform::GetCurrentDeviceId();
    platform::SetDeviceId(place.device);
@@ -108,8 +142,13 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
 }

 template <>
-void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
-  GetGPUBuddyAllocator(place.device)->Free(p);
+void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p,
+                               bool use_pinned) {
+  // Route the free to the pool chosen by use_pinned (pinned or device).
+  auto* pool = use_pinned ? GetCUDAPinnedBuddyAllocator(place.device)
+                          : GetGPUBuddyAllocator(place.device);
+  pool->Free(p);
+
 }

 #endif

--- a/paddle/fluid/memory/memory.h
+++ b/paddle/fluid/memory/memory.h
@@ -33,7 +33,7 @@ namespace memory {
 *          address is valid or not.
 */
 template <typename Place>
-void* Alloc(Place place, size_t size);
+void* Alloc(Place place, size_t size, bool use_pinned = false);

 /**
 * \brief   Free memory block in one place.
@@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size);
 *
 */
 template <typename Place>
-void Free(Place place, void* ptr);
+void Free(Place place, void* ptr, bool use_pinned = false);

 /**
 * \brief   Total size of used memory in one place.
@@ -74,11 +74,15 @@ class PODDeleter {
  static_assert(std::is_pod<T>::value, "T must be POD");

 public:
-  explicit PODDeleter(Place place) : place_(place) {}
-  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
+  explicit PODDeleter(Place place, bool use_pinned = false)
+      : place_(place), use_pinned_(use_pinned) {}  // saved for operator()
+  void operator()(T* ptr) {  // hands ptr to Free with the stored state
+    Free(place_, static_cast<void*>(ptr), use_pinned_);
+  }

 private:
  Place place_;
+  bool use_pinned_;
 };

 /**