From 158d6c4d1956225e7aa8ddaa4e4af852060916da Mon Sep 17 00:00:00 2001
From: chengduoZH
Date: Mon, 26 Mar 2018 20:36:07 +0800
Subject: [PATCH] add unit test

---
 paddle/fluid/framework/tensor.h                 |  3 -
 .../fluid/memory/detail/system_allocator.cc     |  7 ++-
 paddle/fluid/memory/memcpy.cc                   | 39 +++++++++++++
 paddle/fluid/memory/memory.cc                   | 49 +++++++----------
 paddle/fluid/memory/memory.h                    | 10 ++--
 paddle/fluid/memory/memory_test.cc              | 55 +++++++++++++++++++
 paddle/fluid/platform/place.cc                  |  1 +
 7 files changed, 124 insertions(+), 40 deletions(-)

diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 6eb678e301..6f878541e6 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -92,9 +92,6 @@ class Tensor {
   /*! Return the numel of the memory block. */
   inline int64_t numel() const;
 
-  /*! Return the numel of the memory block. */
-  inline bool isPinned() const;
-
   /*! Resize the dimensions of the memory block. */
   inline Tensor& Resize(const DDim& dims);
 
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index d20c5c8682..2f3c10aeb2 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
@@ -127,10 +128,12 @@ void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
   // NOTE: here, we use CpuMaxAllocSize()/2 as the maximum memory size
   // of host pinned allocation. Allocating too much would reduce
   // the amount of memory available to the underlying system for paging.
-  size_t usable = CpuMaxAllocSize() / 2 - cuda_pinnd_alloc_size_;
+  size_t usable =
+      paddle::platform::CpuMaxAllocSize() / 2 - cuda_pinnd_alloc_size_;
 
   if (size > usable) return nullptr;
 
+  void* p;
   // PINNED memory is visible to all CUDA contexts.
   cudaError_t result = cudaMallocHost(&p, size);
 
@@ -161,7 +164,7 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
   }
 }
 
-bool CUDAPinnedAllocator::UseGpu() const { return true; }
+bool CUDAPinnedAllocator::UseGpu() const { return false; }
 
 #endif
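For reference, the pinned-allocation cap enforced above can be read in isolation as the following minimal sketch. This is not Paddle code: AllocPinned, kMaxPinned, and pinned_in_use are hypothetical stand-ins for the allocator's Alloc method, paddle::platform::CpuMaxAllocSize() / 2, and the running total cuda_pinnd_alloc_size_, and the 1 GiB cap is an arbitrary illustrative value.

    #include <cstddef>
    #include <cuda_runtime.h>

    static size_t pinned_in_use = 0;              // stand-in for cuda_pinnd_alloc_size_
    static const size_t kMaxPinned = 1ull << 30;  // stand-in for CpuMaxAllocSize() / 2

    void* AllocPinned(size_t size) {
      // Pinned pages cannot be swapped out, so refuse any request that would
      // push total pinned usage past the cap and starve the host of pageable
      // memory.
      if (size > kMaxPinned - pinned_in_use) return nullptr;
      void* p = nullptr;
      // cudaMallocHost returns page-locked host memory visible to all CUDA
      // contexts.
      if (cudaMallocHost(&p, size) != cudaSuccess) return nullptr;
      pinned_in_use += size;
      return p;
    }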
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index b991360d04..eddcaab8be 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -56,6 +56,45 @@ void Copy(
   }
 }
 
+template <>
+void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
+    platform::CPUPlace dst_place, void* dst,
+    platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
+    platform::CUDAPinnedPlace dst_place, void* dst,
+    platform::CPUPlace src_place, const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
+    platform::CUDAPinnedPlace dst_place, void* dst,
+    platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
+    platform::CUDAPinnedPlace dst_place, void* dst,
+    platform::CUDAPlace src_place, const void* src, size_t num,
+    cudaStream_t stream) {
+  platform::SetDeviceId(src_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
+}
+
+template <>
+void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
+    platform::CUDAPlace dst_place, void* dst,
+    platform::CUDAPinnedPlace src_place, const void* src, size_t num,
+    cudaStream_t stream) {
+  platform::SetDeviceId(dst_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
+}
+
 #endif
 
 }  // namespace memory
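A minimal sketch of how these new Copy specializations might be exercised, staging a buffer through page-locked memory on its way to and from the device. This is hypothetical usage, not code from the patch: PinnedRoundTrip is an invented name, and it assumes a CUDA build while using the default stream (0) for brevity, with no synchronization shown.

    #include "paddle/fluid/memory/memcpy.h"
    #include "paddle/fluid/memory/memory.h"
    #include "paddle/fluid/platform/place.h"

    void PinnedRoundTrip(size_t n) {
      paddle::platform::CUDAPinnedPlace pinned;
      paddle::platform::CUDAPlace gpu(0);

      void* host = paddle::memory::Alloc(pinned, n);
      void* dev = paddle::memory::Alloc(gpu, n);

      // CUDAPinnedPlace -> CUDAPlace resolves to GpuMemcpyAsync(HostToDevice);
      // pinned pages let the driver DMA without an extra staging copy.
      paddle::memory::Copy(gpu, dev, pinned, host, n, /*stream=*/0);
      // CUDAPlace -> CUDAPinnedPlace resolves to GpuMemcpyAsync(DeviceToHost).
      paddle::memory::Copy(pinned, host, gpu, dev, n, /*stream=*/0);

      paddle::memory::Free(gpu, dev);
      paddle::memory::Free(pinned, host);
    }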
diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc
index 6da9f00656..94b43af147 100644
--- a/paddle/fluid/memory/memory.cc
+++ b/paddle/fluid/memory/memory.cc
@@ -82,16 +82,6 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   return as[gpu_id];
 }
 
-BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
-  static BuddyAllocator* as = NULL;
-  if (as == NULL) {
-    as = new BuddyAllocator(new detail::CUDAPinnedAllocator,
-                            platform::CpuMinChunkSize(),
-                            platform::CpuMaxChunkSize());
-  }
-  return as;
-}
-
 template <>
 size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
   return GetGPUBuddyAllocator(place.device)->Used();
@@ -100,8 +90,7 @@ size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
 template <>
 void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
   auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
-  void* ptr = buddy_allocator->Alloc(size);
-
+  auto* ptr = buddy_allocator->Alloc(size);
   if (ptr == nullptr) {
     int cur_dev = platform::GetCurrentDeviceId();
     platform::SetDeviceId(place.device);
@@ -123,37 +112,39 @@ void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
   GetGPUBuddyAllocator(place.device)->Free(p);
 }
 
+BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
+  static BuddyAllocator* ba = NULL;
+  if (ba == NULL) {
+    ba = new BuddyAllocator(new detail::CUDAPinnedAllocator,
+                            platform::CpuMinChunkSize(),
+                            platform::CpuMaxChunkSize());
+  }
+  return ba;
+}
+
+template <>
 size_t Used(platform::CUDAPinnedPlace place) {
-  return GetGPUBuddyAllocator(place.device)->Used();
+  return GetCUDAPinnedBuddyAllocator()->Used();
 }
 
 template <>
 void* Alloc(platform::CUDAPinnedPlace place, size_t size) {
-  auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
+  auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
   void* ptr = buddy_allocator->Alloc(size);
-  if (ptr == nullptr) {
-    int cur_dev = platform::GetCurrentDeviceId();
-    platform::SetDeviceId(place.device);
-    size_t avail, total;
-    platform::GpuMemoryUsage(avail, total);
-    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
-                 << place.device << ", available " << avail << " bytes";
-    LOG(WARNING) << "total " << total;
-    LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
-    LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
-    LOG(WARNING) << "GPU memory used: " << Used(place);
-    platform::SetDeviceId(cur_dev);
-  }
+  if (ptr == nullptr) {
+    LOG(WARNING) << "Cannot allocate " << size
+                 << " bytes in CUDAPinnedPlace";
+  }
   return ptr;
 }
 
 template <>
 void Free(platform::CUDAPinnedPlace place, void* p) {
-  GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
+  GetCUDAPinnedBuddyAllocator()->Free(p);
 }
-
 #endif
 
 size_t Usage::operator()(const platform::CPUPlace& cpu) const {
diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h
index fba7372e71..3e6bfddd69 100644
--- a/paddle/fluid/memory/memory.h
+++ b/paddle/fluid/memory/memory.h
@@ -33,7 +33,7 @@ namespace memory {
  * address is valid or not.
  */
 template <typename Place>
-void* Alloc(Place place, size_t size, bool is_pinned = false);
+void* Alloc(Place place, size_t size);
 
 /**
  * \brief Free memory block in one place.
@@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size, bool is_pinned = false);
  *
  */
 template <typename Place>
-void Free(Place place, void* ptr, bool is_pinned = false);
+void Free(Place place, void* ptr);
 
 /**
  * \brief Total size of used memory in one place.
@@ -75,13 +75,11 @@ class PODDeleter {
   static_assert(std::is_pod<T>::value, "T must be POD");
 
  public:
-  explicit PODDeleter(Place place, bool is_pinned = false)
-      : place_(place), is_pinned_(is_pinned) {}
-  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr), is_pinned_); }
+  explicit PODDeleter(Place place) : place_(place) {}
+  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
 
  private:
   Place place_;
-  bool is_pinned_;
 };
 
 /**
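The unit tests below verify the buddy allocator's usage accounting, rounding each request with a small align helper. As a quick worked example of that rounding (a standalone sketch; AlignUp is an invented name, and the 32-byte metadata size and 4096-byte minimum chunk size are illustrative assumptions, not the actual values of sizeof(Metadata) or CpuMinChunkSize()):

    #include <cstddef>

    // Mirror of the align() helper added in memory_test.cc below: add the
    // allocator's bookkeeping metadata, then round up to the next multiple
    // of the minimum chunk size.
    size_t AlignUp(size_t size, size_t metadata, size_t alignment) {
      size += metadata;
      size_t remaining = size % alignment;
      return remaining == 0 ? size : size + (alignment - remaining);
    }

    // AlignUp(4096, 32, 4096) == 8192: the 32 metadata bytes spill the
    // request into a second 4096-byte chunk.
    // AlignUp(4064, 32, 4096) == 4096: request plus metadata fits exactly.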
diff --git a/paddle/fluid/memory/memory_test.cc b/paddle/fluid/memory/memory_test.cc
index eb27a52b25..5254cd28cc 100644
--- a/paddle/fluid/memory/memory_test.cc
+++ b/paddle/fluid/memory/memory_test.cc
@@ -141,4 +141,59 @@ TEST(BuddyAllocator, GPUMultAlloc) {
   }
 }
 
+size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) {
+  size += sizeof(paddle::memory::detail::Metadata);
+  size_t alignment = paddle::platform::CpuMinChunkSize();
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+TEST(BuddyAllocator, CUDAPinnedAllocator) {
+  void *p = nullptr;
+
+  EXPECT_EQ(p, nullptr);
+
+  paddle::platform::CUDAPinnedPlace cpu;
+  p = paddle::memory::Alloc(cpu, 4096);
+
+  EXPECT_NE(p, nullptr);
+
+  paddle::platform::Place place = cpu;
+  EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
+
+  paddle::memory::Free(cpu, p);
+}
+
+TEST(BuddyAllocator, CUDAPinnedMultAllocator) {
+  paddle::platform::CUDAPinnedPlace cpu;
+
+  std::unordered_map<void *, size_t> ps;
+
+  size_t total_size = paddle::memory::Used(cpu);
+  EXPECT_EQ(total_size, 0UL);
+
+  for (auto size :
+       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+    ps[paddle::memory::Alloc(cpu, size)] = size;
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(cpu) == total_size) continue;
+
+    size_t aligned_size = align(size, cpu);
+    total_size += aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
+  }
+
+  for (auto p : ps) {
+    EXPECT_EQ(is_aligned(p.first), true);
+    paddle::memory::Free(cpu, p.first);
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(cpu) == total_size) continue;
+
+    size_t aligned_size = align(p.second, cpu);
+    total_size -= aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
+  }
+}
 #endif
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index 11ca87c211..655ce8485d 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -26,6 +26,7 @@ class PlacePrinter : public boost::static_visitor<> {
   void operator()(const CUDAPlace &p) {
     os_ << "CUDAPlace(" << p.device << ")";
   }
+  void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
 
  private:
   std::ostream &os_;
-- 
GitLab