add unit test

158d6c4d · chengduoZH · 18eb7730 · 158d6c4d · 158d6c4d · 158d6c4d
7 changed files
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -92,9 +92,6 @@ class Tensor {
  /*! Return the numel of the memory block. */
  inline int64_t numel() const;

-  /*! Return the numel of the memory block. */
-  inline bool isPinned() const;
-
  /*! Resize the dimensions of the memory block. */
  inline Tensor& Resize(const DDim& dims);


--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -14,6 +14,7 @@ limitations under the License. */

 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"

@@ -127,10 +128,12 @@ void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
  // NOTE: here, we use CpuMaxAllocSize()/2 as the maximum memory size
  // of host pinned allocation. Allocates too much would reduce
  // the amount of memory available to the underlying system for paging.
-  size_t usable = CpuMaxAllocSize() / 2 - cuda_pinnd_alloc_size_;
+  size_t usable =
+      paddle::platform::CpuMaxAllocSize() / 2 - cuda_pinnd_alloc_size_;

  if (size > usable) return nullptr;

+  void* p;
  // PINNED memory is visible to all CUDA contexts.
  cudaError_t result = cudaMallocHost(&p, size);

@@ -161,7 +164,7 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
  }
 }

-bool CUDAPinnedAllocator::UseGpu() const { return true; }
+bool CUDAPinnedAllocator::UseGpu() const { return false; }

 #endif


--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -56,6 +56,63 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
  }
 }

+// Copies `num` bytes from pinned host memory into ordinary host memory.
+// Both sides are host pointers, so this is a synchronous std::memcpy.
+template <>
+void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
+    platform::CPUPlace dst_place, void* dst,
+    platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
+  // std::memcpy is undefined for null pointers even when num == 0.
+  if (num == 0) return;
+  std::memcpy(dst, src, num);
+}
+
+// Copies `num` bytes from ordinary host memory into pinned host memory.
+// Both sides are host pointers, so this is a synchronous std::memcpy.
+template <>
+void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
+    platform::CUDAPinnedPlace dst_place, void* dst,
+    platform::CPUPlace src_place, const void* src, size_t num) {
+  // std::memcpy is undefined for null pointers even when num == 0.
+  if (num == 0) return;
+  std::memcpy(dst, src, num);
+}
+
+// Copies `num` bytes between two pinned host buffers.
+// Both sides are host pointers, so this is a synchronous std::memcpy.
+template <>
+void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
+    platform::CUDAPinnedPlace dst_place, void* dst,
+    platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
+  // std::memcpy is undefined for null pointers even when num == 0.
+  if (num == 0) return;
+  std::memcpy(dst, src, num);
+}
+
+// Copies `num` bytes from GPU memory on `src_place.device` into pinned host
+// memory by issuing a device-to-host GpuMemcpyAsync on `stream`. The current
+// device is switched to the source device and is not restored afterwards.
+template <>
+void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
+    platform::CUDAPinnedPlace dst_place, void* dst,
+    platform::CUDAPlace src_place, const void* src, size_t num,
+    cudaStream_t stream) {
+  platform::SetDeviceId(src_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
+}
+
+// Copies `num` bytes from pinned host memory into GPU memory on
+// `dst_place.device` by issuing a host-to-device GpuMemcpyAsync on `stream`.
+// The current device is switched to the destination device and not restored.
+template <>
+void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
+    platform::CUDAPlace dst_place, void* dst,
+    platform::CUDAPinnedPlace src_place, const void* src, size_t num,
+    cudaStream_t stream) {
+  platform::SetDeviceId(dst_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
+}
+
 #endif

 }  // namespace memory

--- a/paddle/fluid/memory/memory.cc
+++ b/paddle/fluid/memory/memory.cc
@@ -82,16 +82,6 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
  return as[gpu_id];
 }

-BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
-  static BuddyAllocator* as = NULL;
-  if (as == NULL) {
-    as = new BuddyAllocator(new detail::CUDAPinnedAllocator,
-                            platform::CpuMinChunkSize(),
-                            platform::CpuMaxChunkSize());
-  }
-  return as;
-}
-
 template <>
 size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
  return GetGPUBuddyAllocator(place.device)->Used();
@@ -100,8 +90,7 @@ size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
 template <>
 void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
-  void* ptr = buddy_allocator->Alloc(size);
-
+  auto* ptr = buddy_allocator->Alloc(size);
  if (ptr == nullptr) {
    int cur_dev = platform::GetCurrentDeviceId();
    platform::SetDeviceId(place.device);
@@ -123,37 +112,39 @@ void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
  GetGPUBuddyAllocator(place.device)->Free(p);
 }

+BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
+  // Single shared allocator for pinned host memory. A function-local static
+  // initializer runs exactly once even under concurrent first calls (C++11),
+  // so no explicit null check or lock is needed. It is never deleted.
+  static BuddyAllocator* const ba = new BuddyAllocator(
+      new detail::CUDAPinnedAllocator, platform::CpuMinChunkSize(),
+      platform::CpuMaxChunkSize());
+  return ba;
+}
+
+template <>
 size_t Used<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place) {
-  return GetGPUBuddyAllocator(place.device)->Used();
+  return GetCUDAPinnedBuddyAllocator()->Used();
 }

 template <>
 void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
                                       size_t size) {
-  auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
+  auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
  void* ptr = buddy_allocator->Alloc(size);

-  if (ptr == nullptr) {
-    int cur_dev = platform::GetCurrentDeviceId();
-    platform::SetDeviceId(place.device);
-    size_t avail, total;
-    platform::GpuMemoryUsage(avail, total);
-    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
-                 << place.device << ", available " << avail << " bytes";
-    LOG(WARNING) << "total " << total;
-    LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
-    LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
-    LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
-    platform::SetDeviceId(cur_dev);
-  }
+  if (ptr == nullptr) {
+    // CUDAPinnedPlace carries no device id, so only the request size is
+    // reported; the caller still receives nullptr and must handle the failure.
+    LOG(WARNING) << "Cannot allocate " << size << " bytes in CUDAPinnedPlace";
+  }
  return ptr;
 }

 template <>
 void Free<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place, void* p) {
-  GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
+  GetCUDAPinnedBuddyAllocator()->Free(p);
 }
-
 #endif

 size_t Usage::operator()(const platform::CPUPlace& cpu) const {

--- a/paddle/fluid/memory/memory.h
+++ b/paddle/fluid/memory/memory.h
@@ -33,7 +33,7 @@ namespace memory {
 *          address is valid or not.
 */
 template <typename Place>
-void* Alloc(Place place, size_t size, bool is_pinned = false);
+void* Alloc(Place place, size_t size);

 /**
 * \brief   Free memory block in one place.
@@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size, bool is_pinned = false);
 *
 */
 template <typename Place>
-void Free(Place place, void* ptr, bool is_pinned = false);
+void Free(Place place, void* ptr);

 /**
 * \brief   Total size of used memory in one place.
@@ -75,13 +75,11 @@ class PODDeleter {
  static_assert(std::is_pod<T>::value, "T must be POD");

 public:
-  explicit PODDeleter(Place place, bool is_pinned = false)
-      : place_(place), is_pinned_(is_pinned) {}
-  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr), is_pinned_); }
+  explicit PODDeleter(Place place) : place_(place) {}
+  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }

 private:
  Place place_;
-  bool is_pinned_;
 };

 /**

--- a/paddle/fluid/memory/memory_test.cc
+++ b/paddle/fluid/memory/memory_test.cc
@@ -141,4 +141,59 @@ TEST(BuddyAllocator, GPUMultAlloc) {
  }
 }

+size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) {
+  // Request size plus one Metadata header, rounded up to CpuMinChunkSize().
+  const size_t chunk = paddle::platform::CpuMinChunkSize();
+  const size_t padded = size + sizeof(paddle::memory::detail::Metadata);
+  return padded % chunk == 0 ? padded : padded + (chunk - padded % chunk);
+}
+
+TEST(BuddyAllocator, CUDAPinnedAllocator) {
+  // Pinned host memory has its own place type; it is not a CPUPlace.
+  paddle::platform::CUDAPinnedPlace pinned_place;
+  void *p = paddle::memory::Alloc(pinned_place, 4096);
+
+  // Fatal check: Used() and Free() below are meaningless for a null block.
+  ASSERT_NE(p, nullptr);
+
+  // Used() and memory_usage() must report the same amount for this place.
+  paddle::platform::Place place = pinned_place;
+  EXPECT_EQ(paddle::memory::Used(pinned_place),
+            paddle::memory::memory_usage(place));
+
+  paddle::memory::Free(pinned_place, p);
+}
+
+TEST(BuddyAllocator, CUDAPinnedMultAllocator) {
+  paddle::platform::CUDAPinnedPlace pinned_place;
+
+  // Remembers every returned pointer together with the size requested for it.
+  std::unordered_map<void *, size_t> allocations;
+
+  size_t expected_used = paddle::memory::Used(pinned_place);
+  EXPECT_EQ(expected_used, 0UL);
+
+  for (auto request :
+       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+    allocations[paddle::memory::Alloc(pinned_place, request)] = request;
+
+    // Used() unchanged: the chunk was too large for the buddy allocator.
+    if (paddle::memory::Used(pinned_place) == expected_used) continue;
+
+    expected_used += align(request, pinned_place);
+    EXPECT_EQ(expected_used, paddle::memory::Used(pinned_place));
+  }
+
+  for (auto entry : allocations) {
+    void *ptr = entry.first;
+    EXPECT_EQ(is_aligned(ptr), true);
+    paddle::memory::Free(pinned_place, ptr);
+
+    // Used() unchanged: the chunk was too large for the buddy allocator.
+    if (paddle::memory::Used(pinned_place) == expected_used) continue;
+
+    expected_used -= align(entry.second, pinned_place);
+    EXPECT_EQ(expected_used, paddle::memory::Used(pinned_place));
+  }
+}
 #endif
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -26,6 +26,7 @@ class PlacePrinter : public boost::static_visitor<> {
  void operator()(const CUDAPlace &p) {
    os_ << "CUDAPlace(" << p.device << ")";
  }
+  void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }

 private:
  std::ostream &os_;