FIX: Pinned memory

ce938ae5 · liaogang · db128c45 · ce938ae5 · ce938ae5 · ce938ae5
4 changed files
--- a/paddle/memory/README.md
+++ b/paddle/memory/README.md
@@ -97,6 +97,7 @@ class BuddyAllocator {
  struct Block {
    size_t size;
    Block* left, right;
+    size_t index; // allocator id
  };
  ...
 };

--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
-if(${WITH_GPU})
-  nv_test(cpu_allocator_test SRCS cpu_allocator_test.cc) # nv_test links CUDA, but
-else(${WITH_GPU})
-  cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc) # cc_test doesn't.
-endif(${WITH_GPU})
+cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)
--- a/paddle/memory/detail/cpu_allocator.h
+++ b/paddle/memory/detail/cpu_allocator.h
@@ -14,20 +14,19 @@ limitations under the License. */

 #pragma once

-#include <malloc.h>  // for malloc and free
 #include <stddef.h>  // for size_t
+#include <cstdlib>   // for malloc and free

-#ifdef PADDLE_WITH_GPU
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#endif  // PADDLE_WITH_GPU
+#ifndef _WIN32
+#include <sys/mman.h>  // for mlock and munlock
+#endif

 namespace paddle {
 namespace memory {
 namespace detail {

-// CPUAllocator<staging=true> calls cudaMallocHost, which returns
-// pinned and mlocked memory as staging areas for data exchange
+// CPUAllocator<staging=true> calls malloc and then mlock, yielding
+// pinned and locked memory as staging areas for data exchange
 // between host and device.  Allocates too much would reduce the
 // amount of memory available to the system for paging.  So, by
 // default, we should use CPUAllocator<staging=false>.
@@ -35,33 +34,37 @@ template <bool staging>
 class CPUAllocator {
 public:
  void* Alloc(size_t size);
-  void Free(void* p);
+  void Free(void* p, size_t size);
 };

 template <>
 class CPUAllocator<false> {
 public:
-  void* Alloc(size_t size) { return malloc(size); }
-  void Free(void* p) { free(p); }
+  void* Alloc(size_t size) { return std::malloc(size); }
+  void Free(void* p, size_t size) { std::free(p); }
 };

-// If CMake macro PADDLE_WITH_GPU is OFF, C++ compiler won't generate the
-// following specialization that depends on the CUDA library.
-#ifdef PADDLE_WITH_GPU
 template <>
 class CPUAllocator<true> {
 public:
  void* Alloc(size_t size) {
-    void* p;
-    if (cudaMallocHost(&p, size) != cudaSuccess) {
-      return NULL;
+    void* p = std::malloc(size);
+    if (p == nullptr) {
+      return p;
    }
+#ifndef _WIN32
+    mlock(p, size);
+#endif
    return p;
  }

-  void Free(void* p) { cudaFreeHost(p); }
+  void Free(void* p, size_t size) {
+#ifndef _WIN32
+    munlock(p, size);
+#endif
+    std::free(p);
+  }
 };
-#endif  // PADDLE_WITH_GPU

 }  // namespace detail
 }  // namespace memory

--- a/paddle/memory/detail/cpu_allocator_test.cc
+++ b/paddle/memory/detail/cpu_allocator_test.cc
@@ -19,20 +19,12 @@ TEST(CPUAllocator, NonStaging) {
  paddle::memory::detail::CPUAllocator<false> a;
  void* p = a.Alloc(4096);
  EXPECT_NE(p, nullptr);
-  a.Free(p);
+  a.Free(p, 4096);
 }

-#ifdef PADDLE_WITH_GPU
 TEST(CPUAllocator, Staging) {
  paddle::memory::detail::CPUAllocator<true> a;
-
-  int devices;
-  if (cudaGetDeviceCount(&devices) == cudaSuccess && devices > 0) {
-    void* p = a.Alloc(4096);
-    EXPECT_NE(p, nullptr);
-    a.Free(p);
-  } else {
-    EXPECT_EQ(a.Alloc(4096), nullptr);
-  }
+  void* p = a.Alloc(4096);
+  EXPECT_NE(p, nullptr);
+  a.Free(p, 4096);
 }
-#endif  // PADDLE_WITH_GPU