From 2002e71da825ef102e27f6318523369f893338dc Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Fri, 19 Oct 2018 09:53:57 +0000
Subject: [PATCH] fix pinned allocator

---
 paddle/fluid/framework/tensor_util.cc          |   3 +-
 paddle/fluid/memory/allocation/CMakeLists.txt  |  10 +-
 .../memory/allocation/allocator_facade.cc      | 113 ++++++++++++------
 .../allocation/allocator_facade_test.cc        |  45 ++++++-
 .../allocation/auto_increment_allocator.h      |   1 +
 .../memory/allocation/locked_allocator.cc      |   1 +
 .../memory/allocation/locked_allocator.h       |   1 +
 .../memory/allocation/pinned_allocator.cc      |   6 +-
 .../memory/allocation/pinned_allocator.h       |   2 +-
 paddle/fluid/memory/detail/system_allocator.cc |   7 +-
 paddle/fluid/memory/malloc.cc                  |  29 ++++-
 paddle/fluid/memory/memcpy.cc                  |  10 ++
 paddle/fluid/platform/cpu_info.cc              |   9 +-
 paddle/fluid/platform/cpu_info.h               |   2 +
 paddle/fluid/platform/device_context.cc        |   2 +-
 paddle/fluid/platform/init.cc                  |   2 +
 paddle/fluid/pybind/tensor_py.h                |   3 +-
 python/paddle/fluid/__init__.py                |   8 +-
 18 files changed, 184 insertions(+), 70 deletions(-)

diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 89917cdfa..9fe92831e 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -112,8 +112,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   dst->set_layout(src.layout());
   auto src_place = src.place();
   auto src_ptr = src.data<void>();
-  auto dst_ptr =
-      dst->mutable_data(dst_place, src.type(), memory::Allocator::kCrossDevice);
+  auto dst_ptr = dst->mutable_data(dst_place, src.type());
   auto size = src.numel() * SizeOfType(src.type());
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 5620b30f5..b2be83783 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -2,7 +2,10 @@ cc_library(allocator SRCS allocator.cc DEPS place)
 cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
 cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
-nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
+
+if (WITH_GPU)
+  nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
+endif()

 cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)

@@ -29,7 +32,7 @@ cc_library(naive_managed_allocator SRCS naive_managed_allocator.cc DEPS allocator)
 cc_test(naive_managed_allocator_test SRCS naive_managed_allocator_test.cc DEPS naive_managed_allocator)
 nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
 if (WITH_GPU)
-  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator)
+  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard)
 else ()
   set(AllocatorFacadeDeps)
 endif()
@@ -48,8 +51,7 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS
     auto_increment_allocator
     zero_size_allocator
     conditional_allocator
-    retry_allocator
-    cuda_device_guard)
+    retry_allocator)

 nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 02ea5d7e7..f82668bff 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -25,17 +25,18 @@
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
-#include "paddle/fluid/memory/allocation/pinned_allocator.h"
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 #include "paddle/fluid/memory/allocation/zero_size_allocator.h"
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/place.h"

 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/memory/allocation/cuda_allocator.h"
+#include "paddle/fluid/memory/allocation/pinned_allocator.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/gpu_info.h"
 #endif

-DEFINE_int32(
+DEFINE_int64(
    gpu_allocator_retry_time, 0,
    "The retry time (milliseconds) when allocator fails "
    "to allocate memory. No retry if this value is not greater than 0");
@@ -49,51 +50,34 @@ class CPUManagedAllocator : public ManagedAllocator {
  public:
   CPUManagedAllocator()
       : normal_allocator_(NaiveManagedAllocator::Create(
-            std::unique_ptr<Allocator>(new CPUAllocator()))),
-        communication_allocator_(NaiveManagedAllocator::Create(
-            std::unique_ptr<Allocator>(new CPUPinnedAllocator()))) {}
+            std::unique_ptr<Allocator>(new CPUAllocator()))) {}

   std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
-    if (attr == kCrossDevice) {
-      return communication_allocator_->Allocate(size, attr);
-    } else {
-      return normal_allocator_->Allocate(size, attr);
-    }
+    return normal_allocator_->Allocate(size, attr);
   }

   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
-    if (attr == kCrossDevice) {
-      return communication_allocator_->AllocateShared(size, attr);
-    } else {
-      return normal_allocator_->AllocateShared(size, attr);
-    }
+    return normal_allocator_->AllocateShared(size, attr);
   }

   bool IsAllocThreadSafe() const override { return true; }

  private:
   std::shared_ptr<ManagedAllocator> normal_allocator_;
-  std::shared_ptr<ManagedAllocator> communication_allocator_;
 };

-#ifdef PADDLE_WITH_CUDA
 // TODO(yy): Dirty code here. This class should be configurable in runtime.
-class CUDAManagedAllocator : public ManagedAllocator {
+class ChunkedManagedAllocator : public ManagedAllocator {
  public:
-  explicit CUDAManagedAllocator(int dev_id) {
-    platform::CUDADeviceGuard guard(dev_id);
-    max_chunk_size_ = platform::GpuMaxChunkSize();
-
-    raw_allocator_ = NaiveManagedAllocator::Create(std::unique_ptr<Allocator>(
-        new CUDAAllocator(platform::CUDAPlace(dev_id))));
+  explicit ChunkedManagedAllocator(std::unique_ptr<Allocator> system_allocator,
+                                   size_t max_chunk_size, size_t capacity = 1,
+                                   int64_t retry_time = -1)
+      : max_chunk_size_(max_chunk_size), retry_time_(retry_time) {
+    raw_allocator_ = NaiveManagedAllocator::Create(std::move(system_allocator));

     if (max_chunk_size_ == 0) {
       default_allocator_ = raw_allocator_;
     } else {
-      size_t available, total;
-      platform::GpuMemoryUsage(&available, &total);
-      size_t capacity = available / max_chunk_size_;
-
       if (capacity == 1) {
         VLOG(10) << "Create BestFitAllocator with chunk_size "
                  << max_chunk_size_;
@@ -119,7 +103,7 @@ class CUDAManagedAllocator : public ManagedAllocator {
       default_allocator_.reset(cond_allocator);
   }

-  ~CUDAManagedAllocator() {
+  ~ChunkedManagedAllocator() {
     // Specify destruct order.
     default_allocator_.reset();
     chunks_.clear();
@@ -140,27 +124,71 @@ class CUDAManagedAllocator : public ManagedAllocator {
     std::unique_ptr<Allocator> unmanaged_allocator(new LockedAllocator(
         std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));

-    if (FLAGS_gpu_allocator_retry_time <= 0) {
+    if (retry_time_ <= 0) {
       VLOG(10) << "Create NaiveManagedAllocator without retry";
       return std::make_shared<AlignedAllocator<64u>>(
           NaiveManagedAllocator::Create(std::move(unmanaged_allocator)));
     } else {
-      VLOG(10) << "Create RetryAllocator with retry_time "
-               << FLAGS_gpu_allocator_retry_time << "ms";
+      VLOG(10) << "Create RetryAllocator with retry_time " << retry_time_
+               << "ms";
       return std::make_shared<AlignedAllocator<64u>>(RetryAllocator::Create(
-          std::move(unmanaged_allocator),
-          static_cast<size_t>(FLAGS_gpu_allocator_retry_time)));
+          std::move(unmanaged_allocator), static_cast<size_t>(retry_time_)));
     }
   }

   bool IsAllocThreadSafe() const override { return true; }

- private:
+ protected:
   size_t max_chunk_size_;
+  int64_t retry_time_;
   std::vector<std::unique_ptr<Allocation>> chunks_;
   std::shared_ptr<ManagedAllocator> raw_allocator_;
   std::shared_ptr<ManagedAllocator> default_allocator_;
 };
+
+#ifdef PADDLE_WITH_CUDA
+
+class CUDAManagedAllocator : public ChunkedManagedAllocator {
+ public:
+  explicit CUDAManagedAllocator(int dev_id)
+      : ChunkedManagedAllocator(
+            std::unique_ptr<Allocator>(
+                new CUDAAllocator(platform::CUDAPlace(dev_id))),
+            GetMaxChunkSize(dev_id), GetCapcity(dev_id), GetRetryTime()) {}
+
+ private:
+  static size_t GetMaxChunkSize(int dev_id) {
+    platform::CUDADeviceGuard guard(dev_id);
+    return platform::GpuMaxChunkSize();
+  }
+
+  static size_t GetCapcity(int dev_id) {
+    platform::CUDADeviceGuard guard(dev_id);
+    size_t available, total;
+    platform::GpuMemoryUsage(&available, &total);
+    size_t max_chunk_size = platform::GpuMaxChunkSize();
+    return max_chunk_size == 0 ? 0 : available / max_chunk_size;
+  }
+
+  static int64_t GetRetryTime() { return FLAGS_gpu_allocator_retry_time; }
+};
+
+class CUDAPinnedManagedAllocator : public ChunkedManagedAllocator {
+ public:
+  CUDAPinnedManagedAllocator()
+      : ChunkedManagedAllocator(
+            std::unique_ptr<Allocator>(new CPUPinnedAllocator()),
+            platform::CUDAPinnedMaxChunkSize(), GetCapacity(), -1) {
+  }  // never retry
+
+ private:
+  static size_t GetCapacity() {
+    size_t total = platform::CpuTotalPhysicalMemory();
+    size_t max_chunk_size = platform::CUDAPinnedMaxChunkSize();
+    return max_chunk_size == 0 ? 0 : total / max_chunk_size;
+  }
+};
+
 #endif

 class AllocatorFacadePrivate {
@@ -173,6 +201,7 @@ class AllocatorFacadePrivate {
   AllocatorFacadePrivate() {
     InitCPUAllocator();
     InitCUDAAllocator();
+    InitCUDAPinnedAllocator();
     WrapZeroSizeAllocator();
   }
@@ -183,13 +212,21 @@
   void InitCUDAAllocator() {
 #ifdef PADDLE_WITH_CUDA
-    for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
+    int device_count = platform::GetCUDADeviceCount();
+    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
       allocators_[platform::CUDAPlace(dev_id)] =
           std::make_shared<CUDAManagedAllocator>(dev_id);
     }
 #endif
   }

+  void InitCUDAPinnedAllocator() {
+#ifdef PADDLE_WITH_CUDA
+    allocators_[platform::CUDAPinnedPlace()] =
+        std::make_shared<CUDAPinnedManagedAllocator>();
+#endif
+  }
+
   void WrapZeroSizeAllocator() {
     for (auto& pair : allocators_) {
       pair.second =
diff --git a/paddle/fluid/memory/allocation/allocator_facade_test.cc b/paddle/fluid/memory/allocation/allocator_facade_test.cc
index 5185bf944..802d79e15 100644
--- a/paddle/fluid/memory/allocation/allocator_facade_test.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade_test.cc
@@ -16,37 +16,70 @@
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>

+#ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_int32(gpu_allocator_retry_time);
+DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_int64(gpu_allocator_retry_time);
+#endif

 namespace paddle {
 namespace memory {
 namespace allocation {

 TEST(allocator, allocator) {
+#ifdef PADDLE_WITH_CUDA
   FLAGS_fraction_of_gpu_memory_to_use = 0.01;
   FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
+#endif

   auto &instance = AllocatorFacade::Instance();
+  platform::Place place;
+  size_t size = 1024;

   {
-    auto cpu_allocation = instance.Alloc(platform::CPUPlace(), 1024);
+    place = platform::CPUPlace();
+    size = 1024;
+    auto cpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(cpu_allocation, nullptr);
+    ASSERT_NE(cpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(cpu_allocation->place(), place);
+    ASSERT_EQ(cpu_allocation->size(), size);
   }

+#ifdef PADDLE_WITH_CUDA
   {
-    auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0), 1024);
+    place = platform::CUDAPlace(0);
+    size = 1024;
+    auto gpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
   }

   {
     // Allocate 2GB gpu memory
-    auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0),
-                                         2 * static_cast<size_t>(1 << 30));
+    place = platform::CUDAPlace(0);
+    size = 2 * static_cast<size_t>(1 << 30);
+    auto gpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
   }

-  {}
+  {
+    place = platform::CUDAPinnedPlace();
+    size = (1 << 20);
+    auto cuda_pinned_allocation =
+        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
+    ASSERT_NE(cuda_pinned_allocation, nullptr);
+    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
+    ASSERT_EQ(cuda_pinned_allocation->place(), place);
+    ASSERT_GE(cuda_pinned_allocation->size(), size);
+  }
+#endif
 }

 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/auto_increment_allocator.h b/paddle/fluid/memory/allocation/auto_increment_allocator.h
index f026c413d..36ddd2b32 100644
--- a/paddle/fluid/memory/allocation/auto_increment_allocator.h
+++ b/paddle/fluid/memory/allocation/auto_increment_allocator.h
@@ -17,6 +17,7 @@
 #include <atomic>  // NOLINT
 #include <functional>
 #include <memory>
+#include <mutex>   // NOLINT
 #include <thread>  // NOLINT
 #include <vector>
 #include "paddle/fluid/memory/allocation/allocator.h"
diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc
index 1e0febe10..dea87229f 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.cc
+++ b/paddle/fluid/memory/allocation/locked_allocator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/memory/allocation/locked_allocator.h"
+#include <mutex>  // NOLINT

 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h
index f092a5bad..d6b877ba4 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.h
+++ b/paddle/fluid/memory/allocation/locked_allocator.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include <memory>
+#include <mutex>   // NOLINT
 #include <thread>  // NOLINT
 #include "paddle/fluid/memory/allocation/allocator.h"
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc
index dd1f5a3dd..650dab1b2 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -22,9 +22,9 @@ namespace allocation {

 std::unique_ptr<Allocation> CPUPinnedAllocator::Allocate(size_t size,
                                                          Allocator::Attr attr) {
-  PADDLE_ENFORCE_EQ(
-      attr, kCrossDevice,
-      "CPUPinnedAllocator should be used for Cross-Device Communication");
+  // PADDLE_ENFORCE_EQ(
+  //     attr, kCrossDevice,
+  //     "CPUPinnedAllocator should be used for Cross-Device Communication");
   void* ptr;
   PADDLE_ENFORCE(cudaMallocHost(&ptr, size));
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h
index 2c9e09cd7..d001a91d8 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.h
+++ b/paddle/fluid/memory/allocation/pinned_allocator.h
@@ -23,7 +23,7 @@ namespace allocation {
 class CPUPinnedAllocation : public Allocation {
  public:
   CPUPinnedAllocation(void* ptr, size_t size)
-      : Allocation(ptr, size, platform::CPUPlace()) {}
+      : Allocation(ptr, size, platform::CUDAPinnedPlace()) {}
 };

 class CPUPinnedAllocator : public UnmanagedAllocator {
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 1b96798d2..2019d1a14 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -30,12 +30,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"

-// If use_pinned_memory is true, CPUAllocator calls mlock, which
-// returns pinned and locked memory as staging areas for data exchange
-// between host and device. Allocates too much would reduce the amount
-// of memory available to the system for paging. So, by default, we
-// should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+DECLARE_bool(use_pinned_memory);
 DECLARE_double(fraction_of_gpu_memory_to_use);

 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
index fd81a0a7c..75686df43 100644
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -98,7 +98,6 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace& place) {
 }

 #ifdef PADDLE_WITH_CUDA
-
 BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   static std::once_flag init_flag;
   static detail::BuddyAllocator** a_arr = nullptr;
@@ -128,15 +127,21 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   platform::SetDeviceId(gpu_id);
   return a_arr[gpu_id];
 }
+#endif

 template <>
 size_t Used<platform::CUDAPlace>(const platform::CUDAPlace& place) {
+#ifdef PADDLE_WITH_CUDA
   return GetGPUBuddyAllocator(place.device)->Used();
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void* Alloc<platform::CUDAPlace>(const platform::CUDAPlace& place,
                                  size_t size) {
+#ifdef PADDLE_WITH_CUDA
   auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
   auto* ptr = buddy_allocator->Alloc(size);
   if (ptr == nullptr) {
@@ -156,13 +161,21 @@ void* Alloc<platform::CUDAPlace>(const platform::CUDAPlace& place,
     cudaMemset(ptr, 0xEF, size);
   }
   return ptr;
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void Free<platform::CUDAPlace>(const platform::CUDAPlace& place, void* p) {
+#ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }

+#ifdef PADDLE_WITH_CUDA
 BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
   static std::once_flag init_flag;
   static BuddyAllocator* ba = nullptr;
@@ -176,15 +189,21 @@ BuddyAllocator* GetCUDAPinnedBuddyAllocator() {

   return ba;
 }
+#endif

 template <>
 size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place) {
+#ifdef PADDLE_WITH_CUDA
   return GetCUDAPinnedBuddyAllocator()->Used();
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void* Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
                                        size_t size) {
+#ifdef PADDLE_WITH_CUDA
   auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
   void* ptr = buddy_allocator->Alloc(size);

@@ -196,14 +215,20 @@ void* Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
     memset(ptr, 0xEF, size);
   }
   return ptr;
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
                                      void* p) {
+#ifdef PADDLE_WITH_CUDA
   GetCUDAPinnedBuddyAllocator()->Free(p);
-}
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
 #endif
+}

 struct AllocVisitor : public boost::static_visitor<void*> {
   inline explicit AllocVisitor(size_t size) : size_(size) {}
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index a177d4985..2a6f70a01 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -27,6 +27,8 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
 }

 #ifdef PADDLE_WITH_CUDA
+static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
+
 template <>
 void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
@@ -36,6 +38,10 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
   } else {
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
+    // FIXME(zjl): do we really need it?
+    if (num <= kMaxGpuAsyncCopyBytes) {
+      cudaStreamSynchronize(0);
+    }
   }
 }

@@ -48,6 +54,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
   } else {
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
+    // FIXME(zjl): do we really need it?
+    if (num <= kMaxGpuAsyncCopyBytes) {
+      cudaStreamSynchronize(0);
+    }
   }
 }
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index 2880c0926..f12070acf 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -56,10 +56,17 @@ DEFINE_double(
     "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
     "reserve the rest for page tables, etc");

+// If use_pinned_memory is true, CPUAllocator calls mlock, which
+// returns pinned and locked memory as staging areas for data exchange
+// between host and device. Allocates too much would reduce the amount
+// of memory available to the system for paging. So, by default, we
+// should set false to use_pinned_memory.
+DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+
 namespace paddle {
 namespace platform {

-inline size_t CpuTotalPhysicalMemory() {
+size_t CpuTotalPhysicalMemory() {
 #ifdef __APPLE__
   int mib[2];
   mib[0] = CTL_HW;
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 30c8fbcfc..e2221414e 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -19,6 +19,8 @@ limitations under the License. */
 namespace paddle {
 namespace platform {

+size_t CpuTotalPhysicalMemory();
+
 //! Get the maximum allocation size for a machine.
 size_t CpuMaxAllocSize();
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 6b1d5e297..e026ff703 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -13,11 +13,11 @@ limitations under the License. */
 #include <string>
 #include <unordered_set>
 #include <vector>
-#include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/memory/memory.h"

 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/framework/rw_lock.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
 #endif

 namespace paddle {
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 25a693ab9..3d5c4ac2d 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -19,7 +19,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index e55f734e4..b39323f84 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -63,8 +63,7 @@ struct CastToPyBufferImpl {
 #ifdef PADDLE_WITH_CUDA
         auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
         auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
-            tensor.dims(), platform::CPUPlace(),
-            memory::Allocator::kCrossDevice));
+            tensor.dims(), platform::CPUPlace()));

         paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
                                         sizeof(CUR_TYPE) * tensor.numel(),
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index ea1086cd4..f29b85b30 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -110,10 +110,10 @@ def __bootstrap__():
         os.environ['OMP_NUM_THREADS'] = str(num_threads)

     read_env_flags = [
-        'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope',
-        'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
-        'paddle_num_threads', "dist_threadpool_size", 'cpu_deterministic',
-        'eager_delete_tensor_gb', 'use_legacy_allocator'
+        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
+        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
+        'init_allocated_mem', 'paddle_num_threads', "dist_threadpool_size",
+        'cpu_deterministic', 'eager_delete_tensor_gb', 'use_legacy_allocator'
     ]
     if core.is_compiled_with_dist():
        read_env_flags.append('rpc_deadline')
--
GitLab
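
Reviewer note (not part of the patch): after this change, pinned host memory is requested through the new platform::CUDAPinnedPlace rather than the removed Allocator::kCrossDevice attribute on CPUPlace. A minimal usage sketch of the new call path, mirroring allocator_facade_test.cc above; it assumes a PADDLE_WITH_CUDA build, and the allocator_facade.h path, the Alloc() signature, and the class names are taken from the diff context — the function name PinnedStagingDemo is hypothetical:

  #include "paddle/fluid/memory/allocation/allocator_facade.h"
  #include "paddle/fluid/platform/place.h"

  void PinnedStagingDemo() {
    using paddle::memory::allocation::AllocatorFacade;
    auto& facade = AllocatorFacade::Instance();
    // CUDAPinnedPlace routes to CUDAPinnedManagedAllocator, which carves
    // cudaMallocHost() memory into CUDAPinnedMaxChunkSize() chunks via
    // ChunkedManagedAllocator, with retry disabled (retry_time = -1).
    auto allocation =
        facade.Alloc(paddle::platform::CUDAPinnedPlace(), 1 << 20);
    void* staging = allocation->ptr();  // page-locked host memory, suitable
                                        // as a staging buffer for async
                                        // host/device copies
    (void)staging;
  }  // allocation's destructor returns the block to the pinned allocator

Because ChunkedManagedAllocator is now shared by the GPU and pinned paths, capacity and retry policy are constructor parameters instead of being read from GPU state inside the allocator, which is what lets CUDAPinnedManagedAllocator size itself from CpuTotalPhysicalMemory() instead of GpuMemoryUsage().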