Commit 2002e71d authored by sneaxiy

fix pinned allocator

Parent 21fdf8e8
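This commit routes CUDAPinnedPlace allocations through a dedicated pinned-host allocator in the new allocator facade and drops the old kCrossDevice attribute plumbing. A minimal usage sketch, assuming a CUDA-enabled Paddle build and the allocator_facade.h header path (both assumptions; it mirrors the updated test further down):

    // Minimal sketch, assuming a CUDA-enabled build and this header path.
    #include "paddle/fluid/memory/allocation/allocator_facade.h"
    #include "paddle/fluid/platform/place.h"

    void PinnedAllocExample() {
      auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
      // After this commit, CUDAPinnedPlace is served by CUDAPinnedManagedAllocator,
      // and the returned allocation reports CUDAPinnedPlace rather than CPUPlace.
      auto pinned = facade.Alloc(paddle::platform::CUDAPinnedPlace(), 1 << 20);
      // pinned->ptr(), pinned->size(), pinned->place() behave as asserted in the test below.
      (void)pinned;
    }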
@@ -112,8 +112,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   dst->set_layout(src.layout());
   auto src_place = src.place();
   auto src_ptr = src.data<void>();
-  auto dst_ptr =
-      dst->mutable_data(dst_place, src.type(), memory::Allocator::kCrossDevice);
+  auto dst_ptr = dst->mutable_data(dst_place, src.type());
   auto size = src.numel() * SizeOfType(src.type());
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
......
@@ -2,7 +2,10 @@ cc_library(allocator SRCS allocator.cc DEPS place)
 cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
 cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
-nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
+if (WITH_GPU)
+  nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
+endif()
 cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
@@ -29,7 +32,7 @@ cc_library(naive_managed_allocator SRCS naive_managed_allocator.cc DEPS allocato
 cc_test(naive_managed_allocator_test SRCS naive_managed_allocator_test.cc DEPS naive_managed_allocator)
 nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
 if (WITH_GPU)
-  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator)
+  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard)
 else ()
   set(AllocatorFacadeDeps)
 endif()
@@ -48,8 +51,7 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS
     auto_increment_allocator
     zero_size_allocator
     conditional_allocator
-    retry_allocator
-    cuda_device_guard)
+    retry_allocator)

 nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
......
@@ -25,17 +25,18 @@
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
-#include "paddle/fluid/memory/allocation/pinned_allocator.h"
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 #include "paddle/fluid/memory/allocation/zero_size_allocator.h"
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/memory/allocation/cuda_allocator.h"
+#include "paddle/fluid/memory/allocation/pinned_allocator.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/gpu_info.h"
 #endif

-DEFINE_int32(
+DEFINE_int64(
     gpu_allocator_retry_time, 0,
     "The retry time (milliseconds) when allocator fails "
     "to allocate memory. No retry if this value is not greater than 0");
@@ -49,51 +50,34 @@ class CPUManagedAllocator : public ManagedAllocator {
  public:
   CPUManagedAllocator()
       : normal_allocator_(NaiveManagedAllocator::Create(
-            std::unique_ptr<Allocator>(new CPUAllocator()))),
-        communication_allocator_(NaiveManagedAllocator::Create(
-            std::unique_ptr<Allocator>(new CPUPinnedAllocator()))) {}
+            std::unique_ptr<Allocator>(new CPUAllocator()))) {}

   std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
-    if (attr == kCrossDevice) {
-      return communication_allocator_->Allocate(size, attr);
-    } else {
-      return normal_allocator_->Allocate(size, attr);
-    }
+    return normal_allocator_->Allocate(size, attr);
   }

   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
-    if (attr == kCrossDevice) {
-      return communication_allocator_->AllocateShared(size, attr);
-    } else {
-      return normal_allocator_->AllocateShared(size, attr);
-    }
+    return normal_allocator_->AllocateShared(size, attr);
   }

   bool IsAllocThreadSafe() const override { return true; }

  private:
   std::shared_ptr<ManagedAllocator> normal_allocator_;
-  std::shared_ptr<ManagedAllocator> communication_allocator_;
 };

-#ifdef PADDLE_WITH_CUDA
 // TODO(yy): Dirty code here. This class should be configurable in runtime.
-class CUDAManagedAllocator : public ManagedAllocator {
+class ChunkedManagedAllocator : public ManagedAllocator {
  public:
-  explicit CUDAManagedAllocator(int dev_id) {
-    platform::CUDADeviceGuard guard(dev_id);
-    max_chunk_size_ = platform::GpuMaxChunkSize();
-
-    raw_allocator_ = NaiveManagedAllocator::Create(std::unique_ptr<Allocator>(
-        new CUDAAllocator(platform::CUDAPlace(dev_id))));
+  explicit ChunkedManagedAllocator(std::unique_ptr<Allocator> system_allocator,
+                                   size_t max_chunk_size, size_t capacity = 1,
+                                   int64_t retry_time = -1)
+      : max_chunk_size_(max_chunk_size), retry_time_(retry_time) {
+    raw_allocator_ = NaiveManagedAllocator::Create(std::move(system_allocator));

     if (max_chunk_size_ == 0) {
       default_allocator_ = raw_allocator_;
     } else {
-      size_t available, total;
-      platform::GpuMemoryUsage(&available, &total);
-      size_t capacity = available / max_chunk_size_;
-
       if (capacity == 1) {
         VLOG(10) << "Create BestFitAllocator with chunk_size "
                  << max_chunk_size_;
@@ -119,7 +103,7 @@ class CUDAManagedAllocator : public ManagedAllocator {
       default_allocator_.reset(cond_allocator);
     }
   }
-  ~CUDAManagedAllocator() {
+  ~ChunkedManagedAllocator() {
     // Specify destruct order.
     default_allocator_.reset();
     chunks_.clear();
@@ -140,27 +124,71 @@ class CUDAManagedAllocator : public ManagedAllocator {
     std::unique_ptr<Allocator> unmanaged_allocator(new LockedAllocator(
         std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));
-    if (FLAGS_gpu_allocator_retry_time <= 0) {
+    if (retry_time_ <= 0) {
       VLOG(10) << "Create NaiveManagedAllocator without retry";
       return std::make_shared<AlignedAllocator<64u>>(
           NaiveManagedAllocator::Create(std::move(unmanaged_allocator)));
     } else {
-      VLOG(10) << "Create RetryAllocator with retry_time "
-               << FLAGS_gpu_allocator_retry_time << "ms";
+      VLOG(10) << "Create RetryAllocator with retry_time " << retry_time_
+               << "ms";
       return std::make_shared<AlignedAllocator<64u>>(RetryAllocator::Create(
-          std::move(unmanaged_allocator),
-          static_cast<size_t>(FLAGS_gpu_allocator_retry_time)));
+          std::move(unmanaged_allocator), static_cast<size_t>(retry_time_)));
     }
   }

   bool IsAllocThreadSafe() const override { return true; }

- private:
+ protected:
   size_t max_chunk_size_;
+  int64_t retry_time_;
   std::vector<std::unique_ptr<Allocation>> chunks_;
   std::shared_ptr<ManagedAllocator> raw_allocator_;
   std::shared_ptr<ManagedAllocator> default_allocator_;
 };
+
+#ifdef PADDLE_WITH_CUDA
+class CUDAManagedAllocator : public ChunkedManagedAllocator {
+ public:
+  explicit CUDAManagedAllocator(int dev_id)
+      : ChunkedManagedAllocator(
+            std::unique_ptr<Allocator>(
+                new CUDAAllocator(platform::CUDAPlace(dev_id))),
+            GetMaxChunkSize(dev_id), GetCapcity(dev_id), GetRetryTime()) {}
+
+ private:
+  static size_t GetMaxChunkSize(int dev_id) {
+    platform::CUDADeviceGuard guard(dev_id);
+    return platform::GpuMaxChunkSize();
+  }
+
+  static size_t GetCapcity(int dev_id) {
+    platform::CUDADeviceGuard guard(dev_id);
+    size_t available, total;
+    platform::GpuMemoryUsage(&available, &total);
+    size_t max_chunk_size = platform::GpuMaxChunkSize();
+    return max_chunk_size == 0 ? 0 : available / max_chunk_size;
+  }
+
+  static int64_t GetRetryTime() { return FLAGS_gpu_allocator_retry_time; }
+};
+
+class CUDAPinnedManagedAllocator : public ChunkedManagedAllocator {
+ public:
+  CUDAPinnedManagedAllocator()
+      : ChunkedManagedAllocator(
+            std::unique_ptr<Allocator>(new CPUPinnedAllocator()),
+            platform::CUDAPinnedMaxChunkSize(), GetCapacity(), -1) {
+  }  // never retry
+
+ private:
+  static size_t GetCapacity() {
+    size_t total = platform::CpuTotalPhysicalMemory();
+    size_t max_chunk_size = platform::CUDAPinnedMaxChunkSize();
+    return max_chunk_size == 0 ? 0 : total / max_chunk_size;
+  }
+};
 #endif

 class AllocatorFacadePrivate {
@@ -173,6 +201,7 @@ class AllocatorFacadePrivate {
   AllocatorFacadePrivate() {
     InitCPUAllocator();
     InitCUDAAllocator();
+    InitCUDAPinnedAllocator();
     WrapZeroSizeAllocator();
   }
@@ -183,13 +212,21 @@ class AllocatorFacadePrivate {
   void InitCUDAAllocator() {
 #ifdef PADDLE_WITH_CUDA
-    for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
+    int device_count = platform::GetCUDADeviceCount();
+    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
       allocators_[platform::CUDAPlace(dev_id)] =
           std::make_shared<CUDAManagedAllocator>(dev_id);
     }
 #endif
   }

+  void InitCUDAPinnedAllocator() {
+#ifdef PADDLE_WITH_CUDA
+    allocators_[platform::CUDAPinnedPlace()] =
+        std::make_shared<CUDAPinnedManagedAllocator>();
+#endif
+  }
+
   void WrapZeroSizeAllocator() {
     for (auto& pair : allocators_) {
       pair.second =
......
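The refactor above folds the chunking and retry logic into ChunkedManagedAllocator, parameterized by a system allocator, a max chunk size, a chunk capacity, and a retry time: GetCapcity() divides the currently available GPU memory by the chunk size, and the pinned variant divides total host physical memory by CUDAPinnedMaxChunkSize(). A standalone sketch of that capacity arithmetic (plain C++ with made-up byte counts, not Paddle code):

    #include <cstddef>
    #include <cstdio>

    // Mirrors GetCapcity()/GetCapacity(): a zero chunk size means "no chunking".
    static size_t ChunkCapacity(size_t budget_bytes, size_t max_chunk_size) {
      return max_chunk_size == 0 ? 0 : budget_bytes / max_chunk_size;
    }

    int main() {
      // GPU case: the budget is the currently *available* device memory.
      std::printf("gpu chunks: %zu\n", ChunkCapacity(8ull << 30, 1ull << 30));
      // Pinned case: the budget is total host physical memory, scaled elsewhere
      // by FLAGS_fraction_of_cuda_pinned_memory_to_use.
      std::printf("pinned chunks: %zu\n", ChunkCapacity(32ull << 30, 1ull << 30));
      return 0;
    }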
@@ -16,37 +16,70 @@
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>

+#ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_int32(gpu_allocator_retry_time);
+DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_int64(gpu_allocator_retry_time);
+#endif

 namespace paddle {
 namespace memory {
 namespace allocation {

 TEST(allocator, allocator) {
+#ifdef PADDLE_WITH_CUDA
   FLAGS_fraction_of_gpu_memory_to_use = 0.01;
   FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
+#endif

   auto &instance = AllocatorFacade::Instance();
+  platform::Place place;
+  size_t size = 1024;

   {
-    auto cpu_allocation = instance.Alloc(platform::CPUPlace(), 1024);
+    place = platform::CPUPlace();
+    size = 1024;
+    auto cpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(cpu_allocation, nullptr);
+    ASSERT_NE(cpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(cpu_allocation->place(), place);
+    ASSERT_EQ(cpu_allocation->size(), size);
   }

+#ifdef PADDLE_WITH_CUDA
   {
-    auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0), 1024);
+    place = platform::CUDAPlace(0);
+    size = 1024;
+    auto gpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
   }

   {
     // Allocate 2GB gpu memory
-    auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0),
-                                         2 * static_cast<size_t>(1 << 30));
+    place = platform::CUDAPlace(0);
+    size = 2 * static_cast<size_t>(1 << 30);
+    auto gpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
   }

-  {}
+  {
+    place = platform::CUDAPinnedPlace();
+    size = (1 << 20);
+    auto cuda_pinned_allocation =
+        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
+    ASSERT_NE(cuda_pinned_allocation, nullptr);
+    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
+    ASSERT_EQ(cuda_pinned_allocation->place(), place);
+    ASSERT_GE(cuda_pinned_allocation->size(), size);
+  }
+#endif
 }

 }  // namespace allocation
......
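A note on the assertions above: the CPU block checks size() for equality, while the GPU and pinned blocks only check size() >= requested, since the facade wraps those allocators in AlignedAllocator<64u> and may round requests up. A tiny standalone check of the same invariant (illustrative numbers only, not Paddle code):

    #include <cassert>
    #include <cstddef>

    int main() {
      size_t requested = 1024;
      // e.g. round up to a 64-byte boundary, as an aligned allocator might do
      size_t aligned = (requested + 63) / 64 * 64;
      assert(aligned >= requested);
      return 0;
    }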
@@ -17,6 +17,7 @@
 #include <atomic>      // NOLINT
 #include <functional>
 #include <memory>
+#include <mutex>  // NOLINT
 #include <thread>  // NOLINT
 #include <vector>
 #include "paddle/fluid/memory/allocation/allocator.h"
......
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/memory/allocation/locked_allocator.h"
+#include <mutex>  // NOLINT

 namespace paddle {
 namespace memory {
......
@@ -13,6 +13,7 @@
 // limitations under the License.

 #pragma once
 #include <memory>
+#include <mutex>  // NOLINT
 #include <thread>  // NOLINT
 #include "paddle/fluid/memory/allocation/allocator.h"
......
@@ -22,9 +22,9 @@ namespace allocation {
 std::unique_ptr<Allocation> CPUPinnedAllocator::Allocate(size_t size,
                                                          Allocator::Attr attr) {
-  PADDLE_ENFORCE_EQ(
-      attr, kCrossDevice,
-      "CPUPinnedAllocator should be used for Cross-Device Communication");
+  // PADDLE_ENFORCE_EQ(
+  //     attr, kCrossDevice,
+  //     "CPUPinnedAllocator should be used for Cross-Device Communication");
   void* ptr;
   PADDLE_ENFORCE(cudaMallocHost(&ptr, size));
......
@@ -23,7 +23,7 @@ namespace allocation {
 class CPUPinnedAllocation : public Allocation {
  public:
   CPUPinnedAllocation(void* ptr, size_t size)
-      : Allocation(ptr, size, platform::CPUPlace()) {}
+      : Allocation(ptr, size, platform::CUDAPinnedPlace()) {}
 };

 class CPUPinnedAllocator : public UnmanagedAllocator {
......
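With this change a pinned allocation reports CUDAPinnedPlace rather than CPUPlace, matching how it is produced: page-locked host memory from cudaMallocHost. A standalone CUDA-runtime sketch of that allocation path (not Paddle code; error handling simplified):

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
      void* ptr = nullptr;
      size_t size = 1 << 20;
      if (cudaMallocHost(&ptr, size) != cudaSuccess) {
        std::fprintf(stderr, "cudaMallocHost failed\n");
        return 1;
      }
      // Pinned (page-locked) host memory: the driver can DMA directly from it,
      // which is why it is tagged with CUDAPinnedPlace rather than CPUPlace.
      std::printf("pinned %zu bytes at %p\n", size, ptr);
      cudaFreeHost(ptr);
      return 0;
    }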
@@ -30,12 +30,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"

-// If use_pinned_memory is true, CPUAllocator calls mlock, which
-// returns pinned and locked memory as staging areas for data exchange
-// between host and device. Allocates too much would reduce the amount
-// of memory available to the system for paging. So, by default, we
-// should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+DECLARE_bool(use_pinned_memory);
 DECLARE_double(fraction_of_gpu_memory_to_use);
 namespace paddle {
 namespace memory {
......
@@ -98,7 +98,6 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace& place) {
 }

 #ifdef PADDLE_WITH_CUDA
-
 BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   static std::once_flag init_flag;
   static detail::BuddyAllocator** a_arr = nullptr;
@@ -128,15 +127,21 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   platform::SetDeviceId(gpu_id);
   return a_arr[gpu_id];
 }
+#endif

 template <>
 size_t Used<platform::CUDAPlace>(const platform::CUDAPlace& place) {
+#ifdef PADDLE_WITH_CUDA
   return GetGPUBuddyAllocator(place.device)->Used();
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void* Alloc<platform::CUDAPlace>(const platform::CUDAPlace& place,
                                  size_t size) {
+#ifdef PADDLE_WITH_CUDA
   auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
   auto* ptr = buddy_allocator->Alloc(size);
   if (ptr == nullptr) {
@@ -156,13 +161,21 @@ void* Alloc<platform::CUDAPlace>(const platform::CUDAPlace& place,
     cudaMemset(ptr, 0xEF, size);
   }
   return ptr;
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void Free<platform::CUDAPlace>(const platform::CUDAPlace& place, void* p) {
+#ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }

+#ifdef PADDLE_WITH_CUDA
 BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
   static std::once_flag init_flag;
   static BuddyAllocator* ba = nullptr;
@@ -176,15 +189,21 @@ BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
   return ba;
 }
+#endif

 template <>
 size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place) {
+#ifdef PADDLE_WITH_CUDA
   return GetCUDAPinnedBuddyAllocator()->Used();
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void* Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
                                        size_t size) {
+#ifdef PADDLE_WITH_CUDA
   auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
   void* ptr = buddy_allocator->Alloc(size);
@@ -196,14 +215,20 @@ void* Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
     memset(ptr, 0xEF, size);
   }
   return ptr;
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
                                      void* p) {
+#ifdef PADDLE_WITH_CUDA
   GetCUDAPinnedBuddyAllocator()->Free(p);
-}
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
 #endif
+}

 struct AllocVisitor : public boost::static_visitor<void*> {
   inline explicit AllocVisitor(size_t size) : size_(size) {}
......
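The hunks above make every CUDAPlace / CUDAPinnedPlace specialization compile in CPU-only builds and throw only if it is actually reached at runtime. A minimal sketch of the same guard pattern with a hypothetical function name (DemoPinnedUsed is not a Paddle symbol; PADDLE_WITH_CUDA is the same macro the file keys on):

    #include <cstddef>
    #include <stdexcept>

    size_t DemoPinnedUsed() {
    #ifdef PADDLE_WITH_CUDA
      return 0;  // a real build would query the pinned buddy allocator here
    #else
      throw std::runtime_error(
          "'CUDAPinnedPlace' is not supported in CPU only device.");
    #endif
    }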
@@ -27,6 +27,8 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
 }

 #ifdef PADDLE_WITH_CUDA
+static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
+
 template <>
 void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
@@ -36,6 +38,10 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
   } else {
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
+    // FIXME(zjl): do we really need it?
+    if (num <= kMaxGpuAsyncCopyBytes) {
+      cudaStreamSynchronize(0);
+    }
   }
 }
@@ -48,6 +54,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
   } else {
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
+    // FIXME(zjl): do we really need it?
+    if (num <= kMaxGpuAsyncCopyBytes) {
+      cudaStreamSynchronize(0);
+    }
   }
 }
......
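The copy paths above now synchronize the legacy default stream after a synchronous copy of at most kMaxGpuAsyncCopyBytes; the FIXME(zjl) comment itself questions whether this is needed. A standalone sketch of the same control flow using the CUDA runtime directly (an assumption: GpuMemcpySync is treated here as a plain cudaMemcpy):

    #include <cuda_runtime.h>
    #include <cstddef>

    static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K

    void CopyDeviceToHostSync(void* dst, const void* src, size_t num) {
      cudaMemcpy(dst, src, num, cudaMemcpyDeviceToHost);
      // Mirrors the FIXME(zjl) above: possibly redundant, kept for safety.
      if (num <= kMaxGpuAsyncCopyBytes) {
        cudaStreamSynchronize(0);
      }
    }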
@@ -56,10 +56,17 @@ DEFINE_double(
     "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
     "reserve the rest for page tables, etc");

+// If use_pinned_memory is true, CPUAllocator calls mlock, which
+// returns pinned and locked memory as staging areas for data exchange
+// between host and device. Allocates too much would reduce the amount
+// of memory available to the system for paging. So, by default, we
+// should set false to use_pinned_memory.
+DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+
 namespace paddle {
 namespace platform {

-inline size_t CpuTotalPhysicalMemory() {
+size_t CpuTotalPhysicalMemory() {
 #ifdef __APPLE__
   int mib[2];
   mib[0] = CTL_HW;
......
@@ -19,6 +19,8 @@ limitations under the License. */
 namespace paddle {
 namespace platform {

+size_t CpuTotalPhysicalMemory();
+
 //! Get the maximum allocation size for a machine.
 size_t CpuMaxAllocSize();
......
@@ -13,11 +13,11 @@ limitations under the License. */
 #include <string>
 #include <unordered_set>
 #include <vector>
-#include "paddle/fluid/platform/cuda_device_guard.h"

 #include "paddle/fluid/memory/memory.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/framework/rw_lock.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
 #endif

 namespace paddle {
......
@@ -19,7 +19,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
......
@@ -63,8 +63,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
 #ifdef PADDLE_WITH_CUDA
       auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
       auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
-          tensor.dims(), platform::CPUPlace(),
-          memory::Allocator::kCrossDevice));
+          tensor.dims(), platform::CPUPlace()));

       paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
                                       sizeof(CUR_TYPE) * tensor.numel(),
......
@@ -110,10 +110,10 @@ def __bootstrap__():
         os.environ['OMP_NUM_THREADS'] = str(num_threads)

     read_env_flags = [
-        'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope',
-        'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
-        'paddle_num_threads', "dist_threadpool_size", 'cpu_deterministic',
-        'eager_delete_tensor_gb', 'use_legacy_allocator'
+        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
+        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
+        'init_allocated_mem', 'paddle_num_threads', "dist_threadpool_size",
+        'cpu_deterministic', 'eager_delete_tensor_gb', 'use_legacy_allocator'
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
......