Commit 18eb7730 authored by chengduoZH

add CUDAPinnedPlace

Parent f3dc3112
@@ -45,11 +45,10 @@ class Tensor {
   friend struct EigenVector;

  public:
-  Tensor() : offset_(0), is_pinned_(false) {}
+  Tensor() : offset_(0) {}

   /*! Constructor with place should only be used in pybind. */
-  explicit Tensor(const platform::Place& place)
-      : offset_(0), is_pinned_(false) {
+  explicit Tensor(const platform::Place& place) : offset_(0) {
     holder_->set_place(place);
   }

@@ -70,12 +69,11 @@ class Tensor {
    * @note If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(platform::Place place, bool is_pinned = false);
+  inline T* mutable_data(platform::Place place);

-  inline void* mutable_data(platform::Place place, std::type_index type,
-                            bool is_pinned = false);
+  inline void* mutable_data(platform::Place place, std::type_index type);

-  inline void* mutable_data(platform::Place place, bool is_pinned = false);
+  inline void* mutable_data(platform::Place place);

   /**
    * @brief Return a pointer to mutable memory block.
@@ -86,8 +84,7 @@ class Tensor {
    * @note If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(DDim dims, platform::Place place,
-                         bool is_pinned = false);
+  inline T* mutable_data(DDim dims, platform::Place place);

   /*! Return the dimensions of the memory block. */
   inline const DDim& dims() const;

@@ -152,14 +149,12 @@ class Tensor {
   template <typename Place>
   struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(Place place, size_t size, std::type_index type,
-                    bool is_pinned = false)
-        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size, is_pinned)),
-               memory::PODDeleter<uint8_t, Place>(place, is_pinned)),
+    PlaceholderImpl(Place place, size_t size, std::type_index type)
+        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
+               memory::PODDeleter<uint8_t, Place>(place)),
           place_(place),
           size_(size),
-          type_(type),
-          is_pinned_(is_pinned) {
+          type_(type) {
       PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
                               (is_cpu_place(place_) ? "CPU" : "GPU"));
     }

@@ -182,9 +177,6 @@ class Tensor {
     /* the current type of memory */
     std::type_index type_;
-
-    /*! use pinned memory or not. */
-    bool is_pinned_;
   };

   /*! holds the memory block if allocated. */
@@ -219,7 +211,6 @@ class Tensor {
    * PlaceHolder::ptr_ and where the tensor data really begins.
    */
   size_t offset_;
-  bool is_pinned_;
 };

 inline void Tensor::switch_place(platform::Place new_place) {
......
@@ -101,21 +101,19 @@ inline T* Tensor::data() {
 }

 template <typename T>
-inline T* Tensor::mutable_data(DDim dims, platform::Place place,
-                               bool is_pinned) {
+inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
   static_assert(std::is_pod<T>::value, "T must be POD");
   Resize(dims);
-  return mutable_data<T>(place, is_pinned);
+  return mutable_data<T>(place);
 }

 template <typename T>
-inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) {
+inline T* Tensor::mutable_data(platform::Place place) {
   static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(place, typeid(T), is_pinned));
+  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
 }

-inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
-                                  bool is_pinned) {
+inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
   if (holder_ != nullptr) {
     holder_->set_type(type);
   }
@@ -129,27 +127,26 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
       holder_->size() < size + offset_) {
     if (platform::is_cpu_place(place)) {
       holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size, type, is_pinned));
+          boost::get<platform::CPUPlace>(place), size, type));
     } else if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
       PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
     }
 #else
       holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-          boost::get<platform::CUDAPlace>(place), size, type, is_pinned));
+          boost::get<platform::CUDAPlace>(place), size, type));
     }
 #endif
     offset_ = 0;
-    is_pinned_ = is_pinned;
   }
   return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                  offset_);
 }

-inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) {
+inline void* Tensor::mutable_data(platform::Place place) {
   PADDLE_ENFORCE(this->holder_ != nullptr,
                  "Cannot invoke mutable data if current hold nothing");
-  return mutable_data(place, holder_->type(), is_pinned);
+  return mutable_data(place, holder_->type());
 }

 inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
@@ -191,8 +188,6 @@ inline const DDim& Tensor::dims() const { return dims_; }

 inline int64_t Tensor::numel() const { return product(dims_); }

-inline bool Tensor::isPinned() const { return is_pinned_; }
-
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
   Tensor res;
   res.ShareDataWith(src);
......
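For orientation, a minimal usage sketch (not part of this commit): after this change the caller selects pinned or unpinned memory purely through the place type, and the old is_pinned flag disappears from Tensor's interface. The include path, shape, and device id below are assumptions for illustration and presume a CUDA-enabled build.

#include "paddle/fluid/framework/tensor.h"

void TensorMutableDataSketch() {
  paddle::framework::Tensor t;
  // Allocate a 4 x 8 float block on GPU 0; no is_pinned argument any more.
  float* gpu_data = t.mutable_data<float>(paddle::framework::make_ddim({4, 8}),
                                          paddle::platform::CUDAPlace(0));
  (void)gpu_data;  // suppress unused-variable warnings in this sketch
}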
@@ -123,20 +123,20 @@ bool GPUAllocator::UseGpu() const { return true; }
 // memory. It’s locked to a physical address.
 void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
   if (size <= 0) return nullptr;
   void* p;
-  // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
+  // NOTE: here, we use CpuMaxAllocSize()/2 as the maximum memory size
   // of host pinned allocation. Allocates too much would reduce
   // the amount of memory available to the underlying system for paging.
-  size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
+  size_t usable = CpuMaxAllocSize() / 2 - cuda_pinnd_alloc_size_;

   if (size > usable) return nullptr;

   // PINNED memory is visible to all CUDA contexts.
   cudaError_t result = cudaMallocHost(&p, size);

   if (result == cudaSuccess) {
-    index = 1;
-    fallback_alloc_size_ += size;
+    index = 1;  // PINNED memory
+    cuda_pinnd_alloc_size_ += size;
     return p;
   }

@@ -147,8 +147,8 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
   cudaError_t err;
   PADDLE_ASSERT(index == 1);
-  PADDLE_ASSERT(fallback_alloc_size_ >= size);
-  fallback_alloc_size_ -= size;
+  PADDLE_ASSERT(cuda_pinnd_alloc_size_ >= size);
+  cuda_pinnd_alloc_size_ -= size;
   err = cudaFreeHost(p);

   // Purposefully allow cudaErrorCudartUnloading, because
......
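As background for the CUDAPinnedAllocator changes above, the sketch below (not part of this commit) shows the bare CUDA runtime pattern the allocator wraps: cudaMallocHost returns page-locked host memory, which is what allows host-to-device copies to run asynchronously. The function name and parameters are assumptions for illustration.

#include <cuda_runtime.h>

// Copy `bytes` from a freshly pinned host buffer to `dev_ptr` on `stream`.
int PinnedCopySketch(void* dev_ptr, size_t bytes, cudaStream_t stream) {
  void* host_ptr = nullptr;
  if (cudaMallocHost(&host_ptr, bytes) != cudaSuccess) return -1;  // pinned
  // ... fill host_ptr here ...
  // The copy can overlap other work because the source is page-locked.
  cudaMemcpyAsync(dev_ptr, host_ptr, bytes, cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);
  cudaFreeHost(host_ptr);  // pinned memory must be released with cudaFreeHost
  return 0;
}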
@@ -59,9 +59,7 @@ class CUDAPinnedAllocator : public SystemAllocator {
   virtual bool UseGpu() const;

  private:
-  size_t gpu_alloc_size_ =
-      0;  // TODO(zcd): how to define the upper limit of CUDAPinnedMemory?
-  size_t fallback_alloc_size_ = 0;
+  size_t cuda_pinnd_alloc_size_ = 0;
 };
 #endif
......
@@ -38,8 +38,7 @@ BuddyAllocator* GetCPUBuddyAllocator() {
 }

 template <>
-void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
-                                bool is_pinned) {
+void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
   VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
   void* p = GetCPUBuddyAllocator()->Alloc(size);
   VLOG(10) << "  pointer=" << p;
@@ -47,8 +46,7 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
 }

 template <>
-void Free<platform::CPUPlace>(platform::CPUPlace place, void* p,
-                              bool is_pinned) {
+void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
   VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
   GetCPUBuddyAllocator()->Free(p);
 }
@@ -85,27 +83,13 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
 }

 BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
-  static BuddyAllocator** as = NULL;
+  static BuddyAllocator* as = NULL;
   if (as == NULL) {
-    int gpu_num = platform::GetCUDADeviceCount();
-    as = new BuddyAllocator*[gpu_num];
-    for (int gpu = 0; gpu < gpu_num; gpu++) {
-      as[gpu] = nullptr;
-    }
-  }
-  platform::SetDeviceId(gpu_id);
-  if (!as[gpu_id]) {
-    as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator,
-                                    platform::GpuMinChunkSize(),
-                                    platform::GpuMaxChunkSize());
-    VLOG(10) << "\n\nNOTE: each GPU device use "
-             << FLAGS_fraction_of_gpu_memory_to_use * 100
-             << "% of GPU memory.\n"
-             << "You can set GFlags environment variable '"
-             << "FLAGS_fraction_of_gpu_memory_to_use"
-             << "' to change the fraction of GPU usage.\n\n";
+    as = new BuddyAllocator(new detail::CUDAPinnedAllocator,
+                            platform::CpuMinChunkSize(),
+                            platform::CpuMaxChunkSize());
   }
-  return as[gpu_id];
+  return as;
 }

 template <>
@@ -114,16 +98,9 @@ size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
 }

 template <>
-void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
-                                 bool is_pinned) {
-  void* ptr;
-  if (is_pinned) {
-    auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
-    ptr = buddy_allocator->Alloc(size);
-  } else {
-    auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
-    ptr = buddy_allocator->Alloc(size);
-  }
+void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
+  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+  void* ptr = buddy_allocator->Alloc(size);
   if (ptr == nullptr) {
     int cur_dev = platform::GetCurrentDeviceId();
@@ -142,13 +119,39 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
 }

 template <>
-void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p,
-                               bool is_pinned) {
-  if (is_pinned) {
-    GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
-  } else {
-    GetGPUBuddyAllocator(place.device)->Free(p);
-  }
+void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
+  GetGPUBuddyAllocator(place.device)->Free(p);
+}
+
+template <>
+size_t Used<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place) {
+  return GetGPUBuddyAllocator(place.device)->Used();
+}
+
+template <>
+void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
+                                       size_t size) {
+  auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
+  void* ptr = buddy_allocator->Alloc(size);
+  if (ptr == nullptr) {
+    int cur_dev = platform::GetCurrentDeviceId();
+    platform::SetDeviceId(place.device);
+    size_t avail, total;
+    platform::GpuMemoryUsage(avail, total);
+    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
+                 << place.device << ", available " << avail << " bytes";
+    LOG(WARNING) << "total " << total;
+    LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
+    LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
+    LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
+    platform::SetDeviceId(cur_dev);
+  }
+  return ptr;
+}
+
+template <>
+void Free<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place, void* p) {
+  GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
 }
 #endif
@@ -165,6 +168,10 @@ size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
 #endif
 }

+size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
+  return Used(cuda_pinned);
+}
+
 size_t memory_usage(const platform::Place& p) {
   return boost::apply_visitor(Usage(), p);
 }
......
@@ -57,6 +57,7 @@ size_t Used(Place place);
 struct Usage : public boost::static_visitor<size_t> {
   size_t operator()(const platform::CPUPlace& cpu) const;
   size_t operator()(const platform::CUDAPlace& gpu) const;
+  size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
 };

 size_t memory_usage(const platform::Place& p);
......
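A small sketch (not from this commit) of how the extended Usage visitor is reached: memory_usage() applies it to the Place variant, so a CUDAPinnedPlace value now dispatches to the new overload. Include paths are assumed from the PaddlePaddle fluid layout.

#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/place.h"

size_t ReportPinnedUsage() {
  // The variant holds a CUDAPinnedPlace; boost::apply_visitor picks
  // Usage::operator()(const platform::CUDAPinnedPlace&).
  paddle::platform::Place place = paddle::platform::CUDAPinnedPlace();
  return paddle::memory::memory_usage(place);
}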
@@ -118,6 +118,18 @@ struct DefaultDeviceContextType<platform::CUDAPlace> {
   using TYPE = CUDADeviceContext;
 };

+// Currently, CUDAPinnedDeviceContext is only used to data copying.
+// class CUDAPinnedDeviceContext : public DeviceContext {
+//  public:
+//   CUDAPinnedDeviceContext();
+//   explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place);
+//
+//   Place GetPlace() const override;
+//
+//  private:
+//   CUDAPinnedPlace place_;
+// };
+
 #endif

 #ifdef PADDLE_WITH_MKLDNN
......
@@ -40,12 +40,19 @@ const Place &get_place() { return the_default_place; }
 const CUDAPlace default_gpu() { return CUDAPlace(0); }
 const CPUPlace default_cpu() { return CPUPlace(); }
+const CUDAPinnedPlace default_cuda_pinned() { return CUDAPinnedPlace(); }

 bool is_gpu_place(const Place &p) {
   return boost::apply_visitor(IsCUDAPlace(), p);
 }

-bool is_cpu_place(const Place &p) { return !is_gpu_place(p); }
+bool is_cpu_place(const Place &p) {
+  return boost::apply_visitor(IsCPUPlace(), p);
+}
+
+bool is_cuda_pinned_place(const Place &p) {
+  return boost::apply_visitor(IsCUDAPinnedPlace(), p);
+}

 bool places_are_same_class(const Place &p1, const Place &p2) {
   return p1.which() == p2.which();
@@ -53,7 +60,7 @@ bool places_are_same_class(const Place &p1, const Place &p2) {
 bool is_same_place(const Place &p1, const Place &p2) {
   if (places_are_same_class(p1, p2)) {
-    if (is_cpu_place(p1)) {
+    if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) {
       return true;
     } else {
       return boost::get<CUDAPlace>(p1) == boost::get<CUDAPlace>(p2);
......
@@ -45,12 +45,33 @@ struct CUDAPlace {
   int device;
 };

+struct CUDAPinnedPlace {
+  CUDAPinnedPlace() {}
+
+  // needed for variant equality comparison
+  inline bool operator==(const CUDAPinnedPlace &) const { return true; }
+  inline bool operator!=(const CUDAPinnedPlace &) const { return false; }
+};
+
 struct IsCUDAPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &gpu) const { return true; }
+  bool operator()(const CUDAPinnedPlace &) const { return false; }
 };

-typedef boost::variant<CUDAPlace, CPUPlace> Place;
+struct IsCPUPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &cpu) const { return true; }
+  bool operator()(const CUDAPlace &) const { return false; }
+  bool operator()(const CUDAPinnedPlace &) const { return false; }
+};
+
+struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const CUDAPlace &) const { return false; }
+  bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
+};
+
+typedef boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace> Place;

 using PlaceList = std::vector<Place>;
@@ -59,9 +80,11 @@ const Place &get_place();
 const CUDAPlace default_gpu();
 const CPUPlace default_cpu();
+const CUDAPinnedPlace default_cuda_pinned();

 bool is_gpu_place(const Place &);
 bool is_cpu_place(const Place &);
+bool is_cuda_pinned_place(const Place &);
 bool places_are_same_class(const Place &, const Place &);
 bool is_same_place(const Place &, const Place &);
@@ -97,6 +120,11 @@ struct PlaceVisitorWrapper
     return typename Visitor::result_type();
 #endif
   }
+
+  typename Visitor::result_type operator()(
+      const CUDAPinnedPlace &cuda_pinned) const {
+    return visitor_(cuda_pinned);
+  }
 };

 template <typename Visitor>
......
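Finally, a brief sketch (not part of the commit) of the new place predicates, assuming the Place definitions as patched above: is_cpu_place() now consults IsCPUPlace instead of being defined as !is_gpu_place(), so a pinned place is reported as neither a CPU nor a GPU place. The include path is assumed from the PaddlePaddle fluid layout.

#include <iostream>
#include "paddle/fluid/platform/place.h"

int main() {
  paddle::platform::Place p = paddle::platform::CUDAPinnedPlace();
  std::cout << paddle::platform::is_gpu_place(p) << "\n";          // 0
  std::cout << paddle::platform::is_cpu_place(p) << "\n";          // 0
  std::cout << paddle::platform::is_cuda_pinned_place(p) << "\n";  // 1
  return 0;
}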