From 18eb77303d24a53f0d8312385527d4e9c3a674a3 Mon Sep 17 00:00:00 2001
From: chengduoZH
Date: Mon, 26 Mar 2018 19:12:05 +0800
Subject: [PATCH] add CUDAPinnedPlace

---
 paddle/fluid/framework/tensor.h               | 29 +++----
 paddle/fluid/framework/tensor_impl.h          | 23 ++---
 .../fluid/memory/detail/system_allocator.cc   | 16 ++--
 paddle/fluid/memory/detail/system_allocator.h |  4 +-
 paddle/fluid/memory/memory.cc                 | 85 ++++++++++---------
 paddle/fluid/memory/memory.h                  |  1 +
 paddle/fluid/platform/device_context.h        | 12 +++
 paddle/fluid/platform/place.cc                | 11 ++-
 paddle/fluid/platform/place.h                 | 30 ++++++-
 9 files changed, 125 insertions(+), 86 deletions(-)

diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index f7a6b5ba84..6eb678e301 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -45,11 +45,10 @@ class Tensor {
   friend struct EigenVector;
 
  public:
-  Tensor() : offset_(0), is_pinned_(false) {}
+  Tensor() : offset_(0) {}
 
   /*! Constructor with place should only be used in pybind. */
-  explicit Tensor(const platform::Place& place)
-      : offset_(0), is_pinned_(false) {
+  explicit Tensor(const platform::Place& place) : offset_(0) {
     holder_->set_place(place);
   }
 
@@ -70,12 +69,11 @@ class Tensor {
    * @note    If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(platform::Place place, bool is_pinned = false);
+  inline T* mutable_data(platform::Place place);
 
-  inline void* mutable_data(platform::Place place, std::type_index type,
-                            bool is_pinned = false);
+  inline void* mutable_data(platform::Place place, std::type_index type);
 
-  inline void* mutable_data(platform::Place place, bool is_pinned = false);
+  inline void* mutable_data(platform::Place place);
 
   /**
    * @brief   Return a pointer to mutable memory block.
   *
@@ -86,8 +84,7 @@ class Tensor {
    * @note    If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(DDim dims, platform::Place place,
-                         bool is_pinned = false);
+  inline T* mutable_data(DDim dims, platform::Place place);
 
   /*! Return the dimensions of the memory block. */
   inline const DDim& dims() const;
@@ -152,14 +149,12 @@ class Tensor {
 
   template <typename Place>
   struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(Place place, size_t size, std::type_index type,
-                    bool is_pinned = false)
-        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size, is_pinned)),
-               memory::PODDeleter<uint8_t, Place>(place, is_pinned)),
+    PlaceholderImpl(Place place, size_t size, std::type_index type)
+        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
+               memory::PODDeleter<uint8_t, Place>(place)),
           place_(place),
           size_(size),
-          type_(type),
-          is_pinned_(is_pinned) {
+          type_(type) {
       PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
                               (is_cpu_place(place_) ? "CPU" : "GPU"));
     }
@@ -182,9 +177,6 @@ class Tensor {
 
     /* the current type of memory */
     std::type_index type_;
-
-    /*! use pinned memory or not. */
-    bool is_pinned_;
   };
 
   /*! holds the memory block if allocated. */
@@ -219,7 +211,6 @@ class Tensor {
    * PlaceHolder::ptr_ and where the tensor data really begins.
    */
   size_t offset_;
-  bool is_pinned_;
 };
 
 inline void Tensor::switch_place(platform::Place new_place) {
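Usage sketch (illustrative, not part of the diff): with is_pinned gone from the API, the memory kind is selected by the place type alone. A minimal sketch of the new allocation call shape, assuming a PADDLE_WITH_CUDA build and the headers changed in this patch:

    #include "paddle/fluid/memory/memory.h"
    #include "paddle/fluid/platform/place.h"

    int main() {
      paddle::platform::CUDAPinnedPlace pinned;
      // Before this patch: memory::Alloc(cuda_place, 1024, /*is_pinned=*/true).
      // Now the CUDAPinnedPlace type itself routes to the pinned allocator.
      void* p = paddle::memory::Alloc(pinned, 1024);
      paddle::memory::Free(pinned, p);
      return 0;
    }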
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 113814971e..7a48390440 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -101,21 +101,19 @@ inline T* Tensor::data() {
 }
 
 template <typename T>
-inline T* Tensor::mutable_data(DDim dims, platform::Place place,
-                               bool is_pinned) {
+inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
   static_assert(std::is_pod<T>::value, "T must be POD");
   Resize(dims);
-  return mutable_data<T>(place, is_pinned);
+  return mutable_data<T>(place);
 }
 
 template <typename T>
-inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) {
+inline T* Tensor::mutable_data(platform::Place place) {
   static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(place, typeid(T), is_pinned));
+  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
 }
 
-inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
-                                  bool is_pinned) {
+inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
   if (holder_ != nullptr) {
     holder_->set_type(type);
   }
@@ -129,27 +127,26 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
       holder_->size() < size + offset_) {
     if (platform::is_cpu_place(place)) {
       holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size, type, is_pinned));
+          boost::get<platform::CPUPlace>(place), size, type));
     } else if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
       PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
     }
 #else
       holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-          boost::get<platform::CUDAPlace>(place), size, type, is_pinned));
+          boost::get<platform::CUDAPlace>(place), size, type));
     }
 #endif
     offset_ = 0;
-    is_pinned_ = is_pinned;
   }
   return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                  offset_);
 }
 
-inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) {
+inline void* Tensor::mutable_data(platform::Place place) {
   PADDLE_ENFORCE(this->holder_ != nullptr,
                  "Cannot invoke mutable data if current hold nothing");
-  return mutable_data(place, holder_->type(), is_pinned);
+  return mutable_data(place, holder_->type());
 }
 
 inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
@@ -191,8 +188,6 @@ inline const DDim& Tensor::dims() const { return dims_; }
 
 inline int64_t Tensor::numel() const { return product(dims_); }
 
-inline bool Tensor::isPinned() const { return is_pinned_; }
-
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
   Tensor res;
   res.ShareDataWith(src);
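Usage sketch (illustrative, not part of the diff): mutable_data only re-allocates when the requested size exceeds the block currently held, so shrinking within capacity reuses the holder. Assuming the Tensor API above:

    #include "paddle/fluid/framework/ddim.h"
    #include "paddle/fluid/framework/tensor.h"
    #include "paddle/fluid/platform/place.h"

    void CpuTensorExample() {
      paddle::framework::Tensor t;
      t.Resize(paddle::framework::make_ddim({4, 16}));
      // Allocates 4 * 16 * sizeof(float) bytes on the CPU.
      float* data = t.mutable_data<float>(paddle::platform::CPUPlace());
      data[0] = 1.0f;
      t.Resize(paddle::framework::make_ddim({2, 16}));
      // Smaller request: the existing holder is reused, no re-allocation.
      t.mutable_data<float>(paddle::platform::CPUPlace());
    }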
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 71d28dcbad..d20c5c8682 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -123,20 +123,21 @@ bool GPUAllocator::UseGpu() const { return true; }
 // memory. It's locked to a physical address.
 void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
   if (size <= 0) return nullptr;
-  void* p;
-  // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
+
+  // NOTE: here, we use CpuMaxAllocSize()/2 as the maximum memory size
   // of host pinned allocation. Allocating too much would reduce
   // the amount of memory available to the underlying system for paging.
-
-  size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
+  size_t usable = CpuMaxAllocSize() / 2 - cuda_pinned_alloc_size_;
 
   if (size > usable) return nullptr;
 
+  void* p;
   // PINNED memory is visible to all CUDA contexts.
   cudaError_t result = cudaMallocHost(&p, size);
+
   if (result == cudaSuccess) {
-    index = 1;
-    fallback_alloc_size_ += size;
+    index = 1;  // PINNED memory
+    cuda_pinned_alloc_size_ += size;
     return p;
   }
 
@@ -147,8 +148,8 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
   cudaError_t err;
   PADDLE_ASSERT(index == 1);
 
-  PADDLE_ASSERT(fallback_alloc_size_ >= size);
-  fallback_alloc_size_ -= size;
+  PADDLE_ASSERT(cuda_pinned_alloc_size_ >= size);
+  cuda_pinned_alloc_size_ -= size;
   err = cudaFreeHost(p);
 
   // Purposefully allow cudaErrorCudartUnloading, because
diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h
index 3e024125fa..c2f474f4b6 100644
--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
@@ -59,9 +59,7 @@ class CUDAPinnedAllocator : public SystemAllocator {
   virtual bool UseGpu() const;
 
  private:
-  size_t gpu_alloc_size_ =
-      0;  // TODO(zcd): how to define the upper limit of CUDAPinnedMemory?
-  size_t fallback_alloc_size_ = 0;
+  size_t cuda_pinned_alloc_size_ = 0;
 };
 
 #endif
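The cap above reserves half of the host allocation budget for pinned memory, so the OS keeps enough pageable memory for itself. A standalone sketch of that accounting rule (the 64 GiB budget is a hypothetical stand-in for CpuMaxAllocSize()):

    #include <cstddef>

    const size_t kHostBudget = size_t(64) << 30;  // hypothetical CpuMaxAllocSize()
    size_t pinned_in_use = 0;                     // mirrors cuda_pinned_alloc_size_

    bool CanAllocPinned(size_t size) {
      // Never let pinned allocations exceed half the host budget.
      size_t usable = kHostBudget / 2 - pinned_in_use;
      return size <= usable;
    }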
diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc
index f2d5f250bf..6da9f00656 100644
--- a/paddle/fluid/memory/memory.cc
+++ b/paddle/fluid/memory/memory.cc
@@ -38,8 +38,7 @@ BuddyAllocator* GetCPUBuddyAllocator() {
 }
 
 template <>
-void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
-                                bool is_pinned) {
+void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
   VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
   void* p = GetCPUBuddyAllocator()->Alloc(size);
   VLOG(10) << "  pointer=" << p;
@@ -47,8 +46,7 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
 }
 
 template <>
-void Free<platform::CPUPlace>(platform::CPUPlace place, void* p,
-                              bool is_pinned) {
+void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
   VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
   GetCPUBuddyAllocator()->Free(p);
 }
@@ -85,27 +83,13 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
 }
 
-BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
-  static BuddyAllocator** as = NULL;
+BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
+  static BuddyAllocator* as = NULL;
   if (as == NULL) {
-    int gpu_num = platform::GetCUDADeviceCount();
-    as = new BuddyAllocator*[gpu_num];
-    for (int gpu = 0; gpu < gpu_num; gpu++) {
-      as[gpu] = nullptr;
-    }
-  }
-  platform::SetDeviceId(gpu_id);
-  if (!as[gpu_id]) {
-    as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator,
-                                    platform::GpuMinChunkSize(),
-                                    platform::GpuMaxChunkSize());
-    VLOG(10) << "\n\nNOTE: each GPU device use "
-             << FLAGS_fraction_of_gpu_memory_to_use * 100
-             << "% of GPU memory.\n"
-             << "You can set GFlags environment variable '"
-             << "FLAGS_fraction_of_gpu_memory_to_use"
-             << "' to change the fraction of GPU usage.\n\n";
+    as = new BuddyAllocator(new detail::CUDAPinnedAllocator,
+                            platform::CpuMinChunkSize(),
+                            platform::CpuMaxChunkSize());
   }
-  return as[gpu_id];
+  return as;
 }
 
 template <>
@@ -114,16 +98,9 @@ size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
   return GetGPUBuddyAllocator(place.device)->Used();
 }
 
 template <>
-void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
-                                 bool is_pinned) {
-  void* ptr;
-  if (is_pinned) {
-    auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
-    ptr = buddy_allocator->Alloc(size);
-  } else {
-    auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
-    ptr = buddy_allocator->Alloc(size);
-  }
+void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
+  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+  void* ptr = buddy_allocator->Alloc(size);
 
   if (ptr == nullptr) {
     int cur_dev = platform::GetCurrentDeviceId();
@@ -142,13 +119,31 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
 }
 
 template <>
-void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p,
-                               bool is_pinned) {
-  if (is_pinned) {
-    GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
-  } else {
-    GetGPUBuddyAllocator(place.device)->Free(p);
+void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
+  GetGPUBuddyAllocator(place.device)->Free(p);
+}
+
+template <>
+size_t Used<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place) {
+  return GetCUDAPinnedBuddyAllocator()->Used();
+}
+
+template <>
+void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
+                                       size_t size) {
+  auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
+  void* ptr = buddy_allocator->Alloc(size);
+
+  if (ptr == nullptr) {
+    LOG(WARNING) << "Cannot allocate " << size
+                 << " bytes of CUDA pinned memory";
   }
+  return ptr;
+}
+
+template <>
+void Free<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
+                                     void* p) {
+  GetCUDAPinnedBuddyAllocator()->Free(p);
 }
 #endif
@@ -165,6 +160,14 @@ size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
 #endif
 }
 
+size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
+#ifdef PADDLE_WITH_CUDA
+  return Used(cuda_pinned);
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
+}
+
 size_t memory_usage(const platform::Place& p) {
   return boost::apply_visitor(Usage(), p);
 }
diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h
index 062bfc880e..fba7372e71 100644
--- a/paddle/fluid/memory/memory.h
+++ b/paddle/fluid/memory/memory.h
@@ -57,6 +57,7 @@ size_t Used(Place place);
 struct Usage : public boost::static_visitor<size_t> {
   size_t operator()(const platform::CPUPlace& cpu) const;
   size_t operator()(const platform::CUDAPlace& gpu) const;
+  size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
 };
 
 size_t memory_usage(const platform::Place& p);
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 202394c7be..e25cfe60b1 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -118,6 +118,18 @@ struct DefaultDeviceContextType {
   using TYPE = CUDADeviceContext;
 };
 
+// Currently, CUDAPinnedDeviceContext is only used for data copying.
+// class CUDAPinnedDeviceContext : public DeviceContext {
+//  public:
+//   CUDAPinnedDeviceContext();
+//   explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place);
+//
+//   Place GetPlace() const override;
+//
+//  private:
+//   CUDAPinnedPlace place_;
+// };
+
 #endif
 
 #ifdef PADDLE_WITH_MKLDNN
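Usage sketch (illustrative, not part of the diff): with the new Usage overload, memory_usage dispatches a CUDAPinnedPlace held in the Place variant to the pinned buddy allocator's Used():

    #include "paddle/fluid/memory/memory.h"
    #include "paddle/fluid/platform/place.h"

    size_t PinnedBytesInUse() {
      paddle::platform::Place place = paddle::platform::CUDAPinnedPlace();
      // Applies boost::apply_visitor(Usage(), place) under the hood.
      return paddle::memory::memory_usage(place);
    }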
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index de8f958eb0..11ca87c211 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -40,12 +40,19 @@ const Place &get_place() { return the_default_place; }
 
 const CUDAPlace default_gpu() { return CUDAPlace(0); }
 const CPUPlace default_cpu() { return CPUPlace(); }
+const CUDAPinnedPlace default_cuda_pinned() { return CUDAPinnedPlace(); }
 
 bool is_gpu_place(const Place &p) {
   return boost::apply_visitor(IsCUDAPlace(), p);
 }
 
-bool is_cpu_place(const Place &p) { return !is_gpu_place(p); }
+bool is_cpu_place(const Place &p) {
+  return boost::apply_visitor(IsCPUPlace(), p);
+}
+
+bool is_cuda_pinned_place(const Place &p) {
+  return boost::apply_visitor(IsCUDAPinnedPlace(), p);
+}
 
 bool places_are_same_class(const Place &p1, const Place &p2) {
   return p1.which() == p2.which();
@@ -53,7 +60,7 @@ bool places_are_same_class(const Place &p1, const Place &p2) {
 
 bool is_same_place(const Place &p1, const Place &p2) {
   if (places_are_same_class(p1, p2)) {
-    if (is_cpu_place(p1)) {
+    if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) {
       return true;
     } else {
       return boost::get<CUDAPlace>(p1) == boost::get<CUDAPlace>(p2);
diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h
index 4cc8b377b8..8f3acd8df6 100644
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
@@ -45,12 +45,33 @@ struct CUDAPlace {
   int device;
 };
 
+struct CUDAPinnedPlace {
+  CUDAPinnedPlace() {}
+
+  // needed for variant equality comparison
+  inline bool operator==(const CUDAPinnedPlace &) const { return true; }
+  inline bool operator!=(const CUDAPinnedPlace &) const { return false; }
+};
+
 struct IsCUDAPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &gpu) const { return true; }
+  bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
 
-typedef boost::variant<CUDAPlace, CPUPlace> Place;
+struct IsCPUPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &cpu) const { return true; }
+  bool operator()(const CUDAPlace &) const { return false; }
+  bool operator()(const CUDAPinnedPlace &) const { return false; }
+};
+
+struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const CUDAPlace &) const { return false; }
+  bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
+};
+
+typedef boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace> Place;
 
 using PlaceList = std::vector<Place>;
 
@@ -59,9 +80,11 @@ const Place &get_place();
 
 const CUDAPlace default_gpu();
 const CPUPlace default_cpu();
+const CUDAPinnedPlace default_cuda_pinned();
 
 bool is_gpu_place(const Place &);
 bool is_cpu_place(const Place &);
+bool is_cuda_pinned_place(const Place &);
 bool places_are_same_class(const Place &, const Place &);
 bool is_same_place(const Place &, const Place &);
 
@@ -97,6 +120,11 @@ struct PlaceVisitorWrapper
     return typename Visitor::result_type();
 #endif
   }
+
+  typename Visitor::result_type operator()(
+      const CUDAPinnedPlace &cuda_pinned) const {
+    return visitor_(cuda_pinned);
+  }
 };
 
 template <typename Visitor>
-- 
GitLab
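Usage sketch (illustrative, not part of the patch): the new predicates are exhaustive over the three variant alternatives, so every place is exactly one of CPU, GPU, or CUDA-pinned:

    #include <iostream>
    #include "paddle/fluid/platform/place.h"

    int main() {
      namespace plat = paddle::platform;
      plat::Place p = plat::CUDAPinnedPlace();
      std::cout << plat::is_cuda_pinned_place(p)  // 1
                << plat::is_cpu_place(p)          // 0
                << plat::is_gpu_place(p) << "\n"; // 0
      return 0;
    }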