diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 6f878541e6de1deec1829145b1b325ecd176a034..f7a6b5ba84ca1762bd903790aa3c0346b22ed035 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -45,10 +45,11 @@ class Tensor {
   friend struct EigenVector;
 
  public:
-  Tensor() : offset_(0) {}
+  Tensor() : offset_(0), is_pinned_(false) {}
 
   /*! Constructor with place should only be used in pybind. */
-  explicit Tensor(const platform::Place& place) : offset_(0) {
+  explicit Tensor(const platform::Place& place)
+      : offset_(0), is_pinned_(false) {
     holder_->set_place(place);
   }
 
@@ -69,11 +70,12 @@ class Tensor {
    * @note If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(platform::Place place);
+  inline T* mutable_data(platform::Place place, bool is_pinned = false);
 
-  inline void* mutable_data(platform::Place place, std::type_index type);
+  inline void* mutable_data(platform::Place place, std::type_index type,
+                            bool is_pinned = false);
 
-  inline void* mutable_data(platform::Place place);
+  inline void* mutable_data(platform::Place place, bool is_pinned = false);
 
   /**
    * @brief   Return a pointer to mutable memory block.
@@ -84,7 +86,8 @@
    * @note If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(DDim dims, platform::Place place);
+  inline T* mutable_data(DDim dims, platform::Place place,
+                         bool is_pinned = false);
 
   /*! Return the dimensions of the memory block. */
   inline const DDim& dims() const;
@@ -92,6 +95,9 @@
   /*! Return the numel of the memory block. */
   inline int64_t numel() const;
 
+  /*! Return whether the memory block is pinned. */
+  inline bool isPinned() const;
+
   /*! Resize the dimensions of the memory block. */
   inline Tensor& Resize(const DDim& dims);
 
@@ -146,12 +152,14 @@
   template <typename Place>
   struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(Place place, size_t size, std::type_index type)
-        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
-               memory::PODDeleter<uint8_t, Place>(place)),
+    PlaceholderImpl(Place place, size_t size, std::type_index type,
+                    bool is_pinned = false)
+        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size, is_pinned)),
+               memory::PODDeleter<uint8_t, Place>(place, is_pinned)),
           place_(place),
           size_(size),
-          type_(type) {
+          type_(type),
+          is_pinned_(is_pinned) {
       PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
                               (is_cpu_place(place_) ? "CPU" : "GPU"));
     }
@@ -174,6 +182,9 @@ class Tensor {
 
     /* the current type of memory */
     std::type_index type_;
+
+    /*! use pinned memory or not. */
+    bool is_pinned_;
   };
 
   /*! holds the memory block if allocated. */
@@ -208,6 +219,7 @@
   *          PlaceHolder::ptr_ and where the tensor data really begins.
   */
  size_t offset_;
+  bool is_pinned_;
 };
 
 inline void Tensor::switch_place(platform::Place new_place) {
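The change above threads an is_pinned flag from the public Tensor API down to the allocator. As a rough usage sketch (not part of this patch; it assumes a CUDA build, and note that in this change the pinned path is reached through the CUDAPlace overloads), a caller could request page-locked host memory like this:

    // Hypothetical example, for illustration only.
    paddle::framework::Tensor t;
    t.Resize(paddle::framework::make_ddim({2, 3}));
    // is_pinned = true makes the holder allocate from the pinned buddy
    // allocator introduced in memory.cc below; t.isPinned() then returns true.
    float* data = t.mutable_data<float>(paddle::platform::CUDAPlace(0),
                                        /*is_pinned=*/true);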
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 7a4839044008338dda43f75b5ee6def500b78270..113814971e115fa88bd0ded34017fa26a9dd5803 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -101,19 +101,21 @@ inline T* Tensor::data() {
 }
 
 template <typename T>
-inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
+inline T* Tensor::mutable_data(DDim dims, platform::Place place,
+                               bool is_pinned) {
   static_assert(std::is_pod<T>::value, "T must be POD");
   Resize(dims);
-  return mutable_data<T>(place);
+  return mutable_data<T>(place, is_pinned);
 }
 
 template <typename T>
-inline T* Tensor::mutable_data(platform::Place place) {
+inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) {
   static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
+  return reinterpret_cast<T*>(mutable_data(place, typeid(T), is_pinned));
 }
 
-inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
+                                  bool is_pinned) {
   if (holder_ != nullptr) {
     holder_->set_type(type);
   }
@@ -127,26 +129,27 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
       holder_->size() < size + offset_) {
     if (platform::is_cpu_place(place)) {
       holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size, type));
+          boost::get<platform::CPUPlace>(place), size, type, is_pinned));
     } else if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
       PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
     }
 #else
       holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-          boost::get<platform::CUDAPlace>(place), size, type));
+          boost::get<platform::CUDAPlace>(place), size, type, is_pinned));
     }
 #endif
     offset_ = 0;
+    is_pinned_ = is_pinned;
   }
   return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                  offset_);
 }
 
-inline void* Tensor::mutable_data(platform::Place place) {
+inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) {
   PADDLE_ENFORCE(this->holder_ != nullptr,
                  "Cannot invoke mutable data if current hold nothing");
-  return mutable_data(place, holder_->type());
+  return mutable_data(place, holder_->type(), is_pinned);
 }
 
 inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
@@ -188,6 +191,8 @@ inline const DDim& Tensor::dims() const { return dims_; }
 
 inline int64_t Tensor::numel() const { return product(dims_); }
 
+inline bool Tensor::isPinned() const { return is_pinned_; }
+
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
   Tensor res;
   res.ShareDataWith(src);
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 8ac8978120ad5930cd80272189ac0a83a77b2617..71d28dcbade1bcb48d2e906c61e03236860cb7d0 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -119,6 +119,50 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) {
 
 bool GPUAllocator::UseGpu() const { return true; }
 
+// PINNED memory allows direct DMA transfers by the GPU to and from system
+// memory. It's locked to a physical address.
+void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
+  if (size <= 0) return nullptr;
+  void* p;
+  // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
+  // of host pinned allocation. Allocating too much would reduce
+  // the amount of memory available to the underlying system for paging.
+
+  size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
+
+  if (size > usable) return nullptr;
+
+  // PINNED memory is visible to all CUDA contexts.
+  cudaError_t result = cudaMallocHost(&p, size);
+  if (result == cudaSuccess) {
+    index = 1;
+    fallback_alloc_size_ += size;
+    return p;
+  }
+
+  return nullptr;
+}
+
+void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
+  cudaError_t err;
+  PADDLE_ASSERT(index == 1);
+
+  PADDLE_ASSERT(fallback_alloc_size_ >= size);
+  fallback_alloc_size_ -= size;
+  err = cudaFreeHost(p);
+
+  // Purposefully allow cudaErrorCudartUnloading, because
+  // that is returned if you ever call cudaFreeHost after the
+  // driver has already shut down. This happens only if the
+  // process is terminating, in which case we don't care if
+  // cudaFreeHost succeeds.
+  if (err != cudaErrorCudartUnloading) {
+    PADDLE_ENFORCE(err, "cudaFreeHost failed in CUDAPinnedAllocator::Free.");
+  }
+}
+
+bool CUDAPinnedAllocator::UseGpu() const { return true; }
+
 #endif
 
 }  // namespace detail
diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h
index e93c2c1e3231f7f42794dd78121072dbdb6abc41..3e024125fabb8bbff094ed4455f164dfd4cae163 100644
--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
@@ -51,6 +51,18 @@ class GPUAllocator : public SystemAllocator {
   size_t gpu_alloc_size_ = 0;
   size_t fallback_alloc_size_ = 0;
 };
+
+class CUDAPinnedAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+
+ private:
+  size_t gpu_alloc_size_ =
+      0;  // TODO(zcd): how to define the upper limit of CUDAPinnedMemory?
+  size_t fallback_alloc_size_ = 0;
+};
 #endif
 
 }  // namespace detail
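The reason the allocator pins memory with cudaMallocHost is that page-locked host buffers can serve as DMA sources/targets, so host-device copies can run asynchronously instead of going through an extra pageable staging copy. A minimal sketch of exercising the new allocator directly (illustrative only, not part of the patch; dev_ptr, bytes, and stream are assumed to exist, and the over-limit nullptr case is not handled):

    #include <cuda_runtime.h>

    #include "paddle/fluid/memory/detail/system_allocator.h"

    void CopyThroughPinned(void* dev_ptr, size_t bytes, cudaStream_t stream) {
      paddle::memory::detail::CUDAPinnedAllocator allocator;
      size_t index = 0;
      void* host_buf = allocator.Alloc(index, bytes);  // backed by cudaMallocHost
      // ... fill host_buf ...
      // Because host_buf is pinned, this copy can overlap with other work.
      cudaMemcpyAsync(dev_ptr, host_buf, bytes, cudaMemcpyHostToDevice, stream);
      cudaStreamSynchronize(stream);
      allocator.Free(host_buf, bytes, index);
    }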
diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc
index d07f89439a1ec37682f79799d5569cad2ab75818..f2d5f250bfb56fb522416e83ab4c5315a9f533f0 100644
--- a/paddle/fluid/memory/memory.cc
+++ b/paddle/fluid/memory/memory.cc
@@ -38,7 +38,8 @@ BuddyAllocator* GetCPUBuddyAllocator() {
 }
 
 template <>
-void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
+void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
+                                bool is_pinned) {
   VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
   void* p = GetCPUBuddyAllocator()->Alloc(size);
   VLOG(10) << "  pointer=" << p;
@@ -46,7 +47,8 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
 }
 
 template <>
-void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
+void Free<platform::CPUPlace>(platform::CPUPlace place, void* p,
+                              bool is_pinned) {
   VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
   GetCPUBuddyAllocator()->Free(p);
 }
@@ -82,15 +84,47 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   return as[gpu_id];
 }
 
+BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
+  static BuddyAllocator** as = NULL;
+  if (as == NULL) {
+    int gpu_num = platform::GetCUDADeviceCount();
+    as = new BuddyAllocator*[gpu_num];
+    for (int gpu = 0; gpu < gpu_num; gpu++) {
+      as[gpu] = nullptr;
+    }
+  }
+  platform::SetDeviceId(gpu_id);
+  if (!as[gpu_id]) {
+    as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator,
+                                    platform::GpuMinChunkSize(),
+                                    platform::GpuMaxChunkSize());
+    VLOG(10) << "\n\nNOTE: each GPU device use "
+             << FLAGS_fraction_of_gpu_memory_to_use * 100
+             << "% of GPU memory.\n"
+             << "You can set GFlags environment variable '"
+             << "FLAGS_fraction_of_gpu_memory_to_use"
+             << "' to change the fraction of GPU usage.\n\n";
+  }
+  return as[gpu_id];
+}
+
 template <>
 size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
   return GetGPUBuddyAllocator(place.device)->Used();
 }
 
 template <>
-void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
-  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
-  auto* ptr = buddy_allocator->Alloc(size);
+void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
+                                 bool is_pinned) {
+  void* ptr;
+  if (is_pinned) {
+    auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
+    ptr = buddy_allocator->Alloc(size);
+  } else {
+    auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+    ptr = buddy_allocator->Alloc(size);
+  }
+
   if (ptr == nullptr) {
     int cur_dev = platform::GetCurrentDeviceId();
     platform::SetDeviceId(place.device);
@@ -108,8 +142,13 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
 }
 
 template <>
-void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
-  GetGPUBuddyAllocator(place.device)->Free(p);
+void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p,
+                               bool is_pinned) {
+  if (is_pinned) {
+    GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
+  } else {
+    GetGPUBuddyAllocator(place.device)->Free(p);
+  }
 }
 #endif
diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h
index 7c5db815d6543f026ab99f7cf895a87db4e5a3d8..062bfc880e78dc5d90c567ffe5c4e521704c9ca6 100644
--- a/paddle/fluid/memory/memory.h
+++ b/paddle/fluid/memory/memory.h
@@ -33,7 +33,7 @@ namespace memory {
  * address is valid or not.
  */
 template <typename Place>
-void* Alloc(Place place, size_t size);
+void* Alloc(Place place, size_t size, bool is_pinned = false);
 
 /**
  * \brief   Free memory block in one place.
@@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size);
  *
  */
 template <typename Place>
-void Free(Place place, void* ptr);
+void Free(Place place, void* ptr, bool is_pinned = false);
 
 /**
  * \brief   Total size of used memory in one place.
@@ -74,11 +74,13 @@ class PODDeleter {
   static_assert(std::is_pod<T>::value, "T must be POD");
 
  public:
-  explicit PODDeleter(Place place) : place_(place) {}
-  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
+  explicit PODDeleter(Place place, bool is_pinned = false)
+      : place_(place), is_pinned_(is_pinned) {}
+  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr), is_pinned_); }
 
  private:
   Place place_;
+  bool is_pinned_;
 };
 
 /**
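Taken together, the flag flows from Tensor::mutable_data through memory::Alloc, and PODDeleter now remembers it so the matching Free also takes the pinned path. A rough end-to-end sketch (illustrative only, not part of the patch; assumes a CUDA build, and the size and place are arbitrary):

    #include <memory>

    #include "paddle/fluid/memory/memory.h"
    #include "paddle/fluid/platform/place.h"

    void PinnedAllocExample() {
      namespace memory = paddle::memory;
      paddle::platform::CUDAPlace place(0);
      const size_t bytes = 1024 * sizeof(float);

      // Served by GetCUDAPinnedBuddyAllocator(0) because is_pinned is true.
      auto* raw =
          static_cast<float*>(memory::Alloc(place, bytes, /*is_pinned=*/true));

      // The deleter keeps the flag, so Free() is also called with is_pinned=true.
      using Deleter = memory::PODDeleter<float, paddle::platform::CUDAPlace>;
      std::unique_ptr<float, Deleter> holder(raw, Deleter(place, true));
    }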