未验证 提交 2e4a3986 编写于 作者: C chengduo 提交者: GitHub

Merge pull request #9216 from chengduoZH/feature/add_pinned_memory

Add pinned memory
......@@ -45,10 +45,11 @@ class Tensor {
friend struct EigenVector;
public:
Tensor() : offset_(0) {}
Tensor() : offset_(0), is_pinned_(false) {}
/*! Constructor with place should only be used in pybind. */
explicit Tensor(const platform::Place& place) : offset_(0) {
explicit Tensor(const platform::Place& place)
: offset_(0), is_pinned_(false) {
holder_->set_place(place);
}
......@@ -69,11 +70,12 @@ class Tensor {
* @note If not exist, then allocation.
*/
template <typename T>
inline T* mutable_data(platform::Place place);
inline T* mutable_data(platform::Place place, bool is_pinned = false);
inline void* mutable_data(platform::Place place, std::type_index type);
inline void* mutable_data(platform::Place place, std::type_index type,
bool is_pinned = false);
inline void* mutable_data(platform::Place place);
inline void* mutable_data(platform::Place place, bool is_pinned = false);
/**
* @brief Return a pointer to mutable memory block.
......@@ -84,7 +86,8 @@ class Tensor {
* @note If not exist, then allocation.
*/
template <typename T>
inline T* mutable_data(DDim dims, platform::Place place);
inline T* mutable_data(DDim dims, platform::Place place,
bool is_pinned = false);
/*! Return the dimensions of the memory block. */
inline const DDim& dims() const;
......@@ -92,6 +95,9 @@ class Tensor {
/*! Return the numel of the memory block. */
inline int64_t numel() const;
/*! Return the numel of the memory block. */
inline bool isPinned() const;
/*! Resize the dimensions of the memory block. */
inline Tensor& Resize(const DDim& dims);
......@@ -146,12 +152,14 @@ class Tensor {
template <typename Place>
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(Place place, size_t size, std::type_index type)
: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
memory::PODDeleter<uint8_t, Place>(place)),
PlaceholderImpl(Place place, size_t size, std::type_index type,
bool is_pinned = false)
: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size, is_pinned)),
memory::PODDeleter<uint8_t, Place>(place, is_pinned)),
place_(place),
size_(size),
type_(type) {
type_(type),
is_pinned_(is_pinned) {
PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
(is_cpu_place(place_) ? "CPU" : "GPU"));
}
......@@ -174,6 +182,9 @@ class Tensor {
/* the current type of memory */
std::type_index type_;
/*! use pinned memory or not. */
bool is_pinned_;
};
/*! holds the memory block if allocated. */
......@@ -208,6 +219,7 @@ class Tensor {
* PlaceHolder::ptr_ and where the tensor data really begins.
*/
size_t offset_;
bool is_pinned_;
};
inline void Tensor::switch_place(platform::Place new_place) {
......
......@@ -101,19 +101,21 @@ inline T* Tensor::data() {
}
template <typename T>
inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
inline T* Tensor::mutable_data(DDim dims, platform::Place place,
bool is_pinned) {
static_assert(std::is_pod<T>::value, "T must be POD");
Resize(dims);
return mutable_data<T>(place);
return mutable_data<T>(place, is_pinned);
}
template <typename T>
inline T* Tensor::mutable_data(platform::Place place) {
inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) {
static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
return reinterpret_cast<T*>(mutable_data(place, typeid(T), is_pinned));
}
inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
bool is_pinned) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
......@@ -127,26 +129,27 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size, type));
boost::get<platform::CPUPlace>(place), size, type, is_pinned));
} else if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
}
#else
holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(place), size, type));
boost::get<platform::CUDAPlace>(place), size, type, is_pinned));
}
#endif
offset_ = 0;
is_pinned_ = is_pinned;
}
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
inline void* Tensor::mutable_data(platform::Place place) {
inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) {
PADDLE_ENFORCE(this->holder_ != nullptr,
"Cannot invoke mutable data if current hold nothing");
return mutable_data(place, holder_->type());
return mutable_data(place, holder_->type(), is_pinned);
}
inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
......@@ -188,6 +191,8 @@ inline const DDim& Tensor::dims() const { return dims_; }
inline int64_t Tensor::numel() const { return product(dims_); }
inline bool Tensor::isPinned() const { return is_pinned_; }
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
Tensor res;
res.ShareDataWith(src);
......
......@@ -119,6 +119,50 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) {
bool GPUAllocator::UseGpu() const { return true; }
// PINNED memory allows direct DMA transfers by the GPU to and from system
// memory. It’s locked to a physical address.
void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
if (size <= 0) return nullptr;
void* p;
// NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
// of host pinned allocation. Allocates too much would reduce
// the amount of memory available to the underlying system for paging.
size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
if (size > usable) return nullptr;
// PINNED memory is visible to all CUDA contexts.
cudaError_t result = cudaMallocHost(&p, size);
if (result == cudaSuccess) {
index = 1;
fallback_alloc_size_ += size;
return p;
}
return nullptr;
}
void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
cudaError_t err;
PADDLE_ASSERT(index == 1);
PADDLE_ASSERT(fallback_alloc_size_ >= size);
fallback_alloc_size_ -= size;
err = cudaFreeHost(p);
// Purposefully allow cudaErrorCudartUnloading, because
// that is returned if you ever call cudaFreeHost after the
// driver has already shutdown. This happens only if the
// process is terminating, in which case we don't care if
// cudaFreeHost succeeds.
if (err != cudaErrorCudartUnloading) {
PADDLE_ENFORCE(err, "cudaFreeHost failed in GPUPinnedAllocator::Free.");
}
}
bool CUDAPinnedAllocator::UseGpu() const { return true; }
#endif
} // namespace detail
......
......@@ -51,6 +51,18 @@ class GPUAllocator : public SystemAllocator {
size_t gpu_alloc_size_ = 0;
size_t fallback_alloc_size_ = 0;
};
class CUDAPinnedAllocator : public SystemAllocator {
public:
virtual void* Alloc(size_t& index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t gpu_alloc_size_ =
0; // TODO(zcd): how to define the upper limit of CUDAPinnedMemory?
size_t fallback_alloc_size_ = 0;
};
#endif
} // namespace detail
......
......@@ -38,7 +38,8 @@ BuddyAllocator* GetCPUBuddyAllocator() {
}
template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
bool is_pinned) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void* p = GetCPUBuddyAllocator()->Alloc(size);
VLOG(10) << " pointer=" << p;
......@@ -46,7 +47,8 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
}
template <>
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p,
bool is_pinned) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
}
......@@ -82,15 +84,47 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
return as[gpu_id];
}
BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
static BuddyAllocator** as = NULL;
if (as == NULL) {
int gpu_num = platform::GetCUDADeviceCount();
as = new BuddyAllocator*[gpu_num];
for (int gpu = 0; gpu < gpu_num; gpu++) {
as[gpu] = nullptr;
}
}
platform::SetDeviceId(gpu_id);
if (!as[gpu_id]) {
as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator,
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
}
return as[gpu_id];
}
template <>
size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
return GetGPUBuddyAllocator(place.device)->Used();
}
template <>
void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
bool is_pinned) {
void* ptr;
if (is_pinned) {
auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
ptr = buddy_allocator->Alloc(size);
} else {
auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
auto* ptr = buddy_allocator->Alloc(size);
ptr = buddy_allocator->Alloc(size);
}
if (ptr == nullptr) {
int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device);
......@@ -108,8 +142,13 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
}
template <>
void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p,
bool is_pinned) {
if (is_pinned) {
GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
} else {
GetGPUBuddyAllocator(place.device)->Free(p);
}
}
#endif
......
......@@ -33,7 +33,7 @@ namespace memory {
* address is valid or not.
*/
template <typename Place>
void* Alloc(Place place, size_t size);
void* Alloc(Place place, size_t size, bool is_pinned = false);
/**
* \brief Free memory block in one place.
......@@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size);
*
*/
template <typename Place>
void Free(Place place, void* ptr);
void Free(Place place, void* ptr, bool is_pinned = false);
/**
* \brief Total size of used memory in one place.
......@@ -74,11 +74,13 @@ class PODDeleter {
static_assert(std::is_pod<T>::value, "T must be POD");
public:
explicit PODDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
explicit PODDeleter(Place place, bool is_pinned = false)
: place_(place), is_pinned_(is_pinned) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr), is_pinned_); }
private:
Place place_;
bool is_pinned_;
};
/**
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册