Commit 18eb7730, authored by chengduoZH

add CUDAPinnedPlace

Parent f3dc3112
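Background for this commit: `CUDAPinnedPlace` models page-locked (pinned) host memory, whose main benefit is that it can serve as the source or destination of asynchronous host/device copies. The following standalone CUDA sketch is not part of the commit; it only illustrates, with standard CUDA runtime calls, why pinned memory is worth a dedicated place type.

```cpp
// Standalone sketch (not from this commit): pinned host memory enables
// asynchronous H2D copies that can overlap with other GPU work.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  const size_t n = 1 << 20;
  float* h_pinned = nullptr;
  float* d_buf = nullptr;
  cudaStream_t stream;

  cudaMallocHost(&h_pinned, n * sizeof(float));  // page-locked host buffer
  cudaMalloc(&d_buf, n * sizeof(float));
  cudaStreamCreate(&stream);

  // Asynchronous copy; overlapping is only possible because the host buffer
  // is pinned rather than ordinary pageable memory.
  cudaMemcpyAsync(d_buf, h_pinned, n * sizeof(float), cudaMemcpyHostToDevice,
                  stream);
  cudaStreamSynchronize(stream);

  cudaStreamDestroy(stream);
  cudaFree(d_buf);
  cudaFreeHost(h_pinned);
  printf("done\n");
  return 0;
}
```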
@@ -45,11 +45,10 @@ class Tensor {
   friend struct EigenVector;

  public:
-  Tensor() : offset_(0), is_pinned_(false) {}
+  Tensor() : offset_(0) {}

   /*! Constructor with place should only be used in pybind. */
-  explicit Tensor(const platform::Place& place)
-      : offset_(0), is_pinned_(false) {
+  explicit Tensor(const platform::Place& place) : offset_(0) {
     holder_->set_place(place);
   }

@@ -70,12 +69,11 @@ class Tensor {
    * @note If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(platform::Place place, bool is_pinned = false);
+  inline T* mutable_data(platform::Place place);

-  inline void* mutable_data(platform::Place place, std::type_index type,
-                            bool is_pinned = false);
+  inline void* mutable_data(platform::Place place, std::type_index type);

-  inline void* mutable_data(platform::Place place, bool is_pinned = false);
+  inline void* mutable_data(platform::Place place);

   /**
    * @brief Return a pointer to mutable memory block.

@@ -86,8 +84,7 @@ class Tensor {
    * @note If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(DDim dims, platform::Place place,
-                         bool is_pinned = false);
+  inline T* mutable_data(DDim dims, platform::Place place);

   /*! Return the dimensions of the memory block. */
   inline const DDim& dims() const;

@@ -152,14 +149,12 @@ class Tensor {
   template <typename Place>
   struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(Place place, size_t size, std::type_index type,
-                    bool is_pinned = false)
-        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size, is_pinned)),
-               memory::PODDeleter<uint8_t, Place>(place, is_pinned)),
+    PlaceholderImpl(Place place, size_t size, std::type_index type)
+        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
+               memory::PODDeleter<uint8_t, Place>(place)),
           place_(place),
           size_(size),
-          type_(type),
-          is_pinned_(is_pinned) {
+          type_(type) {
       PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
                               (is_cpu_place(place_) ? "CPU" : "GPU"));
     }

@@ -182,9 +177,6 @@ class Tensor {
     /* the current type of memory */
     std::type_index type_;
-
-    /*! use pinned memory or not. */
-    bool is_pinned_;
   };

   /*! holds the memory block if allocated. */

@@ -219,7 +211,6 @@
    * PlaceHolder::ptr_ and where the tensor data really begins.
    */
   size_t offset_;
-  bool is_pinned_;
 };

 inline void Tensor::switch_place(platform::Place new_place) {
......
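After the hunks above, `Tensor::mutable_data` no longer takes an `is_pinned` flag; the place alone decides where the buffer lives. A minimal hedged caller sketch follows (the include path and namespaces are assumed here, not taken from this commit):

```cpp
// Hedged usage sketch of the simplified Tensor API after this change.
#include "paddle/fluid/framework/tensor.h"  // assumed include path

void FillOnCPU() {
  paddle::framework::Tensor t;
  t.Resize(paddle::framework::make_ddim({2, 3}));
  // No is_pinned argument anymore: just request memory at a place.
  float* data = t.mutable_data<float>(paddle::platform::CPUPlace());
  for (int i = 0; i < 6; ++i) data[i] = static_cast<float>(i);
}
```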
@@ -101,21 +101,19 @@ inline T* Tensor::data() {
 }

 template <typename T>
-inline T* Tensor::mutable_data(DDim dims, platform::Place place,
-                               bool is_pinned) {
+inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
   static_assert(std::is_pod<T>::value, "T must be POD");
   Resize(dims);
-  return mutable_data<T>(place, is_pinned);
+  return mutable_data<T>(place);
 }

 template <typename T>
-inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) {
+inline T* Tensor::mutable_data(platform::Place place) {
   static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(place, typeid(T), is_pinned));
+  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
 }

-inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
-                                  bool is_pinned) {
+inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
   if (holder_ != nullptr) {
     holder_->set_type(type);
   }

@@ -129,27 +127,26 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
       holder_->size() < size + offset_) {
     if (platform::is_cpu_place(place)) {
       holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size, type, is_pinned));
+          boost::get<platform::CPUPlace>(place), size, type));
     } else if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
       PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
     }
 #else
       holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-          boost::get<platform::CUDAPlace>(place), size, type, is_pinned));
+          boost::get<platform::CUDAPlace>(place), size, type));
     }
 #endif
     offset_ = 0;
-    is_pinned_ = is_pinned;
   }
   return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                  offset_);
 }

-inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) {
+inline void* Tensor::mutable_data(platform::Place place) {
   PADDLE_ENFORCE(this->holder_ != nullptr,
                  "Cannot invoke mutable data if current hold nothing");
-  return mutable_data(place, holder_->type(), is_pinned);
+  return mutable_data(place, holder_->type());
 }

 inline Tensor& Tensor::ShareDataWith(const Tensor& src) {

@@ -191,8 +188,6 @@ inline const DDim& Tensor::dims() const { return dims_; }
 inline int64_t Tensor::numel() const { return product(dims_); }

-inline bool Tensor::isPinned() const { return is_pinned_; }
-
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
   Tensor res;
   res.ShareDataWith(src);
......
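The `mutable_data` implementation above reallocates only when the holder is missing or too small, so repeated calls that shrink the requested size reuse the existing block. A simplified standalone sketch of that pattern (not from the commit, toy types only):

```cpp
// Generic "reallocate only when missing or too small" pattern, simplified.
#include <cstddef>
#include <memory>

struct Buffer {
  std::unique_ptr<char[]> data;
  size_t capacity = 0;

  void* Require(size_t bytes) {
    // Reuse the current allocation if it is large enough; otherwise grow.
    if (data == nullptr || capacity < bytes) {
      data.reset(new char[bytes]);
      capacity = bytes;
    }
    return data.get();
  }
};

int main() {
  Buffer buf;
  void* p1 = buf.Require(128);  // allocates 128 bytes
  void* p2 = buf.Require(64);   // reuses the same 128-byte block
  return (p1 == p2) ? 0 : 1;
}
```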
@@ -123,20 +123,20 @@ bool GPUAllocator::UseGpu() const { return true; }
 // memory. It's locked to a physical address.
 void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
   if (size <= 0) return nullptr;
   void* p;
-  // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
+  // NOTE: here, we use CpuMaxAllocSize()/2 as the maximum memory size
   // of host pinned allocation. Allocates too much would reduce
   // the amount of memory available to the underlying system for paging.
-  size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
+  size_t usable = CpuMaxAllocSize() / 2 - cuda_pinnd_alloc_size_;

   if (size > usable) return nullptr;

   // PINNED memory is visible to all CUDA contexts.
   cudaError_t result = cudaMallocHost(&p, size);

   if (result == cudaSuccess) {
-    index = 1;
-    fallback_alloc_size_ += size;
+    index = 1;  // PINNED memory
+    cuda_pinnd_alloc_size_ += size;
     return p;
   }

@@ -147,8 +147,8 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
   cudaError_t err;
   PADDLE_ASSERT(index == 1);
-  PADDLE_ASSERT(fallback_alloc_size_ >= size);
-  fallback_alloc_size_ -= size;
+  PADDLE_ASSERT(cuda_pinnd_alloc_size_ >= size);
+  cuda_pinnd_alloc_size_ -= size;
   err = cudaFreeHost(p);

   // Purposefully allow cudaErrorCudartUnloading, because
......
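The allocator above caps total pinned allocations (in this commit at `CpuMaxAllocSize() / 2`) because locking too much host memory starves the OS of pageable memory. Below is a standalone hedged sketch of the same bookkeeping; the hard-coded 1 GiB cap is a placeholder for illustration, not the value the commit uses.

```cpp
// Sketch of capped pinned-memory bookkeeping, mirroring the allocator above.
#include <cuda_runtime.h>
#include <cstddef>

class PinnedPool {
 public:
  void* Alloc(size_t size) {
    if (size == 0 || allocated_ + size > kCapBytes) return nullptr;
    void* p = nullptr;
    // cudaMallocHost returns page-locked (pinned) host memory.
    if (cudaMallocHost(&p, size) != cudaSuccess) return nullptr;
    allocated_ += size;
    return p;
  }

  void Free(void* p, size_t size) {
    cudaFreeHost(p);
    allocated_ -= size;
  }

 private:
  static constexpr size_t kCapBytes = size_t(1) << 30;  // assumed 1 GiB cap
  size_t allocated_ = 0;
};
```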
@@ -59,9 +59,7 @@ class CUDAPinnedAllocator : public SystemAllocator {
   virtual bool UseGpu() const;

  private:
-  size_t gpu_alloc_size_ =
-      0;  // TODO(zcd): how to define the upper limit of CUDAPinnedMemory?
-  size_t fallback_alloc_size_ = 0;
+  size_t cuda_pinnd_alloc_size_ = 0;
 };

 #endif
......
@@ -38,8 +38,7 @@ BuddyAllocator* GetCPUBuddyAllocator() {
 }

 template <>
-void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
-                                bool is_pinned) {
+void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
   VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
   void* p = GetCPUBuddyAllocator()->Alloc(size);
   VLOG(10) << "  pointer=" << p;

@@ -47,8 +46,7 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
 }

 template <>
-void Free<platform::CPUPlace>(platform::CPUPlace place, void* p,
-                              bool is_pinned) {
+void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
   VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
   GetCPUBuddyAllocator()->Free(p);
 }

@@ -85,27 +83,13 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
 }

 BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
-  static BuddyAllocator** as = NULL;
+  static BuddyAllocator* as = NULL;
   if (as == NULL) {
-    int gpu_num = platform::GetCUDADeviceCount();
-    as = new BuddyAllocator*[gpu_num];
-    for (int gpu = 0; gpu < gpu_num; gpu++) {
-      as[gpu] = nullptr;
-    }
-  }
-  platform::SetDeviceId(gpu_id);
-  if (!as[gpu_id]) {
-    as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator,
-                                    platform::GpuMinChunkSize(),
-                                    platform::GpuMaxChunkSize());
-    VLOG(10) << "\n\nNOTE: each GPU device use "
-             << FLAGS_fraction_of_gpu_memory_to_use * 100
-             << "% of GPU memory.\n"
-             << "You can set GFlags environment variable '"
-             << "FLAGS_fraction_of_gpu_memory_to_use"
-             << "' to change the fraction of GPU usage.\n\n";
+    as = new BuddyAllocator(new detail::CUDAPinnedAllocator,
+                            platform::CpuMinChunkSize(),
+                            platform::CpuMaxChunkSize());
   }
-  return as[gpu_id];
+  return as;
 }

@@ -114,16 +98,9 @@ size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
 }

 template <>
-void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
-                                 bool is_pinned) {
-  void* ptr;
-  if (is_pinned) {
-    auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
-    ptr = buddy_allocator->Alloc(size);
-  } else {
-    auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
-    ptr = buddy_allocator->Alloc(size);
-  }
+void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
+  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+  void* ptr = buddy_allocator->Alloc(size);

   if (ptr == nullptr) {
     int cur_dev = platform::GetCurrentDeviceId();

@@ -142,13 +119,39 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
 }

 template <>
-void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p,
-                               bool is_pinned) {
-  if (is_pinned) {
-    GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
-  } else {
-    GetGPUBuddyAllocator(place.device)->Free(p);
-  }
+void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
+  GetGPUBuddyAllocator(place.device)->Free(p);
+}
+
+size_t Used<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place) {
+  return GetGPUBuddyAllocator(place.device)->Used();
+}
+
+template <>
+void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
+                                       size_t size) {
+  auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
+  void* ptr = buddy_allocator->Alloc(size);
+
+  if (ptr == nullptr) {
+    int cur_dev = platform::GetCurrentDeviceId();
+    platform::SetDeviceId(place.device);
+    size_t avail, total;
+    platform::GpuMemoryUsage(avail, total);
+    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
+                 << place.device << ", available " << avail << " bytes";
+    LOG(WARNING) << "total " << total;
+    LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
+    LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
+    LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
+    platform::SetDeviceId(cur_dev);
+  }
+  return ptr;
+}
+
+template <>
+void Free<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place, void* p) {
+  GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
 }
 #endif

@@ -165,6 +168,10 @@ size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
 #endif
 }

+size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
+  return Used(cuda_pinned);
+}
+
 size_t memory_usage(const platform::Place& p) {
   return boost::apply_visitor(Usage(), p);
 }
......
@@ -57,6 +57,7 @@ size_t Used(Place place);
 struct Usage : public boost::static_visitor<size_t> {
   size_t operator()(const platform::CPUPlace& cpu) const;
   size_t operator()(const platform::CUDAPlace& gpu) const;
+  size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
 };

 size_t memory_usage(const platform::Place& p);
......
@@ -118,6 +118,18 @@ struct DefaultDeviceContextType<platform::CUDAPlace> {
   using TYPE = CUDADeviceContext;
 };

+// Currently, CUDAPinnedDeviceContext is only used to data copying.
+// class CUDAPinnedDeviceContext : public DeviceContext {
+//  public:
+//   CUDAPinnedDeviceContext();
+//   explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place);
+//
+//   Place GetPlace() const override;
+//
+//  private:
+//   CUDAPinnedPlace place_;
+// };
+
 #endif

 #ifdef PADDLE_WITH_MKLDNN
......
@@ -40,12 +40,19 @@ const Place &get_place() { return the_default_place; }
 const CUDAPlace default_gpu() { return CUDAPlace(0); }
 const CPUPlace default_cpu() { return CPUPlace(); }
+const CUDAPinnedPlace default_cuda_pinned() { return CUDAPinnedPlace(); }

 bool is_gpu_place(const Place &p) {
   return boost::apply_visitor(IsCUDAPlace(), p);
 }

-bool is_cpu_place(const Place &p) { return !is_gpu_place(p); }
+bool is_cpu_place(const Place &p) {
+  return boost::apply_visitor(IsCPUPlace(), p);
+}
+
+bool is_cuda_pinned_place(const Place &p) {
+  return boost::apply_visitor(IsCUDAPinnedPlace(), p);
+}

 bool places_are_same_class(const Place &p1, const Place &p2) {
   return p1.which() == p2.which();

@@ -53,7 +60,7 @@ bool places_are_same_class(const Place &p1, const Place &p2) {
 bool is_same_place(const Place &p1, const Place &p2) {
   if (places_are_same_class(p1, p2)) {
-    if (is_cpu_place(p1)) {
+    if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) {
       return true;
     } else {
       return boost::get<CUDAPlace>(p1) == boost::get<CUDAPlace>(p2);
......
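One behavioural consequence of the place.cc hunks above: `is_cpu_place` is now a real visitor instead of `!is_gpu_place`, so a `CUDAPinnedPlace` no longer counts as a CPU place. A small hedged check sketch (include path assumed; the assertions follow directly from the visitors added in this commit):

```cpp
// Hedged sketch of the predicate changes in place.cc / place.h.
#include <cassert>
#include "paddle/fluid/platform/place.h"  // assumed include path

void CheckPlacePredicates() {
  using paddle::platform::CUDAPinnedPlace;
  using paddle::platform::Place;
  Place pinned = CUDAPinnedPlace();

  // Before this commit is_cpu_place() was defined as !is_gpu_place(), so a
  // pinned place would have reported true; the visitor-based version says no.
  assert(!paddle::platform::is_cpu_place(pinned));
  assert(paddle::platform::is_cuda_pinned_place(pinned));
  assert(!paddle::platform::is_gpu_place(pinned));
}
```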
@@ -45,12 +45,33 @@ struct CUDAPlace {
   int device;
 };

+struct CUDAPinnedPlace {
+  CUDAPinnedPlace() {}
+
+  // needed for variant equality comparison
+  inline bool operator==(const CUDAPinnedPlace &) const { return true; }
+  inline bool operator!=(const CUDAPinnedPlace &) const { return false; }
+};
+
 struct IsCUDAPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &gpu) const { return true; }
+  bool operator()(const CUDAPinnedPlace &) const { return false; }
 };

-typedef boost::variant<CUDAPlace, CPUPlace> Place;
+struct IsCPUPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &cpu) const { return true; }
+  bool operator()(const CUDAPlace &) const { return false; }
+  bool operator()(const CUDAPinnedPlace &) const { return false; }
+};
+
+struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const CUDAPlace &) const { return false; }
+  bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
+};
+
+typedef boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace> Place;

 using PlaceList = std::vector<Place>;

@@ -59,9 +80,11 @@ const Place &get_place();
 const CUDAPlace default_gpu();
 const CPUPlace default_cpu();
+const CUDAPinnedPlace default_cuda_pinned();

 bool is_gpu_place(const Place &);
 bool is_cpu_place(const Place &);
+bool is_cuda_pinned_place(const Place &);
 bool places_are_same_class(const Place &, const Place &);
 bool is_same_place(const Place &, const Place &);

@@ -97,6 +120,11 @@ struct PlaceVisitorWrapper
     return typename Visitor::result_type();
 #endif
   }
+
+  typename Visitor::result_type operator()(
+      const CUDAPinnedPlace &cuda_pinned) const {
+    return visitor_(cuda_pinned);
+  }
 };

 template <typename Visitor>
......
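The place.h hunks add a third alternative to the `Place` variant, which is why every `boost::static_visitor` in the tree (IsCUDAPlace, IsCPUPlace, Usage, PlaceVisitorWrapper, ...) gains an overload for `CUDAPinnedPlace`. A self-contained sketch with toy types, not Paddle's, showing the mechanism:

```cpp
// Toy illustration: once the variant grows a third alternative, every visitor
// needs an overload for it or boost::apply_visitor fails to compile.
#include <boost/variant.hpp>
#include <iostream>
#include <string>

struct CpuSpot {};
struct GpuSpot { int device; };
struct PinnedSpot {};

using Spot = boost::variant<GpuSpot, CpuSpot, PinnedSpot>;

struct NameOf : public boost::static_visitor<std::string> {
  std::string operator()(const CpuSpot&) const { return "cpu"; }
  std::string operator()(const GpuSpot& g) const {
    return "gpu:" + std::to_string(g.device);
  }
  // Omitting this overload would be a compile-time error at apply_visitor.
  std::string operator()(const PinnedSpot&) const { return "cuda_pinned"; }
};

int main() {
  Spot s = PinnedSpot{};
  std::cout << boost::apply_visitor(NameOf(), s) << "\n";  // prints cuda_pinned
  return 0;
}
```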