提交 18eb7730 编写于 作者: C chengduoZH

add CUDAPinnedPlace

上级 f3dc3112
......@@ -45,11 +45,10 @@ class Tensor {
friend struct EigenVector;
public:
Tensor() : offset_(0), is_pinned_(false) {}
Tensor() : offset_(0) {}
/*! Constructor with place should only be used in pybind. */
explicit Tensor(const platform::Place& place)
: offset_(0), is_pinned_(false) {
explicit Tensor(const platform::Place& place) : offset_(0) {
holder_->set_place(place);
}
......@@ -70,12 +69,11 @@ class Tensor {
* @note If not exist, then allocation.
*/
template <typename T>
inline T* mutable_data(platform::Place place, bool is_pinned = false);
inline T* mutable_data(platform::Place place);
inline void* mutable_data(platform::Place place, std::type_index type,
bool is_pinned = false);
inline void* mutable_data(platform::Place place, std::type_index type);
inline void* mutable_data(platform::Place place, bool is_pinned = false);
inline void* mutable_data(platform::Place place);
/**
* @brief Return a pointer to mutable memory block.
......@@ -86,8 +84,7 @@ class Tensor {
* @note If not exist, then allocation.
*/
template <typename T>
inline T* mutable_data(DDim dims, platform::Place place,
bool is_pinned = false);
inline T* mutable_data(DDim dims, platform::Place place);
/*! Return the dimensions of the memory block. */
inline const DDim& dims() const;
......@@ -152,14 +149,12 @@ class Tensor {
template <typename Place>
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(Place place, size_t size, std::type_index type,
bool is_pinned = false)
: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size, is_pinned)),
memory::PODDeleter<uint8_t, Place>(place, is_pinned)),
PlaceholderImpl(Place place, size_t size, std::type_index type)
: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
memory::PODDeleter<uint8_t, Place>(place)),
place_(place),
size_(size),
type_(type),
is_pinned_(is_pinned) {
type_(type) {
PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
(is_cpu_place(place_) ? "CPU" : "GPU"));
}
......@@ -182,9 +177,6 @@ class Tensor {
/* the current type of memory */
std::type_index type_;
/*! use pinned memory or not. */
bool is_pinned_;
};
/*! holds the memory block if allocated. */
......@@ -219,7 +211,6 @@ class Tensor {
* PlaceHolder::ptr_ and where the tensor data really begins.
*/
size_t offset_;
bool is_pinned_;
};
inline void Tensor::switch_place(platform::Place new_place) {
......
......@@ -101,21 +101,19 @@ inline T* Tensor::data() {
}
template <typename T>
inline T* Tensor::mutable_data(DDim dims, platform::Place place,
bool is_pinned) {
inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
static_assert(std::is_pod<T>::value, "T must be POD");
Resize(dims);
return mutable_data<T>(place, is_pinned);
return mutable_data<T>(place);
}
template <typename T>
inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) {
inline T* Tensor::mutable_data(platform::Place place) {
static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T*>(mutable_data(place, typeid(T), is_pinned));
return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
}
inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
bool is_pinned) {
inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
......@@ -129,27 +127,26 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size, type, is_pinned));
boost::get<platform::CPUPlace>(place), size, type));
} else if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
}
#else
holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(place), size, type, is_pinned));
boost::get<platform::CUDAPlace>(place), size, type));
}
#endif
offset_ = 0;
is_pinned_ = is_pinned;
}
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) {
inline void* Tensor::mutable_data(platform::Place place) {
PADDLE_ENFORCE(this->holder_ != nullptr,
"Cannot invoke mutable data if current hold nothing");
return mutable_data(place, holder_->type(), is_pinned);
return mutable_data(place, holder_->type());
}
inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
......@@ -191,8 +188,6 @@ inline const DDim& Tensor::dims() const { return dims_; }
inline int64_t Tensor::numel() const { return product(dims_); }
inline bool Tensor::isPinned() const { return is_pinned_; }
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
Tensor res;
res.ShareDataWith(src);
......
......@@ -123,20 +123,20 @@ bool GPUAllocator::UseGpu() const { return true; }
// memory. It’s locked to a physical address.
void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
if (size <= 0) return nullptr;
void* p;
// NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
// NOTE: here, we use CpuMaxAllocSize()/2 as the maximum memory size
// of host pinned allocation. Allocates too much would reduce
// the amount of memory available to the underlying system for paging.
size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
size_t usable = CpuMaxAllocSize() / 2 - cuda_pinnd_alloc_size_;
if (size > usable) return nullptr;
// PINNED memory is visible to all CUDA contexts.
cudaError_t result = cudaMallocHost(&p, size);
if (result == cudaSuccess) {
index = 1;
fallback_alloc_size_ += size;
index = 1; // PINNED memory
cuda_pinnd_alloc_size_ += size;
return p;
}
......@@ -147,8 +147,8 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
cudaError_t err;
PADDLE_ASSERT(index == 1);
PADDLE_ASSERT(fallback_alloc_size_ >= size);
fallback_alloc_size_ -= size;
PADDLE_ASSERT(cuda_pinnd_alloc_size_ >= size);
cuda_pinnd_alloc_size_ -= size;
err = cudaFreeHost(p);
// Purposefully allow cudaErrorCudartUnloading, because
......
......@@ -59,9 +59,7 @@ class CUDAPinnedAllocator : public SystemAllocator {
virtual bool UseGpu() const;
private:
size_t gpu_alloc_size_ =
0; // TODO(zcd): how to define the upper limit of CUDAPinnedMemory?
size_t fallback_alloc_size_ = 0;
size_t cuda_pinnd_alloc_size_ = 0;
};
#endif
......
......@@ -38,8 +38,7 @@ BuddyAllocator* GetCPUBuddyAllocator() {
}
template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
bool is_pinned) {
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void* p = GetCPUBuddyAllocator()->Alloc(size);
VLOG(10) << " pointer=" << p;
......@@ -47,8 +46,7 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
}
template <>
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p,
bool is_pinned) {
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
}
......@@ -85,27 +83,13 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
}
BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
static BuddyAllocator** as = NULL;
static BuddyAllocator* as = NULL;
if (as == NULL) {
int gpu_num = platform::GetCUDADeviceCount();
as = new BuddyAllocator*[gpu_num];
for (int gpu = 0; gpu < gpu_num; gpu++) {
as[gpu] = nullptr;
}
}
platform::SetDeviceId(gpu_id);
if (!as[gpu_id]) {
as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator,
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
as = new BuddyAllocator(new detail::CUDAPinnedAllocator,
platform::CpuMinChunkSize(),
platform::CpuMaxChunkSize());
}
return as[gpu_id];
return as;
}
template <>
......@@ -114,16 +98,9 @@ size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
}
template <>
void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
bool is_pinned) {
void* ptr;
if (is_pinned) {
auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
ptr = buddy_allocator->Alloc(size);
} else {
auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
ptr = buddy_allocator->Alloc(size);
}
void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
void* ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
int cur_dev = platform::GetCurrentDeviceId();
......@@ -142,13 +119,39 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
}
template <>
void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p,
bool is_pinned) {
if (is_pinned) {
GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
} else {
GetGPUBuddyAllocator(place.device)->Free(p);
void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
GetGPUBuddyAllocator(place.device)->Free(p);
}
size_t Used<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place) {
return GetGPUBuddyAllocator(place.device)->Used();
}
template <>
void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
size_t size) {
auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
void* ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device);
size_t avail, total;
platform::GpuMemoryUsage(avail, total);
LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
<< place.device << ", available " << avail << " bytes";
LOG(WARNING) << "total " << total;
LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
platform::SetDeviceId(cur_dev);
}
return ptr;
}
template <>
void Free<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place, void* p) {
GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
}
#endif
......@@ -165,6 +168,10 @@ size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
#endif
}
size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
return Used(cuda_pinned);
}
size_t memory_usage(const platform::Place& p) {
return boost::apply_visitor(Usage(), p);
}
......
......@@ -57,6 +57,7 @@ size_t Used(Place place);
struct Usage : public boost::static_visitor<size_t> {
size_t operator()(const platform::CPUPlace& cpu) const;
size_t operator()(const platform::CUDAPlace& gpu) const;
size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
};
size_t memory_usage(const platform::Place& p);
......
......@@ -118,6 +118,18 @@ struct DefaultDeviceContextType<platform::CUDAPlace> {
using TYPE = CUDADeviceContext;
};
// Currently, CUDAPinnedDeviceContext is only used to data copying.
// class CUDAPinnedDeviceContext : public DeviceContext {
// public:
// CUDAPinnedDeviceContext();
// explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place);
//
// Place GetPlace() const override;
//
// private:
// CUDAPinnedPlace place_;
//};
#endif
#ifdef PADDLE_WITH_MKLDNN
......
......@@ -40,12 +40,19 @@ const Place &get_place() { return the_default_place; }
const CUDAPlace default_gpu() { return CUDAPlace(0); }
const CPUPlace default_cpu() { return CPUPlace(); }
const CUDAPinnedPlace default_cuda_pinned() { return CUDAPinnedPlace(); }
bool is_gpu_place(const Place &p) {
return boost::apply_visitor(IsCUDAPlace(), p);
}
bool is_cpu_place(const Place &p) { return !is_gpu_place(p); }
bool is_cpu_place(const Place &p) {
return boost::apply_visitor(IsCPUPlace(), p);
}
bool is_cuda_pinned_place(const Place &p) {
return boost::apply_visitor(IsCUDAPinnedPlace(), p);
}
bool places_are_same_class(const Place &p1, const Place &p2) {
return p1.which() == p2.which();
......@@ -53,7 +60,7 @@ bool places_are_same_class(const Place &p1, const Place &p2) {
bool is_same_place(const Place &p1, const Place &p2) {
if (places_are_same_class(p1, p2)) {
if (is_cpu_place(p1)) {
if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) {
return true;
} else {
return boost::get<CUDAPlace>(p1) == boost::get<CUDAPlace>(p2);
......
......@@ -45,12 +45,33 @@ struct CUDAPlace {
int device;
};
struct CUDAPinnedPlace {
CUDAPinnedPlace() {}
// needed for variant equality comparison
inline bool operator==(const CUDAPinnedPlace &) const { return true; }
inline bool operator!=(const CUDAPinnedPlace &) const { return false; }
};
struct IsCUDAPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const CUDAPlace &gpu) const { return true; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
typedef boost::variant<CUDAPlace, CPUPlace> Place;
struct IsCPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &cpu) const { return true; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
};
typedef boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace> Place;
using PlaceList = std::vector<Place>;
......@@ -59,9 +80,11 @@ const Place &get_place();
const CUDAPlace default_gpu();
const CPUPlace default_cpu();
const CUDAPinnedPlace default_cuda_pinned();
bool is_gpu_place(const Place &);
bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &);
bool places_are_same_class(const Place &, const Place &);
bool is_same_place(const Place &, const Place &);
......@@ -97,6 +120,11 @@ struct PlaceVisitorWrapper
return typename Visitor::result_type();
#endif
}
typename Visitor::result_type operator()(
const CUDAPinnedPlace &cuda_pinned) const {
return visitor_(cuda_pinned);
}
};
template <typename Visitor>
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册