Commit 236b7dd2 authored by chengduoZH

add pinned memory

Parent 484cff6e
@@ -119,6 +119,47 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) {
 
 bool GPUAllocator::UseGpu() const { return true; }
 
+void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
+  if (size == 0) return nullptr;
+  void* p;
+  // NOTE: here, we use GpuMaxAllocSize() as the upper bound on pinned
+  // host allocation. Allocating too much page-locked memory would reduce
+  // the amount of memory available to the underlying system for paging.
+  size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
+  if (size > usable) return nullptr;
+
+  cudaError_t result = cudaMallocHost(&p, size);
+  if (result == cudaSuccess) {
+    index = 1;
+    fallback_alloc_size_ += size;
+    return p;
+  }
+
+  return nullptr;
+}
+
+void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
+  cudaError_t err;
+  PADDLE_ASSERT(index == 1);
+  PADDLE_ASSERT(fallback_alloc_size_ >= size);
+  fallback_alloc_size_ -= size;
+  err = cudaFreeHost(p);
+
+  // Purposefully allow cudaErrorCudartUnloading, because that is returned
+  // if you ever call cudaFreeHost after the driver has already shut down.
+  // This happens only if the process is terminating, in which case we
+  // don't care whether cudaFreeHost succeeds.
+  if (err != cudaErrorCudartUnloading) {
+    PADDLE_ENFORCE(err, "cudaFreeHost failed in CUDAPinnedAllocator::Free.");
+  }
+}
+
+bool CUDAPinnedAllocator::UseGpu() const { return true; }
+
 #endif
 
 }  // namespace detail
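For context: pinned (page-locked) host memory is what makes genuinely asynchronous host-to-device transfers possible, since cudaMemcpyAsync from pageable memory silently falls back to a staged, effectively synchronous copy. A minimal standalone CUDA sketch of the difference, independent of the allocator above (buffer size and the bare-bones error handling are illustrative):

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Times a single host-to-device copy from the given host buffer.
static float TimeH2DCopy(void* dst, const void* src, size_t bytes,
                         cudaStream_t stream) {
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, stream);
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream);
  cudaEventRecord(stop, stream);
  cudaEventSynchronize(stop);
  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms;
}

int main() {
  const size_t bytes = 64 << 20;  // 64 MiB, illustrative
  void* dev = nullptr;
  cudaMalloc(&dev, bytes);
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Pageable host memory: the "async" copy is staged through an internal
  // pinned buffer and behaves synchronously.
  void* pageable = malloc(bytes);
  printf("pageable: %.2f ms\n", TimeH2DCopy(dev, pageable, bytes, stream));
  free(pageable);

  // Pinned host memory (what CUDAPinnedAllocator hands out): the copy can
  // run by DMA and overlap with kernels on other streams.
  void* pinned = nullptr;
  cudaMallocHost(&pinned, bytes);
  printf("pinned:   %.2f ms\n", TimeH2DCopy(dev, pinned, bytes, stream));
  cudaFreeHost(pinned);

  cudaStreamDestroy(stream);
  cudaFree(dev);
  return 0;
}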
@@ -51,6 +51,18 @@ class GPUAllocator : public SystemAllocator {
   size_t gpu_alloc_size_ = 0;
   size_t fallback_alloc_size_ = 0;
 };
+
+class CUDAPinnedAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+
+ private:
+  // TODO(zcd): how to define the upper limit of CUDAPinnedMemory?
+  size_t gpu_alloc_size_ = 0;
+  size_t fallback_alloc_size_ = 0;
+};
+
 #endif
 
 }  // namespace detail
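A hypothetical caller-side sketch of the interface declared above, to make the out-parameter protocol concrete: Alloc() reports which allocation path was used through index (always 1 for the pinned path, mirroring GPUAllocator's cudaMallocHost fallback), and Free() must be handed back the same index and size. The include path below is an assumption, not taken from this diff:

#include "paddle/fluid/memory/detail/system_allocator.h"  // path assumed

void PinnedRoundTrip() {
  paddle::memory::detail::CUDAPinnedAllocator allocator;
  size_t index = 0;
  const size_t bytes = 1 << 20;  // 1 MiB, illustrative
  void* p = allocator.Alloc(index, bytes);
  if (p != nullptr) {
    // index is now 1; Free() asserts that the same value comes back.
    allocator.Free(p, bytes, index);
  }
}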
@@ -38,7 +38,8 @@ BuddyAllocator* GetCPUBuddyAllocator() {
 }
 
 template <>
-void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
+void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
+                                bool use_pinned) {
   VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
   void* p = GetCPUBuddyAllocator()->Alloc(size);
   VLOG(10) << "  pointer=" << p;
@@ -46,7 +47,8 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
 }
 
 template <>
-void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
+void Free<platform::CPUPlace>(platform::CPUPlace place, void* p,
+                              bool use_pinned) {
   VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
   GetCPUBuddyAllocator()->Free(p);
 }
@@ -82,15 +84,47 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   return as[gpu_id];
 }
 
+BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
+  static BuddyAllocator** as = NULL;
+  if (as == NULL) {
+    int gpu_num = platform::GetCUDADeviceCount();
+    as = new BuddyAllocator*[gpu_num];
+    for (int gpu = 0; gpu < gpu_num; gpu++) {
+      as[gpu] = nullptr;
+    }
+  }
+  platform::SetDeviceId(gpu_id);
+  if (!as[gpu_id]) {
+    as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator,
+                                    platform::GpuMinChunkSize(),
+                                    platform::GpuMaxChunkSize());
+    VLOG(10) << "\n\nNOTE: each GPU device uses "
+             << FLAGS_fraction_of_gpu_memory_to_use * 100
+             << "% of GPU memory.\n"
+             << "You can set GFlags environment variable '"
+             << "FLAGS_fraction_of_gpu_memory_to_use"
+             << "' to change the fraction of GPU usage.\n\n";
+  }
+  return as[gpu_id];
+}
+
 template <>
 size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
   return GetGPUBuddyAllocator(place.device)->Used();
 }
 
 template <>
-void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
-  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
-  auto* ptr = buddy_allocator->Alloc(size);
+void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
+                                 bool use_pinned) {
+  void* ptr;
+  if (use_pinned) {
+    auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
+    ptr = buddy_allocator->Alloc(size);
+  } else {
+    auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+    ptr = buddy_allocator->Alloc(size);
+  }
+
   if (ptr == nullptr) {
     int cur_dev = platform::GetCurrentDeviceId();
     platform::SetDeviceId(place.device);
@@ -108,8 +142,13 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
 }
 
 template <>
-void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
-  GetGPUBuddyAllocator(place.device)->Free(p);
+void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p,
+                               bool use_pinned) {
+  if (use_pinned) {
+    GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
+  } else {
+    GetGPUBuddyAllocator(place.device)->Free(p);
+  }
 }
 
 #endif
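Taken together, the memory.cc changes route one public entry point to two pools. A hedged sketch of the new call sites (include paths and the device id are assumptions, not part of this diff); the key contract is that the use_pinned flag given to Alloc() must also be given to Free(), so the block returns to the pool it came from:

#include "paddle/fluid/memory/memory.h"   // path assumed
#include "paddle/fluid/platform/place.h"  // path assumed

void AllocBothKinds() {
  paddle::platform::CUDAPlace place(0);  // device 0, illustrative
  const size_t bytes = 4 << 20;          // 4 MiB, illustrative

  // Device memory, the default: served by GetGPUBuddyAllocator().
  void* dev = paddle::memory::Alloc(place, bytes);
  paddle::memory::Free(place, dev);

  // Pinned host memory: served by GetCUDAPinnedBuddyAllocator(). The same
  // flag must go to Free(), or the block is handed to the wrong pool.
  void* pinned = paddle::memory::Alloc(place, bytes, /*use_pinned=*/true);
  paddle::memory::Free(place, pinned, /*use_pinned=*/true);
}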
@@ -33,7 +33,7 @@ namespace memory {
  * address is valid or not.
  */
 template <typename Place>
-void* Alloc(Place place, size_t size);
+void* Alloc(Place place, size_t size, bool use_pinned = false);
 
 /**
  * \brief Free memory block in one place.
@@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size);
  *
  */
 template <typename Place>
-void Free(Place place, void* ptr);
+void Free(Place place, void* ptr, bool use_pinned = false);
 
 /**
  * \brief Total size of used memory in one place.
@@ -74,11 +74,15 @@ class PODDeleter {
   static_assert(std::is_pod<T>::value, "T must be POD");
 
  public:
-  explicit PODDeleter(Place place) : place_(place) {}
-  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
+  explicit PODDeleter(Place place, bool use_pinned = false)
+      : place_(place), use_pinned_(use_pinned) {}
+
+  void operator()(T* ptr) {
+    Free(place_, static_cast<void*>(ptr), use_pinned_);
+  }
 
  private:
   Place place_;
+  bool use_pinned_;
 };
 
 /**
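Storing use_pinned_ in the deleter means a smart pointer can release a pinned block correctly without its owner carrying the flag around. A hedged usage sketch (include path and device id assumed; float satisfies the is_pod static_assert):

#include <memory>
#include "paddle/fluid/memory/memory.h"  // path assumed

void OwnPinnedBuffer() {
  using Place = paddle::platform::CUDAPlace;
  using Deleter = paddle::memory::PODDeleter<float, Place>;

  Place place(0);  // illustrative device id
  float* raw = static_cast<float*>(
      paddle::memory::Alloc(place, 1024 * sizeof(float), /*use_pinned=*/true));

  // The deleter remembers both the place and the pinned flag, so the
  // eventual Free(place_, ptr, use_pinned_) releases into the pinned pool.
  std::unique_ptr<float, Deleter> buf(raw, Deleter(place, /*use_pinned=*/true));
}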