未验证 提交 f56967c4 编写于 作者: Z Zeng Jinle 提交者: GitHub

refine error message of gpu allocator, test=develop (#21008)

上级 bc8e600c
...@@ -105,6 +105,21 @@ bool CPUAllocator::UseGpu() const { return false; } ...@@ -105,6 +105,21 @@ bool CPUAllocator::UseGpu() const { return false; }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
// Filters CUDA out-of-memory codes out of `*status` while still enforcing
// that no other CUDA error is present.
//
// Two checks run in order: first on the code supplied by the caller, then on
// the code returned by cudaGetLastError(). In each check a
// cudaErrorMemoryAllocation value is replaced by cudaSuccess before
// PADDLE_ENFORCE_CUDA_SUCCESS is applied, so only non-OOM codes are handed to
// the enforce macro as failures.
//
// `status` is dereferenced unconditionally and must not be null. It is both
// read and overwritten.
// NOTE(review): presumably PADDLE_ENFORCE_CUDA_SUCCESS throws on any
// non-success code, in which case `*status` is cudaSuccess whenever this
// function returns normally -- confirm against the macro definition.
static void ClearCUDAOutOfMemoryError(cudaError_t* status) {
  // Maps the OOM code to success and passes every other code through as-is.
  const auto ignore_oom = [](cudaError_t code) {
    return code == cudaErrorMemoryAllocation ? cudaSuccess : code;
  };

  // Step 1: the error code handed in by the caller.
  *status = ignore_oom(*status);
  PADDLE_ENFORCE_CUDA_SUCCESS(*status);

  // Step 2: the error code reported by the CUDA runtime itself.
  *status = ignore_oom(cudaGetLastError());
  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
}
void* GPUAllocator::Alloc(size_t* index, size_t size) { void* GPUAllocator::Alloc(size_t* index, size_t size) {
// CUDA documentation doesn't explain if cudaMalloc returns nullptr // CUDA documentation doesn't explain if cudaMalloc returns nullptr
// if size is 0. We just make sure it does. // if size is 0. We just make sure it does.
...@@ -120,19 +135,19 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { ...@@ -120,19 +135,19 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
gpu_alloc_size_ += size; gpu_alloc_size_ += size;
return p; return p;
} else { } else {
if (result == cudaErrorMemoryAllocation) { ClearCUDAOutOfMemoryError(&result);
result = cudaSuccess;
} /**
PADDLE_ENFORCE_CUDA_SUCCESS(result); * NOTE(zjl): Sometimes cudaMemGetInfo would raise OOM error
* if there is very little GPU memory left. In this case, we
result = cudaGetLastError(); * should consider the available GPU memory to be 0, and throw
if (result == cudaErrorMemoryAllocation) { * exception inside this function instead of throwing exception
result = cudaSuccess; * inside cudaMemGetInfo.
} */
PADDLE_ENFORCE_CUDA_SUCCESS(result); size_t avail = 0, total = 0;
result = cudaMemGetInfo(&avail, &total);
size_t avail, total; if (result != cudaSuccess) avail = 0;
platform::GpuMemoryUsage(&avail, &total); ClearCUDAOutOfMemoryError(&result);
PADDLE_THROW_BAD_ALLOC( PADDLE_THROW_BAD_ALLOC(
"\n\nOut of memory error on GPU %d. " "\n\nOut of memory error on GPU %d. "
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册