Unverified commit a710ccc0, authored by Zeng Jinle, committed by GitHub

refine error message of allocator again, test=develop (#21023)

Parent: d89ca2ff
@@ -35,14 +35,27 @@ void CUDAAllocator::FreeImpl(Allocation* allocation) {
 Allocation* CUDAAllocator::AllocateImpl(size_t size) {
   platform::CUDADeviceGuard guard(place_.device);
   void* ptr;
-  auto status = cudaMalloc(&ptr, size);
-  if (UNLIKELY(status != cudaSuccess)) {
-    PADDLE_ENFORCE_NE(cudaGetLastError(), cudaSuccess);
-    PADDLE_THROW_BAD_ALLOC("Cannot allocate %d on GPU %d, cuda status %d, %s",
-                           size, place_.device, status,
-                           cudaGetErrorString(status));
-  }
-  return new Allocation(ptr, size, platform::Place(place_));
+  auto result = cudaMalloc(&ptr, size);
+  if (LIKELY(result == cudaSuccess)) {
+    return new Allocation(ptr, size, platform::Place(place_));
+  }
+
+  platform::RaiseNonOutOfMemoryError(&result);
+
+  size_t avail = 0, total = 0;
+  result = cudaMemGetInfo(&avail, &total);
+  if (result != cudaSuccess) avail = 0;
+  platform::RaiseNonOutOfMemoryError(&result);
+
+  PADDLE_THROW_BAD_ALLOC(
+      "\n\nOut of memory error on GPU %d. "
+      "Cannot allocate %s memory on GPU %d, "
+      "available memory is only %s.\n\n"
+      "Please check whether there is any other process using GPU %d.\n"
+      "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
+      "2. If no, please decrease the batch size of your model.\n",
+      place_.device, string::HumanReadableSize(size), place_.device,
+      string::HumanReadableSize(avail), place_.device);
 }
 }  // namespace allocation
......
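To see the new control flow outside of Paddle's macros, here is a minimal, self-contained sketch of the same pattern in plain CUDA C++. It is an illustration only: the function name, message text, and use of std::runtime_error in place of PADDLE_THROW_BAD_ALLOC and string::HumanReadableSize are all assumptions, not Paddle's implementation.

#include <cuda_runtime.h>
#include <cstdio>
#include <stdexcept>

// Sketch only: the fast path returns immediately; the failure path consumes
// the latched CUDA error, queries free memory, and throws a readable message.
void* AllocateOnGpu(int device, size_t size) {
  cudaSetDevice(device);
  void* ptr = nullptr;
  cudaError_t result = cudaMalloc(&ptr, size);
  if (result == cudaSuccess) {
    return ptr;
  }
  cudaGetLastError();  // consume the recorded error so later calls stay clean
  size_t avail = 0, total = 0;
  if (cudaMemGetInfo(&avail, &total) != cudaSuccess) {
    avail = 0;           // the query itself can fail when memory is exhausted
    cudaGetLastError();  // consume that error as well
  }
  char msg[160];
  std::snprintf(msg, sizeof(msg),
                "Out of memory on GPU %d: requested %zu bytes, "
                "only %zu bytes currently available",
                device, size, avail);
  throw std::runtime_error(msg);
}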
@@ -133,9 +133,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
     ASSERT_TRUE(false);
     allocation.reset();
   } catch (BadAlloc &ex) {
-    ASSERT_TRUE(std::string(ex.what()).find(
-                    "Cannot allocate " + std::to_string(allocate_size) +
-                    " on GPU " + std::to_string(p.device)) !=
-                std::string::npos);
+    ASSERT_TRUE(std::string(ex.what()).find("Cannot allocate") !=
+                std::string::npos);
   }
 }
......
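The assertion is loosened because the exact byte count no longer appears verbatim in the message: sizes are now rendered through string::HumanReadableSize, so matching "Cannot allocate N on GPU D" would fail. For illustration, here is a hypothetical re-implementation of such a formatter; the output format is an assumption, not Paddle's actual code.

#include <cstdio>
#include <string>

// Hypothetical: render a byte count with a binary unit suffix,
// e.g. 1536 -> "1.5KB", so the raw number 1536 never appears in the text.
std::string HumanReadableSize(double bytes) {
  const char* units[] = {"B", "KB", "MB", "GB", "TB"};
  int i = 0;
  while (bytes >= 1024.0 && i < 4) {
    bytes /= 1024.0;
    ++i;
  }
  char buf[32];
  std::snprintf(buf, sizeof(buf), "%.1f%s", bytes, units[i]);
  return buf;
}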
@@ -105,21 +105,6 @@ bool CPUAllocator::UseGpu() const { return false; }
 #ifdef PADDLE_WITH_CUDA
-static void ClearCUDAOutOfMemoryError(cudaError_t* status) {
-  if (*status == cudaErrorMemoryAllocation) {
-    *status = cudaSuccess;
-  }
-  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
-
-  *status = cudaGetLastError();
-  if (*status == cudaErrorMemoryAllocation) {
-    *status = cudaSuccess;
-  }
-  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
-}
-
 void* GPUAllocator::Alloc(size_t* index, size_t size) {
   // CUDA documentation doesn't explain if cudaMalloc returns nullptr
   // if size is 0. We just make sure it does.
@@ -135,7 +120,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
     gpu_alloc_size_ += size;
     return p;
   } else {
-    ClearCUDAOutOfMemoryError(&result);
+    platform::RaiseNonOutOfMemoryError(&result);
 
     /**
      * NOTE(zjl): Sometimes cudaMemGetInfo would raise OOM error
@@ -147,7 +132,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
     size_t avail = 0, total = 0;
     result = cudaMemGetInfo(&avail, &total);
     if (result != cudaSuccess) avail = 0;
-    ClearCUDAOutOfMemoryError(&result);
+    platform::RaiseNonOutOfMemoryError(&result);
 
     PADDLE_THROW_BAD_ALLOC(
         "\n\nOut of memory error on GPU %d. "
......
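Note why the helper is called twice: per the NOTE(zjl) comment above, cudaMemGetInfo can itself fail with an out-of-memory status once the device is exhausted, so the error state must be scrubbed again before PADDLE_THROW_BAD_ALLOC builds the diagnostic. A minimal sketch of that guard in isolation, without Paddle's macros (assumed equivalent behavior, not the actual implementation):

#include <cuda_runtime.h>
#include <stdexcept>

// Sketch: swallow an OOM status (the caller will build its own diagnostic),
// but treat any other CUDA error as fatal.
void ScrubOrThrow(cudaError_t* status) {
  if (*status == cudaErrorMemoryAllocation) {
    *status = cudaSuccess;
  }
  if (*status != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(*status));
  }
}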
@@ -302,5 +302,20 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
                       "error code : %d, %s",
                       error_code, CudaErrorWebsite());
 }
+
+void RaiseNonOutOfMemoryError(cudaError_t *status) {
+  if (*status == cudaErrorMemoryAllocation) {
+    *status = cudaSuccess;
+  }
+  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
+
+  *status = cudaGetLastError();
+  if (*status == cudaErrorMemoryAllocation) {
+    *status = cudaSuccess;
+  }
+  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
+}
 }  // namespace platform
 }  // namespace paddle
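The second half of the helper exists because a failed CUDA runtime call is also recorded as the thread's last error: it would be reported by a later, unrelated check unless it is consumed with cudaGetLastError() here. A small standalone program demonstrating that latching behavior (assuming a CUDA-capable machine, where the absurd request fails with OOM rather than a driver error):

#include <cuda_runtime.h>
#include <cassert>
#include <cstddef>

int main() {
  void* p = nullptr;
  // An absurd request forces cudaErrorMemoryAllocation.
  cudaError_t result = cudaMalloc(&p, ~static_cast<size_t>(0));
  assert(result == cudaErrorMemoryAllocation);
  // The error is latched: cudaGetLastError() reports it once, then resets.
  assert(cudaGetLastError() == cudaErrorMemoryAllocation);
  assert(cudaGetLastError() == cudaSuccess);
  return 0;
}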
@@ -95,6 +95,9 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
 //! Set memory dst with value count size asynchronously
 void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
 
+//! Raise error if status is neither cudaSuccess nor OOM; otherwise reset status.
+void RaiseNonOutOfMemoryError(cudaError_t *status);
+
 }  // namespace platform
 }  // namespace paddle
......
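Taken together, a caller is expected to use the new declaration roughly as follows. This is a hedged sketch mirroring the call sites above: TryGpuAlloc is a made-up wrapper, and the include path is assumed to be the header shown in the hunk above.

#include <cuda_runtime.h>
#include "paddle/fluid/platform/gpu_info.h"  // path assumed

// Made-up wrapper: returns the pointer on success; returns nullptr only when
// the failure was out-of-memory, because RaiseNonOutOfMemoryError throws for
// every other CUDA error and clears an OOM status.
void* TryGpuAlloc(size_t size) {
  void* ptr = nullptr;
  cudaError_t result = cudaMalloc(&ptr, size);
  if (result == cudaSuccess) return ptr;
  paddle::platform::RaiseNonOutOfMemoryError(&result);
  return nullptr;  // reaching here implies OOM; build a diagnostic and throw
}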