未验证 提交 a710ccc0 编写于 作者: Z Zeng Jinle 提交者: GitHub

refine error message of allocator again, test=develop (#21023)

上级 d89ca2ff
......@@ -35,14 +35,27 @@ void CUDAAllocator::FreeImpl(Allocation* allocation) {
// Allocates `size` bytes with cudaMalloc and, on success, returns a new
// Allocation wrapping the pointer, the size and platform::Place(place_).
// On failure it raises a bad-alloc error whose message reports the requested
// size, the GPU id (place_.device) and the currently available memory.
//
// NOTE(review): this span is a diff hunk rendered without +/- markers. Both an
// earlier `status`-based error path and a later `result`-based error path are
// visible below; presumably only the `result`-based one is meant to remain --
// confirm against the repository before treating this text as compilable.
Allocation* CUDAAllocator::AllocateImpl(size_t size) {
// Guard object built from place_.device; presumably makes that GPU the
// current device for the lifetime of this scope -- TODO confirm.
platform::CUDADeviceGuard guard(place_.device);
void* ptr;
// NOTE(review): `status`-based variant (looks like the pre-change code).
auto status = cudaMalloc(&ptr, size);
if (UNLIKELY(status != cudaSuccess)) {
PADDLE_ENFORCE_NE(cudaGetLastError(), cudaSuccess);
PADDLE_THROW_BAD_ALLOC("Cannot allocate %d on GPU %d, cuda status %d, %s",
size, place_.device, status,
cudaGetErrorString(status));
// NOTE(review): `result`-based variant (looks like the post-change code).
auto result = cudaMalloc(&ptr, size);
if (LIKELY(result == cudaSuccess)) {
return new Allocation(ptr, size, platform::Place(place_));
}
return new Allocation(ptr, size, platform::Place(place_));
// Anything other than an out-of-memory failure is raised here; an
// out-of-memory status is reset so execution continues to the report below.
platform::RaiseNonOutOfMemoryError(&result);
size_t avail = 0, total = 0;
result = cudaMemGetInfo(&avail, &total);
// If the memory query itself fails, report 0 bytes as available.
if (result != cudaSuccess) avail = 0;
platform::RaiseNonOutOfMemoryError(&result);
PADDLE_THROW_BAD_ALLOC(
"\n\nOut of memory error on GPU %d. "
"Cannot allocate %s memory on GPU %d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using GPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
"2. If no, please decrease the batch size of your model.\n",
place_.device, string::HumanReadableSize(size), place_.device,
string::HumanReadableSize(avail), place_.device);
}
} // namespace allocation
......
......@@ -133,9 +133,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
ASSERT_TRUE(false);
allocation.reset();
} catch (BadAlloc &ex) {
ASSERT_TRUE(std::string(ex.what()).find(
"Cannot allocate " + std::to_string(allocate_size) +
" on GPU " + std::to_string(p.device)) !=
ASSERT_TRUE(std::string(ex.what()).find("Cannot allocate") !=
std::string::npos);
}
}
......
......@@ -105,21 +105,6 @@ bool CPUAllocator::UseGpu() const { return false; }
#ifdef PADDLE_WITH_CUDA
// Treats a CUDA out-of-memory status as success and enforces that no other
// error is present. The check runs twice: first on the status supplied by the
// caller, then on the value returned by cudaGetLastError(), which is written
// back into *status.
static void ClearCUDAOutOfMemoryError(cudaError_t* status) {
  // Pass 0 examines the caller-supplied status; pass 1 examines the value
  // fetched from cudaGetLastError().
  for (int pass = 0; pass < 2; ++pass) {
    if (pass == 1) {
      *status = cudaGetLastError();
    }
    if (cudaErrorMemoryAllocation == *status) {
      *status = cudaSuccess;
    }
    PADDLE_ENFORCE_CUDA_SUCCESS(*status);
  }
}
void* GPUAllocator::Alloc(size_t* index, size_t size) {
// CUDA documentation doesn't explain if cudaMalloc returns nullptr
// if size is 0. We just make sure it does.
......@@ -135,7 +120,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
gpu_alloc_size_ += size;
return p;
} else {
ClearCUDAOutOfMemoryError(&result);
platform::RaiseNonOutOfMemoryError(&result);
/**
* NOTE(zjl): Sometimes cudaMemGetInfo would raise OOM error
......@@ -147,7 +132,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
size_t avail = 0, total = 0;
result = cudaMemGetInfo(&avail, &total);
if (result != cudaSuccess) avail = 0;
ClearCUDAOutOfMemoryError(&result);
platform::RaiseNonOutOfMemoryError(&result);
PADDLE_THROW_BAD_ALLOC(
"\n\nOut of memory error on GPU %d. "
......
......@@ -302,5 +302,20 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
"error code : %d, %s",
error_code, CudaErrorWebsite());
}
// Raises an error unless *status is cudaSuccess or cudaErrorMemoryAllocation.
// An out-of-memory status is rewritten to cudaSuccess. The same filtering is
// then applied to the value returned by cudaGetLastError(), and that filtered
// value is what *status holds on return.
void RaiseNonOutOfMemoryError(cudaError_t *status) {
  // Step 1: filter the status handed in by the caller.
  *status = (*status == cudaErrorMemoryAllocation) ? cudaSuccess : *status;
  PADDLE_ENFORCE_CUDA_SUCCESS(*status);

  // Step 2: filter whatever error the runtime currently reports.
  const cudaError_t last_error = cudaGetLastError();
  *status =
      (last_error == cudaErrorMemoryAllocation) ? cudaSuccess : last_error;
  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
}
} // namespace platform
} // namespace paddle
......@@ -95,6 +95,9 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
//! Set memory dst with value count size asynchronously
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
//! Raise an error if *status is anything other than cudaSuccess or
//! cudaErrorMemoryAllocation; an out-of-memory status is reset to cudaSuccess.
//! The same check is then applied to the result of cudaGetLastError(), which
//! is written back to *status.
void RaiseNonOutOfMemoryError(cudaError_t *status);
} // namespace platform
} // namespace paddle
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册