From f56967c48388fe4a365c3b63be09694008b46232 Mon Sep 17 00:00:00 2001
From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com>
Date: Tue, 5 Nov 2019 14:02:05 +0800
Subject: [PATCH] refine error message of gpu allocator, test=develop (#21008)

---
 .../fluid/memory/detail/system_allocator.cc | 41 +++++++++++++------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 0b14fc6a3f..001d5f4d6a 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -105,6 +105,21 @@ bool CPUAllocator::UseGpu() const { return false; }
 
 #ifdef PADDLE_WITH_CUDA
 
+static void ClearCUDAOutOfMemoryError(cudaError_t* status) {
+  if (*status == cudaErrorMemoryAllocation) {
+    *status = cudaSuccess;
+  }
+
+  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
+
+  *status = cudaGetLastError();
+  if (*status == cudaErrorMemoryAllocation) {
+    *status = cudaSuccess;
+  }
+
+  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
+}
+
 void* GPUAllocator::Alloc(size_t* index, size_t size) {
   // CUDA documentation doesn't explain if cudaMalloc returns nullptr
   // if size is 0. We just make sure it does.
@@ -120,19 +135,19 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
     gpu_alloc_size_ += size;
     return p;
   } else {
-    if (result == cudaErrorMemoryAllocation) {
-      result = cudaSuccess;
-    }
-    PADDLE_ENFORCE_CUDA_SUCCESS(result);
-
-    result = cudaGetLastError();
-    if (result == cudaErrorMemoryAllocation) {
-      result = cudaSuccess;
-    }
-    PADDLE_ENFORCE_CUDA_SUCCESS(result);
-
-    size_t avail, total;
-    platform::GpuMemoryUsage(&avail, &total);
+    ClearCUDAOutOfMemoryError(&result);
+
+    /**
+     * NOTE(zjl): Sometimes cudaMemGetInfo would raise OOM error
+     * if there is very little GPU memory left. In this case, we
+     * should consider the available GPU memory to be 0, and throw
+     * exception inside this function instead of throwing exception
+     * inside cudaMemGetInfo.
+     */
+    size_t avail = 0, total = 0;
+    result = cudaMemGetInfo(&avail, &total);
+    if (result != cudaSuccess) avail = 0;
+    ClearCUDAOutOfMemoryError(&result);
 
     PADDLE_THROW_BAD_ALLOC(
         "\n\nOut of memory error on GPU %d. "
-- 
GitLab