diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 0b14fc6a3fc4a0e4a58b3fac1aafe8ee5c65e8dc..001d5f4d6ae5fbf2b7947ce04bf0785c26498487 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -105,6 +105,21 @@ bool CPUAllocator::UseGpu() const { return false; }
 
 #ifdef PADDLE_WITH_CUDA
 
+static void ClearCUDAOutOfMemoryError(cudaError_t* status) {
+  if (*status == cudaErrorMemoryAllocation) {
+    *status = cudaSuccess;
+  }
+
+  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
+
+  *status = cudaGetLastError();
+  if (*status == cudaErrorMemoryAllocation) {
+    *status = cudaSuccess;
+  }
+
+  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
+}
+
 void* GPUAllocator::Alloc(size_t* index, size_t size) {
   // CUDA documentation doesn't explain if cudaMalloc returns nullptr
   // if size is 0. We just make sure it does.
@@ -120,19 +135,19 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
     gpu_alloc_size_ += size;
     return p;
   } else {
-    if (result == cudaErrorMemoryAllocation) {
-      result = cudaSuccess;
-    }
-    PADDLE_ENFORCE_CUDA_SUCCESS(result);
-
-    result = cudaGetLastError();
-    if (result == cudaErrorMemoryAllocation) {
-      result = cudaSuccess;
-    }
-    PADDLE_ENFORCE_CUDA_SUCCESS(result);
-
-    size_t avail, total;
-    platform::GpuMemoryUsage(&avail, &total);
+    ClearCUDAOutOfMemoryError(&result);
+
+    /**
+     * NOTE(zjl): Sometimes cudaMemGetInfo would raise OOM error
+     * if there is very little GPU memory left. In this case, we
+     * should consider the available GPU memory to be 0, and throw
+     * exception inside this function instead of throwing exception
+     * inside cudaMemGetInfo.
+     */
+    size_t avail = 0, total = 0;
+    result = cudaMemGetInfo(&avail, &total);
+    if (result != cudaSuccess) avail = 0;
+    ClearCUDAOutOfMemoryError(&result);
     PADDLE_THROW_BAD_ALLOC(
         "\n\nOut of memory error on GPU %d. "
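
For context, the pattern this diff factors into ClearCUDAOutOfMemoryError can be reproduced with the plain CUDA runtime API. Below is a minimal standalone sketch, not the Paddle implementation: fprintf/abort stands in for PADDLE_ENFORCE_CUDA_SUCCESS and PADDLE_THROW_BAD_ALLOC, and the helper names ClearOutOfMemoryError and TryGpuAlloc are illustrative only. The point it demonstrates is the same one the NOTE(zjl) comment makes: a failed cudaMalloc records cudaErrorMemoryAllocation as the current thread's last error, cudaGetLastError() must be called to read and reset that record, and cudaMemGetInfo() may itself fail when memory is nearly exhausted, in which case the available amount is treated as 0.

// Sketch of the OOM-clearing pattern (illustrative names, plain CUDA runtime API).
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Swallow an out-of-memory status, but treat any other CUDA error as fatal.
// Calling cudaGetLastError() afterwards both returns and resets the error
// recorded for the current thread, so later runtime calls start clean.
static void ClearOutOfMemoryError(cudaError_t* status) {
  if (*status == cudaErrorMemoryAllocation) {
    *status = cudaSuccess;
  }
  if (*status != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(*status));
    abort();
  }
  *status = cudaGetLastError();
  if (*status == cudaErrorMemoryAllocation) {
    *status = cudaSuccess;
  }
  if (*status != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(*status));
    abort();
  }
}

// Attempt a device allocation; on OOM, report how much memory was free.
void* TryGpuAlloc(size_t size) {
  void* p = nullptr;
  cudaError_t result = cudaMalloc(&p, size);
  if (result == cudaSuccess) {
    return p;
  }

  ClearOutOfMemoryError(&result);

  // cudaMemGetInfo may itself fail when very little memory is left; in that
  // case report the available amount as 0 instead of propagating the error.
  size_t avail = 0, total = 0;
  result = cudaMemGetInfo(&avail, &total);
  if (result != cudaSuccess) avail = 0;
  ClearOutOfMemoryError(&result);

  fprintf(stderr,
          "Out of memory on GPU: requested %zu bytes, %zu of %zu bytes available\n",
          size, avail, total);
  return nullptr;
}

The error is cleared twice because the CUDA runtime keeps the latched per-thread error independently of the return value of each call: inspecting only the local status would leave the latched cudaErrorMemoryAllocation in place, where it could surface from an unrelated runtime call later. This is also why the diff replaces the direct platform::GpuMemoryUsage call with a checked cudaMemGetInfo followed by a second ClearCUDAOutOfMemoryError, so the bad-alloc exception is thrown from Alloc itself rather than from inside the memory query.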