From f56967c48388fe4a365c3b63be09694008b46232 Mon Sep 17 00:00:00 2001
From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com>
Date: Tue, 5 Nov 2019 14:02:05 +0800
Subject: [PATCH] refine error message of gpu allocator, test=develop (#21008)

---
 .../fluid/memory/detail/system_allocator.cc   | 41 +++++++++++++------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 0b14fc6a3f..001d5f4d6a 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -105,6 +105,21 @@ bool CPUAllocator::UseGpu() const { return false; }
 
 #ifdef PADDLE_WITH_CUDA
 
+static void ClearCUDAOutOfMemoryError(cudaError_t* status) {
+  if (*status == cudaErrorMemoryAllocation) {
+    *status = cudaSuccess;  // OOM is tolerated here; reset the status
+  }
+  // Non-OOM errors are passed to the enforce macro (presumably raises).
+  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
+  // Apply the same OOM filter to the status from cudaGetLastError().
+  *status = cudaGetLastError();
+  if (*status == cudaErrorMemoryAllocation) {
+    *status = cudaSuccess;
+  }
+  // A remaining OOM was reset above, so only other errors are enforced.
+  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
+}
+
 void* GPUAllocator::Alloc(size_t* index, size_t size) {
   // CUDA documentation doesn't explain if cudaMalloc returns nullptr
   // if size is 0.  We just make sure it does.
@@ -120,19 +135,19 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
     gpu_alloc_size_ += size;
     return p;
   } else {
-    if (result == cudaErrorMemoryAllocation) {
-      result = cudaSuccess;
-    }
-    PADDLE_ENFORCE_CUDA_SUCCESS(result);
-
-    result = cudaGetLastError();
-    if (result == cudaErrorMemoryAllocation) {
-      result = cudaSuccess;
-    }
-    PADDLE_ENFORCE_CUDA_SUCCESS(result);
-
-    size_t avail, total;
-    platform::GpuMemoryUsage(&avail, &total);
+    ClearCUDAOutOfMemoryError(&result);
+
+    /**
+     * NOTE(zjl): cudaMemGetInfo itself may report an OOM error when
+     * very little GPU memory is left. In that case, treat the
+     * available GPU memory as 0 and throw the bad-alloc exception
+     * from this function instead of failing on the error returned
+     * by cudaMemGetInfo.
+     */
+    size_t avail = 0, total = 0;
+    result = cudaMemGetInfo(&avail, &total);
+    if (result != cudaSuccess) avail = 0;
+    ClearCUDAOutOfMemoryError(&result);
 
     PADDLE_THROW_BAD_ALLOC(
         "\n\nOut of memory error on GPU %d. "
-- 
GitLab