From a710ccc0cb5532e02f7e493aa83a7a8f50cb9b42 Mon Sep 17 00:00:00 2001
From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com>
Date: Wed, 6 Nov 2019 09:58:52 +0800
Subject: [PATCH] refine error message of allocator again, test=develop
 (#21023)

---
 .../fluid/memory/allocation/cuda_allocator.cc | 27 ++++++++++++++-----
 .../memory/allocation/retry_allocator_test.cc |  4 +--
 .../fluid/memory/detail/system_allocator.cc   | 19 ++-----------
 paddle/fluid/platform/gpu_info.cc             | 15 +++++++++++
 paddle/fluid/platform/gpu_info.h              |  3 +++
 5 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc
index 2ba3b6d0b5..af899230de 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_allocator.cc
@@ -35,14 +35,27 @@ void CUDAAllocator::FreeImpl(Allocation* allocation) {
 Allocation* CUDAAllocator::AllocateImpl(size_t size) {
   platform::CUDADeviceGuard guard(place_.device);
   void* ptr;
-  auto status = cudaMalloc(&ptr, size);
-  if (UNLIKELY(status != cudaSuccess)) {
-    PADDLE_ENFORCE_NE(cudaGetLastError(), cudaSuccess);
-    PADDLE_THROW_BAD_ALLOC("Cannot allocate %d on GPU %d, cuda status %d, %s",
-                           size, place_.device, status,
-                           cudaGetErrorString(status));
+  auto result = cudaMalloc(&ptr, size);
+  if (LIKELY(result == cudaSuccess)) {
+    return new Allocation(ptr, size, platform::Place(place_));
   }
-  return new Allocation(ptr, size, platform::Place(place_));
+
+  platform::RaiseNonOutOfMemoryError(&result);
+
+  size_t avail = 0, total = 0;
+  result = cudaMemGetInfo(&avail, &total);
+  if (result != cudaSuccess) avail = 0;
+  platform::RaiseNonOutOfMemoryError(&result);
+
+  PADDLE_THROW_BAD_ALLOC(
+      "\n\nOut of memory error on GPU %d. "
+      "Cannot allocate %s memory on GPU %d, "
+      "available memory is only %s.\n\n"
+      "Please check whether there is any other process using GPU %d.\n"
+      "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
+      "2. If no, please decrease the batch size of your model.\n",
+      place_.device, string::HumanReadableSize(size), place_.device,
+      string::HumanReadableSize(avail), place_.device);
 }
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc
index 11a8dfdc47..0e81f5f223 100644
--- a/paddle/fluid/memory/allocation/retry_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc
@@ -133,9 +133,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
     ASSERT_TRUE(false);
     allocation.reset();
   } catch (BadAlloc &ex) {
-    ASSERT_TRUE(std::string(ex.what()).find(
-                    "Cannot allocate " + std::to_string(allocate_size) +
-                    " on GPU " + std::to_string(p.device)) !=
+    ASSERT_TRUE(std::string(ex.what()).find("Cannot allocate") !=
                 std::string::npos);
   }
 }
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 001d5f4d6a..76a121241e 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -105,21 +105,6 @@ bool CPUAllocator::UseGpu() const { return false; }
 
 #ifdef PADDLE_WITH_CUDA
 
-static void ClearCUDAOutOfMemoryError(cudaError_t* status) {
-  if (*status == cudaErrorMemoryAllocation) {
-    *status = cudaSuccess;
-  }
-
-  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
-
-  *status = cudaGetLastError();
-  if (*status == cudaErrorMemoryAllocation) {
-    *status = cudaSuccess;
-  }
-
-  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
-}
-
 void* GPUAllocator::Alloc(size_t* index, size_t size) {
   // CUDA documentation doesn't explain if cudaMalloc returns nullptr
   // if size is 0. We just make sure it does.
@@ -135,7 +120,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
     gpu_alloc_size_ += size;
     return p;
   } else {
-    ClearCUDAOutOfMemoryError(&result);
+    platform::RaiseNonOutOfMemoryError(&result);
 
     /**
      * NOTE(zjl): Sometimes cudaMemGetInfo would raise OOM error
@@ -147,7 +132,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
     size_t avail = 0, total = 0;
     result = cudaMemGetInfo(&avail, &total);
     if (result != cudaSuccess) avail = 0;
-    ClearCUDAOutOfMemoryError(&result);
+    platform::RaiseNonOutOfMemoryError(&result);
 
     PADDLE_THROW_BAD_ALLOC(
         "\n\nOut of memory error on GPU %d. "
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index c8d312c61b..dba85d3141 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -302,5 +302,20 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
                  "error code : %d, %s", error_code, CudaErrorWebsite());
 }
+
+void RaiseNonOutOfMemoryError(cudaError_t *status) {
+  if (*status == cudaErrorMemoryAllocation) {
+    *status = cudaSuccess;
+  }
+
+  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
+
+  *status = cudaGetLastError();
+  if (*status == cudaErrorMemoryAllocation) {
+    *status = cudaSuccess;
+  }
+
+  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
+}
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index e468c4aab0..4b4e2b4ac3 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -95,6 +95,9 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
 
 //! Set memory dst with value count size asynchronously
 void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
 
+//! Raise error if status is not cudaSuccess or OOM, otherwise reset status.
+void RaiseNonOutOfMemoryError(cudaError_t *status);
+
 }  // namespace platform
 }  // namespace paddle
-- 
GitLab
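
For readers following the behavior change rather than the diff itself: the pattern that RaiseNonOutOfMemoryError centralizes is "swallow cudaErrorMemoryAllocation (the caller reports OOM with its own message and cudaMemGetInfo), fail immediately on any other CUDA error, and clear the sticky per-thread error flag with cudaGetLastError()". Below is a minimal standalone sketch of that flow under stated assumptions: the helper names RaiseIfNotOutOfMemory and TryCudaMalloc are illustrative, not part of this patch, and plain std::runtime_error stands in for PADDLE_ENFORCE_CUDA_SUCCESS / PADDLE_THROW_BAD_ALLOC.

// Illustrative sketch only, not PaddlePaddle code.
#include <cuda_runtime.h>

#include <cstdio>
#include <stdexcept>

// Ignore the OOM status (it is reported separately), fail hard on any other
// CUDA error, and reset the sticky error state via cudaGetLastError().
static void RaiseIfNotOutOfMemory(cudaError_t* status) {
  if (*status == cudaErrorMemoryAllocation) *status = cudaSuccess;
  if (*status != cudaSuccess) throw std::runtime_error(cudaGetErrorString(*status));
  *status = cudaGetLastError();  // returns and clears the last recorded error
  if (*status == cudaErrorMemoryAllocation) *status = cudaSuccess;
  if (*status != cudaSuccess) throw std::runtime_error(cudaGetErrorString(*status));
}

// Try to allocate `size` bytes on the currently selected device; on OOM,
// report how much memory is still free so the message is actionable.
void* TryCudaMalloc(size_t size, int device) {
  void* ptr = nullptr;
  cudaError_t result = cudaMalloc(&ptr, size);
  if (result == cudaSuccess) return ptr;

  RaiseIfNotOutOfMemory(&result);  // anything other than OOM is a real failure

  size_t avail = 0, total = 0;
  result = cudaMemGetInfo(&avail, &total);
  if (result != cudaSuccess) avail = 0;  // cudaMemGetInfo may itself return OOM
  RaiseIfNotOutOfMemory(&result);

  char msg[256];
  std::snprintf(msg, sizeof(msg),
                "Out of memory error on GPU %d. Cannot allocate %zu bytes, "
                "available memory is only %zu bytes.",
                device, size, avail);
  throw std::runtime_error(msg);  // the patch throws BadAlloc via PADDLE_THROW_BAD_ALLOC
}

Clearing the error state matters because CUDA records the last error per host thread; cudaGetLastError() both returns and resets it, so a stale cudaErrorMemoryAllocation from the failed cudaMalloc cannot leak into later, unrelated success checks.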