未验证 提交 a710ccc0 编写于 作者: Z Zeng Jinle 提交者: GitHub

refine error message of allocator again, test=develop (#21023)

上级 d89ca2ff
......@@ -35,14 +35,27 @@ void CUDAAllocator::FreeImpl(Allocation* allocation) {
// Allocate `size` bytes of device memory on the GPU named by `place_`.
// On success returns a new Allocation owning the raw pointer.
// On failure: any non-OOM CUDA error aborts via RaiseNonOutOfMemoryError;
// an OOM error produces a human-readable BadAlloc report including the
// requested size and the currently available device memory.
Allocation* CUDAAllocator::AllocateImpl(size_t size) {
  platform::CUDADeviceGuard guard(place_.device);
  void* ptr;
  auto result = cudaMalloc(&ptr, size);
  if (LIKELY(result == cudaSuccess)) {
    return new Allocation(ptr, size, platform::Place(place_));
  }
  // Raise immediately if the failure was anything other than OOM; for OOM
  // this also resets CUDA's sticky error state (via cudaGetLastError inside
  // RaiseNonOutOfMemoryError) so later CUDA calls are not poisoned.
  platform::RaiseNonOutOfMemoryError(&result);
  size_t avail = 0, total = 0;
  result = cudaMemGetInfo(&avail, &total);
  // cudaMemGetInfo itself may fail right after an OOM; fall back to
  // reporting 0 bytes available rather than a bogus number.
  if (result != cudaSuccess) avail = 0;
  platform::RaiseNonOutOfMemoryError(&result);
  PADDLE_THROW_BAD_ALLOC(
      "\n\nOut of memory error on GPU %d. "
      "Cannot allocate %s memory on GPU %d, "
      "available memory is only %s.\n\n"
      "Please check whether there is any other process using GPU %d.\n"
      "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
      "2. If no, please decrease the batch size of your model.\n",
      place_.device, string::HumanReadableSize(size), place_.device,
      string::HumanReadableSize(avail), place_.device);
}
} // namespace allocation
......
......@@ -133,9 +133,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
ASSERT_TRUE(false);
allocation.reset();
} catch (BadAlloc &ex) {
ASSERT_TRUE(std::string(ex.what()).find(
"Cannot allocate " + std::to_string(allocate_size) +
" on GPU " + std::to_string(p.device)) !=
ASSERT_TRUE(std::string(ex.what()).find("Cannot allocate") !=
std::string::npos);
}
}
......
......@@ -105,21 +105,6 @@ bool CPUAllocator::UseGpu() const { return false; }
#ifdef PADDLE_WITH_CUDA
// Treat an out-of-memory status as non-fatal: downgrade it to cudaSuccess
// so the caller can build its own OOM report, but abort (via
// PADDLE_ENFORCE_CUDA_SUCCESS) on any other CUDA error. The second
// check-and-clear pass runs on cudaGetLastError() as well, so CUDA's
// sticky per-thread error state is also drained of the OOM condition
// before control returns to the caller.
static void ClearCUDAOutOfMemoryError(cudaError_t* status) {
  // First pass: the status the caller observed directly.
  if (*status == cudaErrorMemoryAllocation) {
    *status = cudaSuccess;
  }
  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
  // Second pass: the sticky last-error state; reading it also resets it.
  *status = cudaGetLastError();
  if (*status == cudaErrorMemoryAllocation) {
    *status = cudaSuccess;
  }
  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
}
void* GPUAllocator::Alloc(size_t* index, size_t size) {
// CUDA documentation doesn't explain if cudaMalloc returns nullptr
// if size is 0. We just make sure it does.
......@@ -135,7 +120,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
gpu_alloc_size_ += size;
return p;
} else {
ClearCUDAOutOfMemoryError(&result);
platform::RaiseNonOutOfMemoryError(&result);
/**
* NOTE(zjl): Sometimes cudaMemGetInfo would raise OOM error
......@@ -147,7 +132,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
size_t avail = 0, total = 0;
result = cudaMemGetInfo(&avail, &total);
if (result != cudaSuccess) avail = 0;
ClearCUDAOutOfMemoryError(&result);
platform::RaiseNonOutOfMemoryError(&result);
PADDLE_THROW_BAD_ALLOC(
"\n\nOut of memory error on GPU %d. "
......
......@@ -302,5 +302,20 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
"error code : %d, %s",
error_code, CudaErrorWebsite());
}
// Raise (via PADDLE_ENFORCE_CUDA_SUCCESS) if *status is any CUDA error
// other than out-of-memory; an OOM status is swallowed and reset to
// cudaSuccess so callers can emit their own detailed OOM diagnostics.
// The same filtering is then applied to cudaGetLastError(), which both
// inspects and clears CUDA's sticky per-thread error state, so a prior
// OOM does not contaminate subsequent CUDA calls.
void RaiseNonOutOfMemoryError(cudaError_t *status) {
  // Filter the status value the caller passed in.
  if (*status == cudaErrorMemoryAllocation) {
    *status = cudaSuccess;
  }
  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
  // Filter (and thereby clear) the sticky last-error state as well.
  *status = cudaGetLastError();
  if (*status == cudaErrorMemoryAllocation) {
    *status = cudaSuccess;
  }
  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
}
} // namespace platform
} // namespace paddle
......@@ -95,6 +95,9 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
//! Set memory dst with value count size asynchronously
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
//! Raise error if status is not cudaSuccess or OOM, otherwise reset status.
void RaiseNonOutOfMemoryError(cudaError_t *status);
} // namespace platform
} // namespace paddle
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册