From a710ccc0cb5532e02f7e493aa83a7a8f50cb9b42 Mon Sep 17 00:00:00 2001
From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com>
Date: Wed, 6 Nov 2019 09:58:52 +0800
Subject: [PATCH] refine error message of allocator again, test=develop
 (#21023)

---
 .../fluid/memory/allocation/cuda_allocator.cc | 27 ++++++++++++++-----
 .../memory/allocation/retry_allocator_test.cc |  4 +--
 .../fluid/memory/detail/system_allocator.cc   | 19 ++-----------
 paddle/fluid/platform/gpu_info.cc             | 15 +++++++++++
 paddle/fluid/platform/gpu_info.h              |  3 +++
 5 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc
index 2ba3b6d0b5..af899230de 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_allocator.cc
@@ -35,14 +35,27 @@ void CUDAAllocator::FreeImpl(Allocation* allocation) {
 Allocation* CUDAAllocator::AllocateImpl(size_t size) {
   platform::CUDADeviceGuard guard(place_.device);
   void* ptr;
-  auto status = cudaMalloc(&ptr, size);
-  if (UNLIKELY(status != cudaSuccess)) {
-    PADDLE_ENFORCE_NE(cudaGetLastError(), cudaSuccess);
-    PADDLE_THROW_BAD_ALLOC("Cannot allocate %d on GPU %d, cuda status %d, %s",
-                           size, place_.device, status,
-                           cudaGetErrorString(status));
+  auto result = cudaMalloc(&ptr, size);
+  if (LIKELY(result == cudaSuccess)) {
+    return new Allocation(ptr, size, platform::Place(place_));
   }
-  return new Allocation(ptr, size, platform::Place(place_));
+
+  platform::RaiseNonOutOfMemoryError(&result);
+
+  size_t avail = 0, total = 0;
+  result = cudaMemGetInfo(&avail, &total);
+  if (result != cudaSuccess) avail = 0;
+  platform::RaiseNonOutOfMemoryError(&result);
+
+  PADDLE_THROW_BAD_ALLOC(
+      "\n\nOut of memory error on GPU %d. "
+      "Cannot allocate %s memory on GPU %d, "
+      "available memory is only %s.\n\n"
+      "Please check whether there is any other process using GPU %d.\n"
+      "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
+      "2. If no, please decrease the batch size of your model.\n",
+      place_.device, string::HumanReadableSize(size), place_.device,
+      string::HumanReadableSize(avail), place_.device);
 }
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc
index 11a8dfdc47..0e81f5f223 100644
--- a/paddle/fluid/memory/allocation/retry_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc
@@ -133,9 +133,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
     ASSERT_TRUE(false);
     allocation.reset();
   } catch (BadAlloc &ex) {
-    ASSERT_TRUE(std::string(ex.what()).find(
-                    "Cannot allocate " + std::to_string(allocate_size) +
-                    " on GPU " + std::to_string(p.device)) !=
+    ASSERT_TRUE(std::string(ex.what()).find("Cannot allocate") !=
                 std::string::npos);
   }
 }
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 001d5f4d6a..76a121241e 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -105,21 +105,6 @@ bool CPUAllocator::UseGpu() const { return false; }
 
 #ifdef PADDLE_WITH_CUDA
 
-static void ClearCUDAOutOfMemoryError(cudaError_t* status) {
-  if (*status == cudaErrorMemoryAllocation) {
-    *status = cudaSuccess;
-  }
-
-  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
-
-  *status = cudaGetLastError();
-  if (*status == cudaErrorMemoryAllocation) {
-    *status = cudaSuccess;
-  }
-
-  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
-}
-
 void* GPUAllocator::Alloc(size_t* index, size_t size) {
   // CUDA documentation doesn't explain if cudaMalloc returns nullptr
   // if size is 0. We just make sure it does.
@@ -135,7 +120,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
     gpu_alloc_size_ += size;
     return p;
   } else {
-    ClearCUDAOutOfMemoryError(&result);
+    platform::RaiseNonOutOfMemoryError(&result);
 
     /**
      * NOTE(zjl): Sometimes cudaMemGetInfo would raise OOM error
@@ -147,7 +132,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
     size_t avail = 0, total = 0;
     result = cudaMemGetInfo(&avail, &total);
     if (result != cudaSuccess) avail = 0;
-    ClearCUDAOutOfMemoryError(&result);
+    platform::RaiseNonOutOfMemoryError(&result);
 
     PADDLE_THROW_BAD_ALLOC(
         "\n\nOut of memory error on GPU %d. "
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index c8d312c61b..dba85d3141 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -302,5 +302,20 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
                  "error code : %d, %s", error_code, CudaErrorWebsite());
 }
+
+void RaiseNonOutOfMemoryError(cudaError_t *status) {
+  if (*status == cudaErrorMemoryAllocation) {
+    *status = cudaSuccess;
+  }
+
+  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
+
+  *status = cudaGetLastError();
+  if (*status == cudaErrorMemoryAllocation) {
+    *status = cudaSuccess;
+  }
+
+  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
+}
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index e468c4aab0..4b4e2b4ac3 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -95,6 +95,9 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
 
 //! Set memory dst with value count size asynchronously
 void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
 
+//! Raise error if status is not cudaSuccess or OOM, otherwise reset status.
+void RaiseNonOutOfMemoryError(cudaError_t *status);
+
 }  // namespace platform
 }  // namespace paddle
-- 
GitLab
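
For readers following the behavior change rather than the diff itself: the pattern that RaiseNonOutOfMemoryError centralizes is "swallow cudaErrorMemoryAllocation (the caller reports OOM with its own message and cudaMemGetInfo), fail immediately on any other CUDA error, and clear the sticky per-thread error flag with cudaGetLastError()". Below is a minimal standalone sketch of that flow under stated assumptions: the helper names RaiseIfNotOutOfMemory and TryCudaMalloc are illustrative, not part of this patch, and plain std::runtime_error stands in for PADDLE_ENFORCE_CUDA_SUCCESS / PADDLE_THROW_BAD_ALLOC.

// Illustrative sketch only, not PaddlePaddle code.
#include <cuda_runtime.h>

#include <cstdio>
#include <stdexcept>

// Ignore the OOM status (it is reported separately), fail hard on any other
// CUDA error, and reset the sticky error state via cudaGetLastError().
static void RaiseIfNotOutOfMemory(cudaError_t* status) {
  if (*status == cudaErrorMemoryAllocation) *status = cudaSuccess;
  if (*status != cudaSuccess) throw std::runtime_error(cudaGetErrorString(*status));
  *status = cudaGetLastError();  // returns and clears the last recorded error
  if (*status == cudaErrorMemoryAllocation) *status = cudaSuccess;
  if (*status != cudaSuccess) throw std::runtime_error(cudaGetErrorString(*status));
}

// Try to allocate `size` bytes on the currently selected device; on OOM,
// report how much memory is still free so the message is actionable.
void* TryCudaMalloc(size_t size, int device) {
  void* ptr = nullptr;
  cudaError_t result = cudaMalloc(&ptr, size);
  if (result == cudaSuccess) return ptr;

  RaiseIfNotOutOfMemory(&result);  // anything other than OOM is a real failure

  size_t avail = 0, total = 0;
  result = cudaMemGetInfo(&avail, &total);
  if (result != cudaSuccess) avail = 0;  // cudaMemGetInfo may itself return OOM
  RaiseIfNotOutOfMemory(&result);

  char msg[256];
  std::snprintf(msg, sizeof(msg),
                "Out of memory error on GPU %d. Cannot allocate %zu bytes, "
                "available memory is only %zu bytes.",
                device, size, avail);
  throw std::runtime_error(msg);  // the patch throws BadAlloc via PADDLE_THROW_BAD_ALLOC
}

Clearing the error state matters because CUDA records the last error per host thread; cudaGetLastError() both returns and resets it, so a stale cudaErrorMemoryAllocation from the failed cudaMalloc cannot leak into later, unrelated success checks.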