diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 2ba3b6d0b5b2a89fcbea472c2eb90d7874ce0104..af899230dee36334f3cb07e977bdc04c6e8bdfc3 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -35,14 +35,27 @@ void CUDAAllocator::FreeImpl(Allocation* allocation) { Allocation* CUDAAllocator::AllocateImpl(size_t size) { platform::CUDADeviceGuard guard(place_.device); void* ptr; - auto status = cudaMalloc(&ptr, size); - if (UNLIKELY(status != cudaSuccess)) { - PADDLE_ENFORCE_NE(cudaGetLastError(), cudaSuccess); - PADDLE_THROW_BAD_ALLOC("Cannot allocate %d on GPU %d, cuda status %d, %s", - size, place_.device, status, - cudaGetErrorString(status)); + auto result = cudaMalloc(&ptr, size); + if (LIKELY(result == cudaSuccess)) { + return new Allocation(ptr, size, platform::Place(place_)); } - return new Allocation(ptr, size, platform::Place(place_)); + + platform::RaiseNonOutOfMemoryError(&result); + + size_t avail = 0, total = 0; + result = cudaMemGetInfo(&avail, &total); + if (result != cudaSuccess) avail = 0; + platform::RaiseNonOutOfMemoryError(&result); + + PADDLE_THROW_BAD_ALLOC( + "\n\nOut of memory error on GPU %d. " + "Cannot allocate %s memory on GPU %d, " + "available memory is only %s.\n\n" + "Please check whether there is any other process using GPU %d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n" + "2. If no, please decrease the batch size of your model.\n", + place_.device, string::HumanReadableSize(size), place_.device, + string::HumanReadableSize(avail), place_.device); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index 11a8dfdc472d6b6cdec2b3f618aeb7065a10447b..0e81f5f2238f755de27750b405e771146b3cbf7d 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -133,9 +133,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) { ASSERT_TRUE(false); allocation.reset(); } catch (BadAlloc &ex) { - ASSERT_TRUE(std::string(ex.what()).find( - "Cannot allocate " + std::to_string(allocate_size) + - " on GPU " + std::to_string(p.device)) != + ASSERT_TRUE(std::string(ex.what()).find("Cannot allocate") != std::string::npos); } } diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 001d5f4d6ae5fbf2b7947ce04bf0785c26498487..76a121241e952d1c5c511bfbec17791fd605e23c 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -105,21 +105,6 @@ bool CPUAllocator::UseGpu() const { return false; } #ifdef PADDLE_WITH_CUDA -static void ClearCUDAOutOfMemoryError(cudaError_t* status) { - if (*status == cudaErrorMemoryAllocation) { - *status = cudaSuccess; - } - - PADDLE_ENFORCE_CUDA_SUCCESS(*status); - - *status = cudaGetLastError(); - if (*status == cudaErrorMemoryAllocation) { - *status = cudaSuccess; - } - - PADDLE_ENFORCE_CUDA_SUCCESS(*status); -} - void* GPUAllocator::Alloc(size_t* index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr // if size is 0. We just make sure it does. @@ -135,7 +120,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { gpu_alloc_size_ += size; return p; } else { - ClearCUDAOutOfMemoryError(&result); + platform::RaiseNonOutOfMemoryError(&result); /** * NOTE(zjl): Sometimes cudaMemGetInfo would raise OOM error @@ -147,7 +132,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { size_t avail = 0, total = 0; result = cudaMemGetInfo(&avail, &total); if (result != cudaSuccess) avail = 0; - ClearCUDAOutOfMemoryError(&result); + platform::RaiseNonOutOfMemoryError(&result); PADDLE_THROW_BAD_ALLOC( "\n\nOut of memory error on GPU %d. " diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index c8d312c61bdd7e6e420b69de417b2b8d69ab2b8a..dba85d31415b35d1cd95ef4897135b22808987ea 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -302,5 +302,20 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) { "error code : %d, %s", error_code, CudaErrorWebsite()); } + +void RaiseNonOutOfMemoryError(cudaError_t *status) { + if (*status == cudaErrorMemoryAllocation) { + *status = cudaSuccess; + } + + PADDLE_ENFORCE_CUDA_SUCCESS(*status); + + *status = cudaGetLastError(); + if (*status == cudaErrorMemoryAllocation) { + *status = cudaSuccess; + } + + PADDLE_ENFORCE_CUDA_SUCCESS(*status); +} } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index e468c4aab0b01c19b69a2e57e794c0ad0a117c71..4b4e2b4ac317e9eb2bdd79f6e9147c16d8df655e 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -95,6 +95,9 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, //! Set memory dst with value count size asynchronously void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream); +//! Raise error if status is not cudaSuccess or OOM, otherwise reset status. +void RaiseNonOutOfMemoryError(cudaError_t *status); + } // namespace platform } // namespace paddle