Unverified commit 42910361, authored by From00, committed by GitHub

Support allocating CUDA managed memory (#39075)

* Rough implementation for experiment

* Support allocating CUDA managed memory

* Fix CI error

* Modify UT

* Check whether memory oversubscription is supported

* Fix ROCM Compile error

* Fix ROCM Compile error

* Fix UT cuda_managed_memory_test

* Set UT timeout to 40

* Add UT OOMExceptionTest

* Set UT timeout to 50
Parent 4d7ad277
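Background, not part of the patch: CUDA managed (unified) memory is allocated with cudaMallocManaged, and the resulting pointer is valid on both host and device, with pages migrated on demand; cudaDevAttrManagedMemory reports whether a device supports it. The minimal standalone sketch below illustrates the primitive this commit builds on (the demo kernel and sizes are illustrative, not taken from the patch). The diff that follows wires this primitive into Paddle's allocator stack behind FLAGS_use_cuda_managed_memory, and the new cuda_managed_memory_test.cu exercises the same host/device round trip through Paddle's Alloc interface.

// Minimal standalone sketch (illustration only, not part of this commit):
// allocate managed memory, touch it from host and device, check support.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void fill_one(int* data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = 1;  // device-side write into managed memory
}

int main() {
  int supported = 0;
  // Same attribute the patch queries in IsGPUManagedMemorySupported().
  cudaDeviceGetAttribute(&supported, cudaDevAttrManagedMemory, /*device=*/0);
  if (!supported) {
    printf("Managed memory is not supported on device 0.\n");
    return 0;
  }
  const int n = 1024;
  int* data = nullptr;
  cudaMallocManaged(&data, n * sizeof(int));    // the call RecordedGpuMalloc now issues
  for (int i = 0; i < n; ++i) data[i] = 0;      // host write, no explicit cudaMemcpy
  fill_one<<<(n + 255) / 256, 256>>>(data, n);  // pages migrate to the device
  cudaDeviceSynchronize();
  int sum = 0;
  for (int i = 0; i < n; ++i) sum += data[i];   // host read of device results
  printf("sum = %d (expected %d)\n", sum, n);
  cudaFree(data);
  return 0;
}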
...@@ -20,18 +20,29 @@ if (WITH_GPU)
nv_test(stream_safe_cuda_alloc_test
SRCS stream_safe_cuda_alloc_test.cu
DEPS malloc cuda_graph_with_memory_pool)
nv_test(cuda_managed_memory_test
SRCS cuda_managed_memory_test.cu
DEPS malloc gpu_info place)
if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test)
set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES
ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true;FLAGS_allocator_strategy=auto_growth")
endif()
endif()
if (WITH_ROCM)
hip_test(malloc_test
SRCS malloc_test.cu
DEPS device_context malloc)
hip_test(cuda_managed_memory_test
SRCS cuda_managed_memory_test.cu
DEPS malloc gpu_info place)
endif()
if(WITH_TESTING AND TEST cuda_managed_memory_test)
set_tests_properties(cuda_managed_memory_test PROPERTIES
ENVIRONMENT "FLAGS_use_cuda_managed_memory=true;FLAGS_allocator_strategy=auto_growth"
TIMEOUT 50)
endif()
if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
......
...@@ -15,6 +15,7 @@ endif()
if (WITH_GPU)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
...@@ -27,6 +28,7 @@ endif()
if (WITH_ROCM)
hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
hip_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
hip_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
...@@ -42,7 +44,7 @@ endif()
cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
if (WITH_GPU OR WITH_ROCM)
set(AllocatorFacadeDeps gpu_info cuda_allocator cuda_managed_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator device_context)
if(CUDA_VERSION GREATER_EQUAL 10.2)
list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator)
endif()
......
...@@ -28,6 +28,7 @@
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <shared_mutex>
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_managed_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h" #include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
#include "paddle/fluid/memory/allocation/thread_local_allocator.h" #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
...@@ -80,6 +81,11 @@ PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false, ...@@ -80,6 +81,11 @@ PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false,
PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, false, PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, false,
"Enable StreamSafeCUDAAllocator"); "Enable StreamSafeCUDAAllocator");
PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory, false,
"Whether to use CUDAManagedAllocator to allocate "
"managed memory, only available for auto_growth "
"strategy");
DECLARE_string(allocator_strategy);
namespace paddle {
...@@ -436,6 +442,37 @@ class AllocatorFacadePrivate {
std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
}
void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
// Create a new CUDAAllocator or CUDAManagedAllocator for the given device
std::shared_ptr<Allocator> CreateCUDAAllocator(platform::CUDAPlace p) {
if (FLAGS_use_cuda_managed_memory) {
PADDLE_ENFORCE_EQ(
strategy_, AllocatorStrategy::kAutoGrowth,
platform::errors::InvalidArgument(
"CUDA managed memory is only implemented for auto_growth "
"strategy, not support %s strategy.\n"
"Please use auto_growth strategy by command `export "
"FLAGS_allocator_strategy=\"auto_growth\"`, or disable managed "
"memory by command `export FLAGS_use_cuda_managed_memory=false`",
FLAGS_allocator_strategy));
if (!platform::IsGPUManagedMemorySupported(p.device)) {
PADDLE_THROW(platform::errors::Unavailable(
"Failed to create CUDAManagedAllocator on GPU %d.\n\n"
"You have enabled CUDA managed memory, but the gpu device does not "
"support allocating managed memory.\n"
"If you don't actually need to use managed memory, please disable "
"it with command `export FLAGS_use_cuda_managed_memory=false`.\n"
"Or you must use the gpu device that supports managed memory."));
}
return std::make_shared<CUDAManagedAllocator>(p);
}
return std::make_shared<CUDAAllocator>(p);
}
void InitStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
PADDLE_ENFORCE_EQ(
strategy_, AllocatorStrategy::kAutoGrowth,
...@@ -452,13 +489,9 @@ class AllocatorFacadePrivate {
}
}
void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = CreateCUDAAllocator(p);
cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_);
#endif
...@@ -485,14 +518,14 @@ class AllocatorFacadePrivate {
std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), p);
} else {
auto cuda_allocator = CreateCUDAAllocator(p);
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(),
allow_free_idle_chunk_);
}
#else
auto cuda_allocator = CreateCUDAAllocator(p);
auto alignment = platform::GpuMinChunkSize();
bool need_addr_align = true;
// NOTE: sometimes, since cuda runtime can not be forked, calling any cuda
...@@ -535,7 +568,7 @@ class AllocatorFacadePrivate {
void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
bool allow_free_idle_chunk) {
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = CreateCUDAAllocator(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
#endif
...@@ -562,13 +595,13 @@ class AllocatorFacadePrivate {
std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), p);
} else {
auto cuda_allocator = CreateCUDAAllocator(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
}
#else
auto cuda_allocator = CreateCUDAAllocator(p);
auto alignment = platform::GpuMinChunkSize();
bool need_addr_align = true;
// NOTE: sometimes, since cuda runtime can not be forked, calling any cuda
...@@ -690,7 +723,7 @@ class AllocatorFacadePrivate {
int device_count = platform::GetGPUDeviceCount();
for (int i = 0; i < device_count; ++i) {
platform::CUDAPlace p(i);
system_allocators_[p] = CreateCUDAAllocator(p);
}
#endif
#ifdef PADDLE_WITH_MLU
......
...@@ -67,16 +67,24 @@ pten::Allocation* CUDAAllocator::AllocateImpl(size_t size) {
limit_size, limit_size);
}
std::string managed_memory_msg;
if (platform::IsGPUManagedMemoryOversubscriptionSupported(place_.device)) {
managed_memory_msg = string::Sprintf(
"If the above ways do not solve the out of memory problem, you can try "
"to use CUDA managed memory. The command is `export "
"FLAGS_use_cuda_managed_memory=false`.");
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on GPU %d. "
"Cannot allocate %s memory on GPU %d, %s memory has been allocated and "
"available memory is only %s.\n\n"
"Please check whether there is any other process using GPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
"2. If no, please decrease the batch size of your model. %s\n%s\n",
place_.device, string::HumanReadableSize(size), place_.device,
string::HumanReadableSize(allocated), string::HumanReadableSize(avail),
place_.device, err_msg, managed_memory_msg));
}
} // namespace allocation
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/cuda_managed_allocator.h"
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <string>
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace memory {
namespace allocation {
bool CUDAManagedAllocator::IsAllocThreadSafe() const { return true; }
void CUDAManagedAllocator::FreeImpl(pten::Allocation* allocation) {
PADDLE_ENFORCE_EQ(
allocation->place(), place_,
platform::errors::PermissionDenied(
"GPU memory is freed in incorrect device. This may be a bug"));
platform::RecordedGpuFree(allocation->ptr(), allocation->size(),
place_.device);
delete allocation;
}
pten::Allocation* CUDAManagedAllocator::AllocateImpl(size_t size) {
std::call_once(once_flag_, [this] { platform::SetDeviceId(place_.device); });
int dev_id = place_.device;
void* ptr;
auto result = platform::RecordedGpuMalloc(&ptr, size, dev_id,
/* malloc_managed_memory = */ true);
if (LIKELY(result == gpuSuccess)) {
return new Allocation(ptr, size, platform::Place(place_));
}
uint64_t limit_size = platform::RecordedGpuLimitSize(dev_id);
uint64_t malloc_size = platform::RecordedGpuMallocSize(dev_id);
bool is_limited =
platform::IsGpuMallocRecorded(dev_id) && malloc_size + size > limit_size;
std::string err_msg;
if (UNLIKELY(is_limited)) {
int64_t limit_size_mb = limit_size >> 20;
err_msg = string::Sprintf(
"Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger "
"value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum "
"GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size_mb, limit_size_mb);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on GPU %d. "
"Cannot allocate %s CUDA managed memory on GPU %d, %s memory has been "
"allocated.\n\n"
"Please check whether there is any other process using GPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
"2. If no, please decrease the batch size of your model. %s\n\n",
dev_id, string::HumanReadableSize(size), dev_id,
string::HumanReadableSize(malloc_size), dev_id, err_msg));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class CUDAManagedAllocator : public Allocator {
public:
explicit CUDAManagedAllocator(const platform::CUDAPlace& place)
: place_(place) {}
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(pten::Allocation* allocation) override;
pten::Allocation* AllocateImpl(size_t size) override;
private:
platform::CUDAPlace place_;
std::once_flag once_flag_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include "gtest/gtest.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
__global__ void write_kernel(int* data, uint64_t n, uint64_t step) {
int thread_num = gridDim.x * blockDim.x;
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
for (uint64_t i = thread_id; i * step < n; i += thread_num) {
*(data + i * step) = 1;
}
}
__global__ void sum_kernel(int* data, uint64_t n, uint64_t step, int* sum) {
int thread_num = gridDim.x * blockDim.x;
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
for (uint64_t i = thread_id; i * step < n; i += thread_num) {
atomicAdd(sum, *(data + i * step));
}
}
TEST(ManagedMemoryTest, H2DTest) {
if (!platform::IsGPUManagedMemorySupported(0)) {
return;
}
uint64_t n_data = 1024;
uint64_t step = 1;
allocation::AllocationPtr allocation =
Alloc(platform::CUDAPlace(0), n_data * sizeof(int));
int* data = static_cast<int*>(allocation->ptr());
memset(data, 0, n_data * sizeof(int)); // located on host memory
write_kernel<<<1, 1024>>>(data, n_data, step); // trans to device memory
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
#endif
int sum = 0;
for (uint64_t i = 0; i < n_data; ++i) {
sum += *(data + i);
}
EXPECT_EQ(sum, n_data / step);
allocation = nullptr;
}
TEST(ManagedMemoryTest, D2HTest) {
if (!platform::IsGPUManagedMemorySupported(0)) {
return;
}
uint64_t n_data = 1024;
uint64_t step = 1;
AllocationPtr allocation =
Alloc(platform::CUDAPlace(0), n_data * sizeof(int));
int* data = static_cast<int*>(allocation->ptr());
write_kernel<<<1, 1024>>>(data, n_data, step); // located on device memory
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
#endif
memset(data, 0, n_data * sizeof(int)); // trans to host memory
int sum = 0;
for (uint64_t i = 0; i < n_data; ++i) {
sum += *(data + i);
}
EXPECT_EQ(sum, 0);
}
TEST(ManagedMemoryTest, OversubscribeGPUMemoryTest) {
if (!platform::IsGPUManagedMemoryOversubscriptionSupported(0)) {
return;
}
uint64_t available_mem = platform::GpuAvailableMemToAlloc();
uint64_t n_data = available_mem * 2 / sizeof(int) +
1; // requires more than 2 * available_mem bytes
uint64_t step = 1024;
AllocationPtr data_allocation =
Alloc(platform::CUDAPlace(0), n_data * sizeof(int));
AllocationPtr sum_allocation = Alloc(platform::CUDAPlace(0), sizeof(int));
int* data = static_cast<int*>(data_allocation->ptr());
int* sum = static_cast<int*>(sum_allocation->ptr());
(*sum) = 0;
write_kernel<<<5120, 1024>>>(data, n_data, step);
sum_kernel<<<5120, 1024>>>(data, n_data, step, sum);
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
#endif
EXPECT_EQ(*sum, (n_data + step - 1) / step);
}
TEST(ManagedMemoryTest, OOMExceptionTest) {
EXPECT_THROW(Alloc(platform::CUDAPlace(0), size_t(1) << 60),
memory::allocation::BadAlloc);
}
} // namespace memory
} // namespace paddle
...@@ -167,18 +167,28 @@ class RecordedGpuMallocHelper {
* or cudaSuccess would be returned, and the cudaGetLastError() flag
* would be clear.
*/
gpuError_t Malloc(void **ptr, size_t size,
bool malloc_managed_memory = false) {
LockGuardPtr<std::mutex> lock(mtx_);
if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) {
return gpuErrorOutOfMemory;
}
CUDADeviceGuard guard(dev_id_);
gpuError_t result;
#ifdef PADDLE_WITH_HIP
if (UNLIKELY(malloc_managed_memory)) {
result = hipMallocManaged(ptr, size);
} else {
result = hipMalloc(ptr, size);
}
#else
CUDAGraphCaptureModeGuard capture_mode_guard;
if (UNLIKELY(malloc_managed_memory)) {
result = cudaMallocManaged(ptr, size);
} else {
result = cudaMalloc(ptr, size);
}
#endif
if (result == gpuSuccess) {
cur_size_.fetch_add(size);
...@@ -318,8 +328,10 @@ std::once_flag RecordedGpuMallocHelper::once_flag_;
std::vector<std::unique_ptr<RecordedGpuMallocHelper>>
RecordedGpuMallocHelper::instances_;
gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id,
bool malloc_managed_memory) {
return RecordedGpuMallocHelper::Instance(dev_id)->Malloc(
ptr, size, malloc_managed_memory);
}
void RecordedGpuFree(void *p, size_t size, int dev_id) {
...@@ -352,6 +364,10 @@ uint64_t RecordedGpuMallocSize(int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->RecordedSize();
}
uint64_t RecordedGpuLimitSize(int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->LimitSize();
}
bool IsGpuMallocRecorded(int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->NeedRecord();
}
...@@ -363,6 +379,15 @@ void EmptyCache(void) {
}
}
bool IsGPUManagedMemorySupported(int dev_id) {
return pten::backends::gpu::IsGPUManagedMemorySupported(dev_id);
}
bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) {
return pten::backends::gpu::IsGPUManagedMemoryOversubscriptionSupported(
dev_id);
}
void *GetGpuBasePtr(void *ptr, int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr);
}
......
...@@ -114,7 +114,8 @@ void GpuDestroyStream(gpuStream_t stream);
void GpuDeviceSync();
//! CudaMalloc with recorded info
gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id,
bool malloc_managed_memory = false);
//! CudaFree with recorded info
void RecordedGpuFree(void *p, size_t size, int dev_id);
...@@ -141,11 +142,17 @@ bool RecordedGpuMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
//! Get recorded cudaMalloc size. If record is disabled, return 0.
uint64_t RecordedGpuMallocSize(int dev_id);
uint64_t RecordedGpuLimitSize(int dev_id);
bool IsGpuMallocRecorded(int dev_id);
//! Empty idle cached memory held by the allocator.
void EmptyCache(void);
bool IsGPUManagedMemorySupported(int dev_id);
bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id);
//! Get the primitive pointer return from cudaMalloc, just implemented with
//! testing, do not use for release
void *GetGpuBasePtr(void *ptr, int dev_id);
......
...@@ -290,6 +290,43 @@ void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); }
gpuError_t GpuGetLastError() { return cudaGetLastError(); }
// See
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
// for more detail about managed memory requirements
bool IsGPUManagedMemorySupported(int dev_id) {
PADDLE_ENFORCE_LT(dev_id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
dev_id,
GetGPUDeviceCount()));
#if defined(__linux__) || defined(_WIN32)
int ManagedMemoryAttr;
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute(
&ManagedMemoryAttr, cudaDevAttrManagedMemory, dev_id));
return ManagedMemoryAttr != 0;
#else
return false;
#endif
}
bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) {
PADDLE_ENFORCE_LT(dev_id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
dev_id,
GetGPUDeviceCount()));
#ifdef __linux__
return IsGPUManagedMemorySupported(dev_id) &&
GetGPUComputeCapability(dev_id) >= 60;
#else
return false;
#endif
}
} // namespace gpu
} // namespace backends
} // namespace pten
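Side note (illustration only): the oversubscription check above relies on Paddle's GetGPUComputeCapability helper. An equivalent standalone query using only the CUDA runtime might look like the sketch below; the helper name is hypothetical, and the compute-capability >= 6.0 threshold simply mirrors the heuristic used in this patch (Pascal introduced GPU page faulting, which is what makes oversubscribing device memory possible on Linux).

// Standalone sketch (assumption: CUDA runtime only, Linux host) of the same
// heuristic as IsGPUManagedMemoryOversubscriptionSupported above.
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical helper name; not part of the patch.
bool ManagedOversubscriptionLikelySupported(int dev_id) {
  int managed = 0, major = 0, minor = 0;
  cudaDeviceGetAttribute(&managed, cudaDevAttrManagedMemory, dev_id);
  cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev_id);
  cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev_id);
  // major * 10 + minor mirrors GetGPUComputeCapability(dev_id) >= 60.
  return managed != 0 && (major * 10 + minor) >= 60;
}

int main() {
  printf("GPU 0 oversubscription support: %d\n",
         static_cast<int>(ManagedOversubscriptionLikelySupported(0)));
  return 0;
}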
...@@ -104,6 +104,10 @@ void GpuDeviceSync();
gpuError_t GpuGetLastError();
bool IsGPUManagedMemorySupported(int dev_id);
bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id);
class GPUDeviceGuard {
public:
explicit inline GPUDeviceGuard(int dev_id) {
......
...@@ -292,6 +292,40 @@ void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); }
gpuError_t GpuGetLastError() { return hipGetLastError(); }
bool IsGPUManagedMemorySupported(int dev_id) {
PADDLE_ENFORCE_LT(dev_id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
dev_id,
GetGPUDeviceCount()));
#if defined(__linux__) || defined(_WIN32)
int ManagedMemoryAttr;
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute(
&ManagedMemoryAttr, hipDeviceAttributeManagedMemory, dev_id));
return ManagedMemoryAttr != 0;
#else
return false;
#endif
}
bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) {
PADDLE_ENFORCE_LT(dev_id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
dev_id,
GetGPUDeviceCount()));
#ifdef __linux__
return IsGPUManagedMemorySupported(dev_id) &&
GetGPUComputeCapability(dev_id) >= 60;
#else
return false;
#endif
}
} // namespace gpu
} // namespace backends
} // namespace pten