Commit 2002e71d authored by S sneaxiy

fix pinned allocator

Parent 21fdf8e8
......@@ -112,8 +112,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
dst->set_layout(src.layout());
auto src_place = src.place();
auto src_ptr = src.data<void>();
auto dst_ptr =
dst->mutable_data(dst_place, src.type(), memory::Allocator::kCrossDevice);
auto dst_ptr = dst->mutable_data(dst_place, src.type());
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
......
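For context, a minimal usage sketch of the simplified call (the helper function and include paths below are hypothetical, not part of this commit): the destination buffer is now obtained with the default allocator attribute, so callers only pass the place.

#include "paddle/fluid/framework/tensor_util.h"  // assumed location of TensorCopySync
#include "paddle/fluid/platform/place.h"

// Hypothetical helper: copy a GPU tensor to host memory synchronously.
void CopyToHost(const paddle::framework::Tensor& gpu_src,
                paddle::framework::Tensor* cpu_dst) {
  // Internally this now calls dst->mutable_data(dst_place, src.type());
  // memory::Allocator::kCrossDevice no longer needs to be supplied.
  paddle::framework::TensorCopySync(gpu_src, paddle::platform::CPUPlace(), cpu_dst);
}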
......@@ -2,7 +2,10 @@ cc_library(allocator SRCS allocator.cc DEPS place)
cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
if (WITH_GPU)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
endif()
cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
......@@ -29,7 +32,7 @@ cc_library(naive_managed_allocator SRCS naive_managed_allocator.cc DEPS allocato
cc_test(naive_managed_allocator_test SRCS naive_managed_allocator_test.cc DEPS naive_managed_allocator)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
if (WITH_GPU)
set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator)
set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard)
else ()
set(AllocatorFacadeDeps)
endif()
......@@ -48,8 +51,7 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS
auto_increment_allocator
zero_size_allocator
conditional_allocator
retry_allocator
cuda_device_guard)
retry_allocator)
nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
......
......@@ -25,17 +25,18 @@
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/memory/allocation/zero_size_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
DEFINE_int32(
DEFINE_int64(
gpu_allocator_retry_time, 0,
"The retry time (milliseconds) when allocator fails "
"to allocate memory. No retry if this value is not greater than 0");
......@@ -49,51 +50,34 @@ class CPUManagedAllocator : public ManagedAllocator {
public:
CPUManagedAllocator()
: normal_allocator_(NaiveManagedAllocator::Create(
std::unique_ptr<Allocator>(new CPUAllocator()))),
communication_allocator_(NaiveManagedAllocator::Create(
std::unique_ptr<Allocator>(new CPUPinnedAllocator()))) {}
std::unique_ptr<Allocator>(new CPUAllocator()))) {}
std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
if (attr == kCrossDevice) {
return communication_allocator_->Allocate(size, attr);
} else {
return normal_allocator_->Allocate(size, attr);
}
return normal_allocator_->Allocate(size, attr);
}
std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
if (attr == kCrossDevice) {
return communication_allocator_->AllocateShared(size, attr);
} else {
return normal_allocator_->AllocateShared(size, attr);
}
return normal_allocator_->AllocateShared(size, attr);
}
bool IsAllocThreadSafe() const override { return true; }
private:
std::shared_ptr<ManagedAllocator> normal_allocator_;
std::shared_ptr<ManagedAllocator> communication_allocator_;
};
#ifdef PADDLE_WITH_CUDA
// TODO(yy): Dirty code here. This class should be configurable in runtime.
class CUDAManagedAllocator : public ManagedAllocator {
class ChunkedManagedAllocator : public ManagedAllocator {
public:
explicit CUDAManagedAllocator(int dev_id) {
platform::CUDADeviceGuard guard(dev_id);
max_chunk_size_ = platform::GpuMaxChunkSize();
raw_allocator_ = NaiveManagedAllocator::Create(std::unique_ptr<Allocator>(
new CUDAAllocator(platform::CUDAPlace(dev_id))));
explicit ChunkedManagedAllocator(std::unique_ptr<Allocator> system_allocator,
size_t max_chunk_size, size_t capacity = 1,
int64_t retry_time = -1)
: max_chunk_size_(max_chunk_size), retry_time_(retry_time) {
raw_allocator_ = NaiveManagedAllocator::Create(std::move(system_allocator));
if (max_chunk_size_ == 0) {
default_allocator_ = raw_allocator_;
} else {
size_t available, total;
platform::GpuMemoryUsage(&available, &total);
size_t capacity = available / max_chunk_size_;
if (capacity == 1) {
VLOG(10) << "Create BestFitAllocator with chunk_size "
<< max_chunk_size_;
......@@ -119,7 +103,7 @@ class CUDAManagedAllocator : public ManagedAllocator {
default_allocator_.reset(cond_allocator);
}
~CUDAManagedAllocator() {
~ChunkedManagedAllocator() {
// Specify destruct order.
default_allocator_.reset();
chunks_.clear();
......@@ -140,27 +124,71 @@ class CUDAManagedAllocator : public ManagedAllocator {
std::unique_ptr<Allocator> unmanaged_allocator(new LockedAllocator(
std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));
if (FLAGS_gpu_allocator_retry_time <= 0) {
if (retry_time_ <= 0) {
VLOG(10) << "Create NaiveManagedAllocator without retry";
return std::make_shared<AlignedAllocator<64u>>(
NaiveManagedAllocator::Create(std::move(unmanaged_allocator)));
} else {
VLOG(10) << "Create RetryAllocator with retry_time "
<< FLAGS_gpu_allocator_retry_time << "ms";
VLOG(10) << "Create RetryAllocator with retry_time " << retry_time_
<< "ms";
return std::make_shared<AlignedAllocator<64u>>(RetryAllocator::Create(
std::move(unmanaged_allocator),
static_cast<size_t>(FLAGS_gpu_allocator_retry_time)));
std::move(unmanaged_allocator), static_cast<size_t>(retry_time_)));
}
}
bool IsAllocThreadSafe() const override { return true; }
private:
protected:
size_t max_chunk_size_;
int64_t retry_time_;
std::vector<std::unique_ptr<Allocation>> chunks_;
std::shared_ptr<ManagedAllocator> raw_allocator_;
std::shared_ptr<ManagedAllocator> default_allocator_;
};
#ifdef PADDLE_WITH_CUDA
class CUDAManagedAllocator : public ChunkedManagedAllocator {
public:
explicit CUDAManagedAllocator(int dev_id)
: ChunkedManagedAllocator(
std::unique_ptr<Allocator>(
new CUDAAllocator(platform::CUDAPlace(dev_id))),
GetMaxChunkSize(dev_id), GetCapcity(dev_id), GetRetryTime()) {}
private:
static size_t GetMaxChunkSize(int dev_id) {
platform::CUDADeviceGuard guard(dev_id);
return platform::GpuMaxChunkSize();
}
static size_t GetCapcity(int dev_id) {
platform::CUDADeviceGuard guard(dev_id);
size_t available, total;
platform::GpuMemoryUsage(&available, &total);
size_t max_chunk_size = platform::GpuMaxChunkSize();
return max_chunk_size == 0 ? 0 : available / max_chunk_size;
}
static int64_t GetRetryTime() { return FLAGS_gpu_allocator_retry_time; }
};
class CUDAPinnedManagedAllocator : public ChunkedManagedAllocator {
public:
CUDAPinnedManagedAllocator()
: ChunkedManagedAllocator(
std::unique_ptr<Allocator>(new CPUPinnedAllocator()),
platform::CUDAPinnedMaxChunkSize(), GetCapacity(), -1) {
} // never retry
private:
static size_t GetCapacity() {
size_t total = platform::CpuTotalPhysicalMemory();
size_t max_chunk_size = platform::CUDAPinnedMaxChunkSize();
return max_chunk_size == 0 ? 0 : total / max_chunk_size;
}
};
#endif
class AllocatorFacadePrivate {
......@@ -173,6 +201,7 @@ class AllocatorFacadePrivate {
AllocatorFacadePrivate() {
InitCPUAllocator();
InitCUDAAllocator();
InitCUDAPinnedAllocator();
WrapZeroSizeAllocator();
}
......@@ -183,13 +212,21 @@ class AllocatorFacadePrivate {
void InitCUDAAllocator() {
#ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
int device_count = platform::GetCUDADeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) {
allocators_[platform::CUDAPlace(dev_id)] =
std::make_shared<CUDAManagedAllocator>(dev_id);
}
#endif
}
void InitCUDAPinnedAllocator() {
#ifdef PADDLE_WITH_CUDA
allocators_[platform::CUDAPinnedPlace()] =
std::make_shared<CUDAPinnedManagedAllocator>();
#endif
}
void WrapZeroSizeAllocator() {
for (auto& pair : allocators_) {
pair.second =
......
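A minimal sketch of how pinned host memory is requested after this change, based on the AllocatorFacade calls exercised in the updated test below (the demo function and include path are assumptions, not part of this commit):

#include "paddle/fluid/memory/allocation/allocator_facade.h"  // assumed header path
#include "paddle/fluid/platform/place.h"

void PinnedAllocationDemo() {
  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
  // CUDAPinnedPlace is now registered in allocators_ and served by
  // CUDAPinnedManagedAllocator (a CPUPinnedAllocator chunked by
  // CUDAPinnedMaxChunkSize), replacing the removed kCrossDevice attribute.
  auto pinned = facade.Alloc(paddle::platform::CUDAPinnedPlace(), 1 << 20);
  // pinned->ptr() points at cudaMallocHost-backed memory; it is released
  // when the returned allocation object is destroyed.
}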
......@@ -16,37 +16,70 @@
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_int32(gpu_allocator_retry_time);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_int64(gpu_allocator_retry_time);
#endif
namespace paddle {
namespace memory {
namespace allocation {
TEST(allocator, allocator) {
#ifdef PADDLE_WITH_CUDA
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
auto &instance = AllocatorFacade::Instance();
platform::Place place;
size_t size = 1024;
{
auto cpu_allocation = instance.Alloc(platform::CPUPlace(), 1024);
place = platform::CPUPlace();
size = 1024;
auto cpu_allocation = instance.Alloc(place, size);
ASSERT_NE(cpu_allocation, nullptr);
ASSERT_NE(cpu_allocation->ptr(), nullptr);
ASSERT_EQ(cpu_allocation->place(), place);
ASSERT_EQ(cpu_allocation->size(), size);
}
#ifdef PADDLE_WITH_CUDA
{
auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0), 1024);
place = platform::CUDAPlace(0);
size = 1024;
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
// Allocate 2GB gpu memory
auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0),
2 * static_cast<size_t>(1 << 30));
place = platform::CUDAPlace(0);
size = 2 * static_cast<size_t>(1 << 30);
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{}
{
place = platform::CUDAPinnedPlace();
size = (1 << 20);
auto cuda_pinned_allocation =
instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
ASSERT_NE(cuda_pinned_allocation, nullptr);
ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
ASSERT_EQ(cuda_pinned_allocation->place(), place);
ASSERT_GE(cuda_pinned_allocation->size(), size);
}
#endif
}
} // namespace allocation
......
......@@ -17,6 +17,7 @@
#include <atomic> // NOLINT
#include <functional>
#include <memory>
#include <mutex> // NOLINT
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/memory/allocation/allocator.h"
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#include <mutex> // NOLINT
namespace paddle {
namespace memory {
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <memory>
#include <mutex> // NOLINT
#include <thread> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
......
......@@ -22,9 +22,9 @@ namespace allocation {
std::unique_ptr<Allocation> CPUPinnedAllocator::Allocate(size_t size,
Allocator::Attr attr) {
PADDLE_ENFORCE_EQ(
attr, kCrossDevice,
"CPUPinnedAllocator should be used for Cross-Device Communication");
// PADDLE_ENFORCE_EQ(
// attr, kCrossDevice,
// "CPUPinnedAllocator should be used for Cross-Device Communication");
void* ptr;
PADDLE_ENFORCE(cudaMallocHost(&ptr, size));
......
......@@ -23,7 +23,7 @@ namespace allocation {
class CPUPinnedAllocation : public Allocation {
public:
CPUPinnedAllocation(void* ptr, size_t size)
: Allocation(ptr, size, platform::CPUPlace()) {}
: Allocation(ptr, size, platform::CUDAPinnedPlace()) {}
};
class CPUPinnedAllocator : public UnmanagedAllocator {
......
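Putting the two pinned-allocator hunks together, the allocation path now looks roughly like this; a sketch reconstructed from the visible lines, not verbatim source, and the return statement is an assumption:

// Sketch of CPUPinnedAllocator::Allocate as implied by the hunks above.
std::unique_ptr<Allocation> CPUPinnedAllocator::Allocate(size_t size,
                                                         Allocator::Attr attr) {
  // The kCrossDevice-only check is commented out, so pinned memory can be
  // requested with any attribute.
  void* ptr;
  PADDLE_ENFORCE(cudaMallocHost(&ptr, size));
  // The allocation is tagged with CUDAPinnedPlace rather than CPUPlace, which
  // lets AllocatorFacade route frees and bookkeeping to the pinned allocator.
  return std::unique_ptr<Allocation>(new CPUPinnedAllocation(ptr, size));
}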
......@@ -30,12 +30,7 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
// If use_pinned_memory is true, CPUAllocator calls mlock, which
// returns pinned and locked memory as staging areas for data exchange
// between host and device. Allocates too much would reduce the amount
// of memory available to the system for paging. So, by default, we
// should set false to use_pinned_memory.
DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
DECLARE_bool(use_pinned_memory);
DECLARE_double(fraction_of_gpu_memory_to_use);
namespace paddle {
namespace memory {
......
......@@ -98,7 +98,6 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace& place) {
}
#ifdef PADDLE_WITH_CUDA
BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
static std::once_flag init_flag;
static detail::BuddyAllocator** a_arr = nullptr;
......@@ -128,15 +127,21 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
platform::SetDeviceId(gpu_id);
return a_arr[gpu_id];
}
#endif
template <>
size_t Used<platform::CUDAPlace>(const platform::CUDAPlace& place) {
#ifdef PADDLE_WITH_CUDA
return GetGPUBuddyAllocator(place.device)->Used();
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
}
template <>
void* Alloc<platform::CUDAPlace>(const platform::CUDAPlace& place,
size_t size) {
#ifdef PADDLE_WITH_CUDA
auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
auto* ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
......@@ -156,13 +161,21 @@ void* Alloc<platform::CUDAPlace>(const platform::CUDAPlace& place,
cudaMemset(ptr, 0xEF, size);
}
return ptr;
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
}
template <>
void Free<platform::CUDAPlace>(const platform::CUDAPlace& place, void* p) {
#ifdef PADDLE_WITH_CUDA
GetGPUBuddyAllocator(place.device)->Free(p);
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
}
#ifdef PADDLE_WITH_CUDA
BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
static std::once_flag init_flag;
static BuddyAllocator* ba = nullptr;
......@@ -176,15 +189,21 @@ BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
return ba;
}
#endif
template <>
size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place) {
#ifdef PADDLE_WITH_CUDA
return GetCUDAPinnedBuddyAllocator()->Used();
#else
PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
#endif
}
template <>
void* Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
size_t size) {
#ifdef PADDLE_WITH_CUDA
auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
void* ptr = buddy_allocator->Alloc(size);
......@@ -196,14 +215,20 @@ void* Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
memset(ptr, 0xEF, size);
}
return ptr;
#else
PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
#endif
}
template <>
void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
void* p) {
#ifdef PADDLE_WITH_CUDA
GetCUDAPinnedBuddyAllocator()->Free(p);
}
#else
PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
#endif
}
struct AllocVisitor : public boost::static_visitor<void*> {
inline explicit AllocVisitor(size_t size) : size_(size) {}
......
......@@ -27,6 +27,8 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
}
#ifdef PADDLE_WITH_CUDA
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K
template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
......@@ -36,6 +38,10 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} else {
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
// FIXME(zjl): do we really need it?
if (num <= kMaxGpuAsyncCopyBytes) {
cudaStreamSynchronize(0);
}
}
}
......@@ -48,6 +54,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} else {
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
// FIXME(zjl): do we really need it?
if (num <= kMaxGpuAsyncCopyBytes) {
cudaStreamSynchronize(0);
}
}
}
......
......@@ -56,10 +56,17 @@ DEFINE_double(
"Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
// If use_pinned_memory is true, CPUAllocator calls mlock, which
// returns pinned and locked memory as staging areas for data exchange
// between host and device. Allocates too much would reduce the amount
// of memory available to the system for paging. So, by default, we
// should set false to use_pinned_memory.
DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
namespace paddle {
namespace platform {
inline size_t CpuTotalPhysicalMemory() {
size_t CpuTotalPhysicalMemory() {
#ifdef __APPLE__
int mib[2];
mib[0] = CTL_HW;
......
......@@ -19,6 +19,8 @@ limitations under the License. */
namespace paddle {
namespace platform {
size_t CpuTotalPhysicalMemory();
//! Get the maximum allocation size for a machine.
size_t CpuMaxAllocSize();
......
......@@ -13,11 +13,11 @@ limitations under the License. */
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/memory/memory.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
namespace paddle {
......
......@@ -19,7 +19,9 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h"
......
......@@ -63,8 +63,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
#ifdef PADDLE_WITH_CUDA
auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
tensor.dims(), platform::CPUPlace(),
memory::Allocator::kCrossDevice));
tensor.dims(), platform::CPUPlace()));
paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
sizeof(CUR_TYPE) * tensor.numel(),
......
......@@ -110,10 +110,10 @@ def __bootstrap__():
os.environ['OMP_NUM_THREADS'] = str(num_threads)
read_env_flags = [
'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope',
'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
'paddle_num_threads', "dist_threadpool_size", 'cpu_deterministic',
'eager_delete_tensor_gb', 'use_legacy_allocator'
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
'init_allocated_mem', 'paddle_num_threads', "dist_threadpool_size",
'cpu_deterministic', 'eager_delete_tensor_gb', 'use_legacy_allocator'
]
if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline')
......