Commit 2002e71d authored by sneaxiy

fix pinned allocator

Parent 21fdf8e8
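This commit routes CUDAPinnedPlace allocations through a dedicated pinned-host allocator in the new allocator facade and drops the old kCrossDevice attribute plumbing. A minimal usage sketch, assuming a CUDA-enabled Paddle build and the allocator_facade.h header path (both assumptions; it mirrors the updated test further down):

    // Minimal sketch, assuming a CUDA-enabled build and this header path.
    #include "paddle/fluid/memory/allocation/allocator_facade.h"
    #include "paddle/fluid/platform/place.h"

    void PinnedAllocExample() {
      auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
      // After this commit, CUDAPinnedPlace is served by CUDAPinnedManagedAllocator,
      // and the returned allocation reports CUDAPinnedPlace rather than CPUPlace.
      auto pinned = facade.Alloc(paddle::platform::CUDAPinnedPlace(), 1 << 20);
      // pinned->ptr(), pinned->size(), pinned->place() behave as asserted in the test below.
      (void)pinned;
    }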
@@ -112,8 +112,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   dst->set_layout(src.layout());
   auto src_place = src.place();
   auto src_ptr = src.data<void>();
-  auto dst_ptr =
-      dst->mutable_data(dst_place, src.type(), memory::Allocator::kCrossDevice);
+  auto dst_ptr = dst->mutable_data(dst_place, src.type());
   auto size = src.numel() * SizeOfType(src.type());
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
......
@@ -2,7 +2,10 @@ cc_library(allocator SRCS allocator.cc DEPS place)
 cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
 cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
-nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
+if (WITH_GPU)
+  nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
+endif()
 cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
@@ -29,7 +32,7 @@ cc_library(naive_managed_allocator SRCS naive_managed_allocator.cc DEPS allocato
 cc_test(naive_managed_allocator_test SRCS naive_managed_allocator_test.cc DEPS naive_managed_allocator)
 nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
 if (WITH_GPU)
-  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator)
+  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard)
 else ()
   set(AllocatorFacadeDeps)
 endif()
@@ -48,8 +51,7 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS
     auto_increment_allocator
     zero_size_allocator
     conditional_allocator
-    retry_allocator
-    cuda_device_guard)
+    retry_allocator)

 nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
......
@@ -25,17 +25,18 @@
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
-#include "paddle/fluid/memory/allocation/pinned_allocator.h"
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 #include "paddle/fluid/memory/allocation/zero_size_allocator.h"
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/memory/allocation/cuda_allocator.h"
+#include "paddle/fluid/memory/allocation/pinned_allocator.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/gpu_info.h"
 #endif

-DEFINE_int32(
+DEFINE_int64(
     gpu_allocator_retry_time, 0,
     "The retry time (milliseconds) when allocator fails "
     "to allocate memory. No retry if this value is not greater than 0");
@@ -49,51 +50,34 @@ class CPUManagedAllocator : public ManagedAllocator {
  public:
   CPUManagedAllocator()
       : normal_allocator_(NaiveManagedAllocator::Create(
-            std::unique_ptr<Allocator>(new CPUAllocator()))),
-        communication_allocator_(NaiveManagedAllocator::Create(
-            std::unique_ptr<Allocator>(new CPUPinnedAllocator()))) {}
+            std::unique_ptr<Allocator>(new CPUAllocator()))) {}

   std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
-    if (attr == kCrossDevice) {
-      return communication_allocator_->Allocate(size, attr);
-    } else {
-      return normal_allocator_->Allocate(size, attr);
-    }
+    return normal_allocator_->Allocate(size, attr);
   }

   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
-    if (attr == kCrossDevice) {
-      return communication_allocator_->AllocateShared(size, attr);
-    } else {
-      return normal_allocator_->AllocateShared(size, attr);
-    }
+    return normal_allocator_->AllocateShared(size, attr);
   }

   bool IsAllocThreadSafe() const override { return true; }

  private:
   std::shared_ptr<ManagedAllocator> normal_allocator_;
-  std::shared_ptr<ManagedAllocator> communication_allocator_;
 };

-#ifdef PADDLE_WITH_CUDA
 // TODO(yy): Dirty code here. This class should be configurable in runtime.
-class CUDAManagedAllocator : public ManagedAllocator {
+class ChunkedManagedAllocator : public ManagedAllocator {
  public:
-  explicit CUDAManagedAllocator(int dev_id) {
-    platform::CUDADeviceGuard guard(dev_id);
-    max_chunk_size_ = platform::GpuMaxChunkSize();
-
-    raw_allocator_ = NaiveManagedAllocator::Create(std::unique_ptr<Allocator>(
-        new CUDAAllocator(platform::CUDAPlace(dev_id))));
+  explicit ChunkedManagedAllocator(std::unique_ptr<Allocator> system_allocator,
+                                   size_t max_chunk_size, size_t capacity = 1,
+                                   int64_t retry_time = -1)
+      : max_chunk_size_(max_chunk_size), retry_time_(retry_time) {
+    raw_allocator_ = NaiveManagedAllocator::Create(std::move(system_allocator));

     if (max_chunk_size_ == 0) {
       default_allocator_ = raw_allocator_;
     } else {
-      size_t available, total;
-      platform::GpuMemoryUsage(&available, &total);
-      size_t capacity = available / max_chunk_size_;
-
       if (capacity == 1) {
         VLOG(10) << "Create BestFitAllocator with chunk_size "
                  << max_chunk_size_;
@@ -119,7 +103,7 @@ class CUDAManagedAllocator : public ManagedAllocator {
       default_allocator_.reset(cond_allocator);
     }
   }
-  ~CUDAManagedAllocator() {
+  ~ChunkedManagedAllocator() {
     // Specify destruct order.
     default_allocator_.reset();
     chunks_.clear();
@@ -140,27 +124,71 @@ class CUDAManagedAllocator : public ManagedAllocator {
     std::unique_ptr<Allocator> unmanaged_allocator(new LockedAllocator(
         std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));
-    if (FLAGS_gpu_allocator_retry_time <= 0) {
+    if (retry_time_ <= 0) {
       VLOG(10) << "Create NaiveManagedAllocator without retry";
       return std::make_shared<AlignedAllocator<64u>>(
           NaiveManagedAllocator::Create(std::move(unmanaged_allocator)));
     } else {
-      VLOG(10) << "Create RetryAllocator with retry_time "
-               << FLAGS_gpu_allocator_retry_time << "ms";
+      VLOG(10) << "Create RetryAllocator with retry_time " << retry_time_
+               << "ms";
       return std::make_shared<AlignedAllocator<64u>>(RetryAllocator::Create(
-          std::move(unmanaged_allocator),
-          static_cast<size_t>(FLAGS_gpu_allocator_retry_time)));
+          std::move(unmanaged_allocator), static_cast<size_t>(retry_time_)));
     }
   }

   bool IsAllocThreadSafe() const override { return true; }

- private:
+ protected:
   size_t max_chunk_size_;
+  int64_t retry_time_;
   std::vector<std::unique_ptr<Allocation>> chunks_;
   std::shared_ptr<ManagedAllocator> raw_allocator_;
   std::shared_ptr<ManagedAllocator> default_allocator_;
 };
+
+#ifdef PADDLE_WITH_CUDA
+class CUDAManagedAllocator : public ChunkedManagedAllocator {
+ public:
+  explicit CUDAManagedAllocator(int dev_id)
+      : ChunkedManagedAllocator(
+            std::unique_ptr<Allocator>(
+                new CUDAAllocator(platform::CUDAPlace(dev_id))),
+            GetMaxChunkSize(dev_id), GetCapcity(dev_id), GetRetryTime()) {}
+
+ private:
+  static size_t GetMaxChunkSize(int dev_id) {
+    platform::CUDADeviceGuard guard(dev_id);
+    return platform::GpuMaxChunkSize();
+  }
+
+  static size_t GetCapcity(int dev_id) {
+    platform::CUDADeviceGuard guard(dev_id);
+    size_t available, total;
+    platform::GpuMemoryUsage(&available, &total);
+    size_t max_chunk_size = platform::GpuMaxChunkSize();
+    return max_chunk_size == 0 ? 0 : available / max_chunk_size;
+  }
+
+  static int64_t GetRetryTime() { return FLAGS_gpu_allocator_retry_time; }
+};
+
+class CUDAPinnedManagedAllocator : public ChunkedManagedAllocator {
+ public:
+  CUDAPinnedManagedAllocator()
+      : ChunkedManagedAllocator(
+            std::unique_ptr<Allocator>(new CPUPinnedAllocator()),
+            platform::CUDAPinnedMaxChunkSize(), GetCapacity(), -1) {
+  }  // never retry
+
+ private:
+  static size_t GetCapacity() {
+    size_t total = platform::CpuTotalPhysicalMemory();
+    size_t max_chunk_size = platform::CUDAPinnedMaxChunkSize();
+    return max_chunk_size == 0 ? 0 : total / max_chunk_size;
+  }
+};
 #endif

 class AllocatorFacadePrivate {
@@ -173,6 +201,7 @@ class AllocatorFacadePrivate {
   AllocatorFacadePrivate() {
     InitCPUAllocator();
     InitCUDAAllocator();
+    InitCUDAPinnedAllocator();
     WrapZeroSizeAllocator();
   }
@@ -183,13 +212,21 @@ class AllocatorFacadePrivate {
   void InitCUDAAllocator() {
 #ifdef PADDLE_WITH_CUDA
-    for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
+    int device_count = platform::GetCUDADeviceCount();
+    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
       allocators_[platform::CUDAPlace(dev_id)] =
           std::make_shared<CUDAManagedAllocator>(dev_id);
     }
 #endif
   }

+  void InitCUDAPinnedAllocator() {
+#ifdef PADDLE_WITH_CUDA
+    allocators_[platform::CUDAPinnedPlace()] =
+        std::make_shared<CUDAPinnedManagedAllocator>();
+#endif
+  }
+
   void WrapZeroSizeAllocator() {
     for (auto& pair : allocators_) {
       pair.second =
......
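The refactor above folds the chunking and retry logic into ChunkedManagedAllocator, parameterized by a system allocator, a max chunk size, a chunk capacity, and a retry time: GetCapcity() divides the currently available GPU memory by the chunk size, and the pinned variant divides total host physical memory by CUDAPinnedMaxChunkSize(). A standalone sketch of that capacity arithmetic (plain C++ with made-up byte counts, not Paddle code):

    #include <cstddef>
    #include <cstdio>

    // Mirrors GetCapcity()/GetCapacity(): a zero chunk size means "no chunking".
    static size_t ChunkCapacity(size_t budget_bytes, size_t max_chunk_size) {
      return max_chunk_size == 0 ? 0 : budget_bytes / max_chunk_size;
    }

    int main() {
      // GPU case: the budget is the currently *available* device memory.
      std::printf("gpu chunks: %zu\n", ChunkCapacity(8ull << 30, 1ull << 30));
      // Pinned case: the budget is total host physical memory, scaled elsewhere
      // by FLAGS_fraction_of_cuda_pinned_memory_to_use.
      std::printf("pinned chunks: %zu\n", ChunkCapacity(32ull << 30, 1ull << 30));
      return 0;
    }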
@@ -16,37 +16,70 @@
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>

+#ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_int32(gpu_allocator_retry_time);
+DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_int64(gpu_allocator_retry_time);
+#endif

 namespace paddle {
 namespace memory {
 namespace allocation {

 TEST(allocator, allocator) {
+#ifdef PADDLE_WITH_CUDA
   FLAGS_fraction_of_gpu_memory_to_use = 0.01;
   FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
+#endif

   auto &instance = AllocatorFacade::Instance();
+  platform::Place place;
+  size_t size = 1024;

   {
-    auto cpu_allocation = instance.Alloc(platform::CPUPlace(), 1024);
+    place = platform::CPUPlace();
+    size = 1024;
+    auto cpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(cpu_allocation, nullptr);
+    ASSERT_NE(cpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(cpu_allocation->place(), place);
+    ASSERT_EQ(cpu_allocation->size(), size);
   }

+#ifdef PADDLE_WITH_CUDA
   {
-    auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0), 1024);
+    place = platform::CUDAPlace(0);
+    size = 1024;
+    auto gpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
   }

   {
     // Allocate 2GB gpu memory
-    auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0),
-                                         2 * static_cast<size_t>(1 << 30));
+    place = platform::CUDAPlace(0);
+    size = 2 * static_cast<size_t>(1 << 30);
+    auto gpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
   }

-  {}
+  {
+    place = platform::CUDAPinnedPlace();
+    size = (1 << 20);
+    auto cuda_pinned_allocation =
+        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
+    ASSERT_NE(cuda_pinned_allocation, nullptr);
+    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
+    ASSERT_EQ(cuda_pinned_allocation->place(), place);
+    ASSERT_GE(cuda_pinned_allocation->size(), size);
+  }
+#endif
 }

 }  // namespace allocation
......
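A note on the assertions above: the CPU block checks size() for equality, while the GPU and pinned blocks only check size() >= requested, since the facade wraps those allocators in AlignedAllocator<64u> and may round requests up. A tiny standalone check of the same invariant (illustrative numbers only, not Paddle code):

    #include <cassert>
    #include <cstddef>

    int main() {
      size_t requested = 1024;
      // e.g. round up to a 64-byte boundary, as an aligned allocator might do
      size_t aligned = (requested + 63) / 64 * 64;
      assert(aligned >= requested);
      return 0;
    }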
@@ -17,6 +17,7 @@
 #include <atomic>      // NOLINT
 #include <functional>
 #include <memory>
+#include <mutex>  // NOLINT
 #include <thread>  // NOLINT
 #include <vector>
 #include "paddle/fluid/memory/allocation/allocator.h"
......
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/memory/allocation/locked_allocator.h"
+#include <mutex>  // NOLINT

 namespace paddle {
 namespace memory {
......
@@ -13,6 +13,7 @@
 // limitations under the License.

 #pragma once
 #include <memory>
+#include <mutex>  // NOLINT
 #include <thread>  // NOLINT
 #include "paddle/fluid/memory/allocation/allocator.h"
......
@@ -22,9 +22,9 @@ namespace allocation {
 std::unique_ptr<Allocation> CPUPinnedAllocator::Allocate(size_t size,
                                                          Allocator::Attr attr) {
-  PADDLE_ENFORCE_EQ(
-      attr, kCrossDevice,
-      "CPUPinnedAllocator should be used for Cross-Device Communication");
+  // PADDLE_ENFORCE_EQ(
+  //     attr, kCrossDevice,
+  //     "CPUPinnedAllocator should be used for Cross-Device Communication");
   void* ptr;
   PADDLE_ENFORCE(cudaMallocHost(&ptr, size));
......
@@ -23,7 +23,7 @@ namespace allocation {
 class CPUPinnedAllocation : public Allocation {
  public:
   CPUPinnedAllocation(void* ptr, size_t size)
-      : Allocation(ptr, size, platform::CPUPlace()) {}
+      : Allocation(ptr, size, platform::CUDAPinnedPlace()) {}
 };

 class CPUPinnedAllocator : public UnmanagedAllocator {
......
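With this change a pinned allocation reports CUDAPinnedPlace rather than CPUPlace, matching how it is produced: page-locked host memory from cudaMallocHost. A standalone CUDA-runtime sketch of that allocation path (not Paddle code; error handling simplified):

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
      void* ptr = nullptr;
      size_t size = 1 << 20;
      if (cudaMallocHost(&ptr, size) != cudaSuccess) {
        std::fprintf(stderr, "cudaMallocHost failed\n");
        return 1;
      }
      // Pinned (page-locked) host memory: the driver can DMA directly from it,
      // which is why it is tagged with CUDAPinnedPlace rather than CPUPlace.
      std::printf("pinned %zu bytes at %p\n", size, ptr);
      cudaFreeHost(ptr);
      return 0;
    }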
@@ -30,12 +30,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"

-// If use_pinned_memory is true, CPUAllocator calls mlock, which
-// returns pinned and locked memory as staging areas for data exchange
-// between host and device. Allocates too much would reduce the amount
-// of memory available to the system for paging. So, by default, we
-// should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+DECLARE_bool(use_pinned_memory);
 DECLARE_double(fraction_of_gpu_memory_to_use);
 namespace paddle {
 namespace memory {
......
@@ -98,7 +98,6 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace& place) {
 }

 #ifdef PADDLE_WITH_CUDA
-
 BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   static std::once_flag init_flag;
   static detail::BuddyAllocator** a_arr = nullptr;
@@ -128,15 +127,21 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   platform::SetDeviceId(gpu_id);
   return a_arr[gpu_id];
 }
+#endif

 template <>
 size_t Used<platform::CUDAPlace>(const platform::CUDAPlace& place) {
+#ifdef PADDLE_WITH_CUDA
   return GetGPUBuddyAllocator(place.device)->Used();
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void* Alloc<platform::CUDAPlace>(const platform::CUDAPlace& place,
                                  size_t size) {
+#ifdef PADDLE_WITH_CUDA
   auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
   auto* ptr = buddy_allocator->Alloc(size);
   if (ptr == nullptr) {
@@ -156,13 +161,21 @@ void* Alloc<platform::CUDAPlace>(const platform::CUDAPlace& place,
     cudaMemset(ptr, 0xEF, size);
   }
   return ptr;
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void Free<platform::CUDAPlace>(const platform::CUDAPlace& place, void* p) {
+#ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }

+#ifdef PADDLE_WITH_CUDA
 BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
   static std::once_flag init_flag;
   static BuddyAllocator* ba = nullptr;
@@ -176,15 +189,21 @@ BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
   return ba;
 }
+#endif

 template <>
 size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place) {
+#ifdef PADDLE_WITH_CUDA
   return GetCUDAPinnedBuddyAllocator()->Used();
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void* Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
                                        size_t size) {
+#ifdef PADDLE_WITH_CUDA
   auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
   void* ptr = buddy_allocator->Alloc(size);
@@ -196,14 +215,20 @@ void* Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
     memset(ptr, 0xEF, size);
   }
   return ptr;
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
                                      void* p) {
+#ifdef PADDLE_WITH_CUDA
   GetCUDAPinnedBuddyAllocator()->Free(p);
-}
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
 #endif
+}

 struct AllocVisitor : public boost::static_visitor<void*> {
   inline explicit AllocVisitor(size_t size) : size_(size) {}
......
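The hunks above make every CUDAPlace / CUDAPinnedPlace specialization compile in CPU-only builds and throw only if it is actually reached at runtime. A minimal sketch of the same guard pattern with a hypothetical function name (DemoPinnedUsed is not a Paddle symbol; PADDLE_WITH_CUDA is the same macro the file keys on):

    #include <cstddef>
    #include <stdexcept>

    size_t DemoPinnedUsed() {
    #ifdef PADDLE_WITH_CUDA
      return 0;  // a real build would query the pinned buddy allocator here
    #else
      throw std::runtime_error(
          "'CUDAPinnedPlace' is not supported in CPU only device.");
    #endif
    }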
@@ -27,6 +27,8 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
 }

 #ifdef PADDLE_WITH_CUDA
+static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
+
 template <>
 void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
@@ -36,6 +38,10 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
   } else {
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
+    // FIXME(zjl): do we really need it?
+    if (num <= kMaxGpuAsyncCopyBytes) {
+      cudaStreamSynchronize(0);
+    }
   }
 }
@@ -48,6 +54,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
   } else {
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
+    // FIXME(zjl): do we really need it?
+    if (num <= kMaxGpuAsyncCopyBytes) {
+      cudaStreamSynchronize(0);
+    }
   }
 }
......
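The copy paths above now synchronize the legacy default stream after a synchronous copy of at most kMaxGpuAsyncCopyBytes; the FIXME(zjl) comment itself questions whether this is needed. A standalone sketch of the same control flow using the CUDA runtime directly (an assumption: GpuMemcpySync is treated here as a plain cudaMemcpy):

    #include <cuda_runtime.h>
    #include <cstddef>

    static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K

    void CopyDeviceToHostSync(void* dst, const void* src, size_t num) {
      cudaMemcpy(dst, src, num, cudaMemcpyDeviceToHost);
      // Mirrors the FIXME(zjl) above: possibly redundant, kept for safety.
      if (num <= kMaxGpuAsyncCopyBytes) {
        cudaStreamSynchronize(0);
      }
    }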
@@ -56,10 +56,17 @@ DEFINE_double(
     "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
     "reserve the rest for page tables, etc");

+// If use_pinned_memory is true, CPUAllocator calls mlock, which
+// returns pinned and locked memory as staging areas for data exchange
+// between host and device. Allocates too much would reduce the amount
+// of memory available to the system for paging. So, by default, we
+// should set false to use_pinned_memory.
+DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+
 namespace paddle {
 namespace platform {

-inline size_t CpuTotalPhysicalMemory() {
+size_t CpuTotalPhysicalMemory() {
 #ifdef __APPLE__
   int mib[2];
   mib[0] = CTL_HW;
......
@@ -19,6 +19,8 @@ limitations under the License. */
 namespace paddle {
 namespace platform {

+size_t CpuTotalPhysicalMemory();
+
 //! Get the maximum allocation size for a machine.
 size_t CpuMaxAllocSize();
......
@@ -13,11 +13,11 @@ limitations under the License. */
 #include <string>
 #include <unordered_set>
 #include <vector>
-#include "paddle/fluid/platform/cuda_device_guard.h"

 #include "paddle/fluid/memory/memory.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/framework/rw_lock.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
 #endif

 namespace paddle {
......
@@ -19,7 +19,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
......
@@ -63,8 +63,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
 #ifdef PADDLE_WITH_CUDA
       auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
       auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
-          tensor.dims(), platform::CPUPlace(),
-          memory::Allocator::kCrossDevice));
+          tensor.dims(), platform::CPUPlace()));

       paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
                                       sizeof(CUR_TYPE) * tensor.numel(),
......
@@ -110,10 +110,10 @@ def __bootstrap__():
         os.environ['OMP_NUM_THREADS'] = str(num_threads)

     read_env_flags = [
-        'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope',
-        'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
-        'paddle_num_threads', "dist_threadpool_size", 'cpu_deterministic',
-        'eager_delete_tensor_gb', 'use_legacy_allocator'
+        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
+        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
+        'init_allocated_mem', 'paddle_num_threads', "dist_threadpool_size",
+        'cpu_deterministic', 'eager_delete_tensor_gb', 'use_legacy_allocator'
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
......