diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 89917cdfae4d92ce4c4e1672a4401f79f554cbd3..9fe92831e3aee745f5899ca247259f46e37fbfbb 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -112,8 +112,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   dst->set_layout(src.layout());
   auto src_place = src.place();
   auto src_ptr = src.data();
-  auto dst_ptr =
-      dst->mutable_data(dst_place, src.type(), memory::Allocator::kCrossDevice);
+  auto dst_ptr = dst->mutable_data(dst_place, src.type());
   auto size = src.numel() * SizeOfType(src.type());
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 5620b30f5a6466a23263c716663dfb7a84869636..b2be837832336b2e5e08291feae67333854ae676 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -2,7 +2,10 @@ cc_library(allocator SRCS allocator.cc DEPS place)
 cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
 cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
-nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
+
+if (WITH_GPU)
+  nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
+endif()
 
 cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
 
@@ -29,7 +32,7 @@ cc_library(naive_managed_allocator SRCS naive_managed_allocator.cc DEPS allocato
 cc_test(naive_managed_allocator_test SRCS naive_managed_allocator_test.cc DEPS naive_managed_allocator)
 nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
 if (WITH_GPU)
-  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator)
+  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard)
 else ()
   set(AllocatorFacadeDeps)
 endif()
@@ -48,8 +51,7 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS
     auto_increment_allocator
     zero_size_allocator
     conditional_allocator
-    retry_allocator
-    cuda_device_guard)
+    retry_allocator)
 
 nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 02ea5d7e783d1bbfa0f9a2005dfc9e473ef8f4c6..f82668bffee4095eaacee8e3517b64f1e1414896 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -25,17 +25,18 @@
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
-#include "paddle/fluid/memory/allocation/pinned_allocator.h"
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 #include "paddle/fluid/memory/allocation/zero_size_allocator.h"
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/memory/allocation/cuda_allocator.h"
+#include "paddle/fluid/memory/allocation/pinned_allocator.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/gpu_info.h"
 #endif
 
-DEFINE_int32(
+DEFINE_int64(
     gpu_allocator_retry_time, 0,
     "The retry time (milliseconds) when allocator fails "
     "to allocate memory. No retry if this value is not greater than 0");
@@ -49,51 +50,34 @@ class CPUManagedAllocator : public ManagedAllocator {
  public:
   CPUManagedAllocator()
       : normal_allocator_(NaiveManagedAllocator::Create(
-            std::unique_ptr<Allocator>(new CPUAllocator()))),
-        communication_allocator_(NaiveManagedAllocator::Create(
-            std::unique_ptr<Allocator>(new CPUPinnedAllocator()))) {}
+            std::unique_ptr<Allocator>(new CPUAllocator()))) {}
 
   std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
-    if (attr == kCrossDevice) {
-      return communication_allocator_->Allocate(size, attr);
-    } else {
-      return normal_allocator_->Allocate(size, attr);
-    }
+    return normal_allocator_->Allocate(size, attr);
   }
 
   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
-    if (attr == kCrossDevice) {
-      return communication_allocator_->AllocateShared(size, attr);
-    } else {
-      return normal_allocator_->AllocateShared(size, attr);
-    }
+    return normal_allocator_->AllocateShared(size, attr);
   }
 
   bool IsAllocThreadSafe() const override { return true; }
 
  private:
   std::shared_ptr<ManagedAllocator> normal_allocator_;
-  std::shared_ptr<ManagedAllocator> communication_allocator_;
 };
 
-#ifdef PADDLE_WITH_CUDA
 // TODO(yy): Dirty code here. This class should be configurable in runtime.
-class CUDAManagedAllocator : public ManagedAllocator {
+class ChunkedManagedAllocator : public ManagedAllocator {
  public:
-  explicit CUDAManagedAllocator(int dev_id) {
-    platform::CUDADeviceGuard guard(dev_id);
-    max_chunk_size_ = platform::GpuMaxChunkSize();
-
-    raw_allocator_ = NaiveManagedAllocator::Create(std::unique_ptr<Allocator>(
-        new CUDAAllocator(platform::CUDAPlace(dev_id))));
+  explicit ChunkedManagedAllocator(std::unique_ptr<Allocator> system_allocator,
+                                   size_t max_chunk_size, size_t capacity = 1,
+                                   int64_t retry_time = -1)
+      : max_chunk_size_(max_chunk_size), retry_time_(retry_time) {
+    raw_allocator_ = NaiveManagedAllocator::Create(std::move(system_allocator));
 
     if (max_chunk_size_ == 0) {
       default_allocator_ = raw_allocator_;
     } else {
-      size_t available, total;
-      platform::GpuMemoryUsage(&available, &total);
-      size_t capacity = available / max_chunk_size_;
-
       if (capacity == 1) {
         VLOG(10) << "Create BestFitAllocator with chunk_size "
                  << max_chunk_size_;
@@ -119,7 +103,7 @@ class CUDAManagedAllocator : public ManagedAllocator {
     default_allocator_.reset(cond_allocator);
   }
 
-  ~CUDAManagedAllocator() {
+  ~ChunkedManagedAllocator() {
     // Specify destruct order.
     default_allocator_.reset();
     chunks_.clear();
@@ -140,27 +124,71 @@ class CUDAManagedAllocator : public ManagedAllocator {
     std::unique_ptr<Allocator> unmanaged_allocator(new LockedAllocator(
         std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));
 
-    if (FLAGS_gpu_allocator_retry_time <= 0) {
+    if (retry_time_ <= 0) {
       VLOG(10) << "Create NaiveManagedAllocator without retry";
       return std::make_shared>(
           NaiveManagedAllocator::Create(std::move(unmanaged_allocator)));
     } else {
-      VLOG(10) << "Create RetryAllocator with retry_time "
-               << FLAGS_gpu_allocator_retry_time << "ms";
+      VLOG(10) << "Create RetryAllocator with retry_time " << retry_time_
+               << "ms";
       return std::make_shared>(RetryAllocator::Create(
-          std::move(unmanaged_allocator),
-          static_cast<size_t>(FLAGS_gpu_allocator_retry_time)));
+          std::move(unmanaged_allocator), static_cast<size_t>(retry_time_)));
     }
   }
 
   bool IsAllocThreadSafe() const override { return true; }
 
- private:
+ protected:
   size_t max_chunk_size_;
+  int64_t retry_time_;
   std::vector<std::unique_ptr<Allocation>> chunks_;
   std::shared_ptr<ManagedAllocator> raw_allocator_;
   std::shared_ptr<ManagedAllocator> default_allocator_;
 };
+
+#ifdef PADDLE_WITH_CUDA
+
+class CUDAManagedAllocator : public ChunkedManagedAllocator {
+ public:
+  explicit CUDAManagedAllocator(int dev_id)
+      : ChunkedManagedAllocator(
+            std::unique_ptr<Allocator>(
+                new CUDAAllocator(platform::CUDAPlace(dev_id))),
+            GetMaxChunkSize(dev_id), GetCapcity(dev_id), GetRetryTime()) {}
+
+ private:
+  static size_t GetMaxChunkSize(int dev_id) {
+    platform::CUDADeviceGuard guard(dev_id);
+    return platform::GpuMaxChunkSize();
+  }
+
+  static size_t GetCapcity(int dev_id) {
+    platform::CUDADeviceGuard guard(dev_id);
+    size_t available, total;
+    platform::GpuMemoryUsage(&available, &total);
+    size_t max_chunk_size = platform::GpuMaxChunkSize();
+    return max_chunk_size == 0 ? 0 : available / max_chunk_size;
+  }
+
+  static int64_t GetRetryTime() { return FLAGS_gpu_allocator_retry_time; }
+};
+
+class CUDAPinnedManagedAllocator : public ChunkedManagedAllocator {
+ public:
+  CUDAPinnedManagedAllocator()
+      : ChunkedManagedAllocator(
+            std::unique_ptr<Allocator>(new CPUPinnedAllocator()),
+            platform::CUDAPinnedMaxChunkSize(), GetCapacity(), -1) {
+  }  // never retry
+
+ private:
+  static size_t GetCapacity() {
+    size_t total = platform::CpuTotalPhysicalMemory();
+    size_t max_chunk_size = platform::CUDAPinnedMaxChunkSize();
+    return max_chunk_size == 0 ? 0 : total / max_chunk_size;
+  }
+};
+
 #endif
 
 class AllocatorFacadePrivate {
@@ -173,6 +201,7 @@ class AllocatorFacadePrivate {
   AllocatorFacadePrivate() {
     InitCPUAllocator();
     InitCUDAAllocator();
+    InitCUDAPinnedAllocator();
     WrapZeroSizeAllocator();
   }
 
@@ -183,13 +212,21 @@ class AllocatorFacadePrivate {
   void InitCUDAAllocator() {
 #ifdef PADDLE_WITH_CUDA
-    for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
+    int device_count = platform::GetCUDADeviceCount();
+    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
       allocators_[platform::CUDAPlace(dev_id)] =
           std::make_shared<CUDAManagedAllocator>(dev_id);
     }
 #endif
   }
 
+  void InitCUDAPinnedAllocator() {
+#ifdef PADDLE_WITH_CUDA
+    allocators_[platform::CUDAPinnedPlace()] =
+        std::make_shared<CUDAPinnedManagedAllocator>();
+#endif
+  }
+
   void WrapZeroSizeAllocator() {
     for (auto& pair : allocators_) {
       pair.second =
diff --git a/paddle/fluid/memory/allocation/allocator_facade_test.cc b/paddle/fluid/memory/allocation/allocator_facade_test.cc
index 5185bf944460f6b36677bdd77c33048b79b6e6f2..802d79e15de253d4e67e35046bdf1d689258da6d 100644
--- a/paddle/fluid/memory/allocation/allocator_facade_test.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade_test.cc
@@ -16,37 +16,70 @@
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 
+#ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_int32(gpu_allocator_retry_time);
+DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_int64(gpu_allocator_retry_time);
+#endif
 
 namespace paddle {
 namespace memory {
 namespace allocation {
 
 TEST(allocator, allocator) {
+#ifdef PADDLE_WITH_CUDA
   FLAGS_fraction_of_gpu_memory_to_use = 0.01;
   FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
+#endif
 
   auto &instance = AllocatorFacade::Instance();
+  platform::Place place;
+  size_t size = 1024;
 
   {
-    auto cpu_allocation = instance.Alloc(platform::CPUPlace(), 1024);
+    place = platform::CPUPlace();
+    size = 1024;
+    auto cpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(cpu_allocation, nullptr);
+    ASSERT_NE(cpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(cpu_allocation->place(), place);
+    ASSERT_EQ(cpu_allocation->size(), size);
   }
 
+#ifdef PADDLE_WITH_CUDA
   {
-    auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0), 1024);
+    place = platform::CUDAPlace(0);
+    size = 1024;
+    auto gpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
   }
 
   {
     // Allocate 2GB gpu memory
-    auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0),
-                                         2 * static_cast<size_t>(1 << 30));
+    place = platform::CUDAPlace(0);
+    size = 2 * static_cast<size_t>(1 << 30);
+    auto gpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
   }
 
-  {}
+  {
+    place = platform::CUDAPinnedPlace();
+    size = (1 << 20);
+    auto cuda_pinned_allocation =
+        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
+    ASSERT_NE(cuda_pinned_allocation, nullptr);
+    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
+    ASSERT_EQ(cuda_pinned_allocation->place(), place);
+    ASSERT_GE(cuda_pinned_allocation->size(), size);
+  }
+#endif
 }
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/auto_increment_allocator.h b/paddle/fluid/memory/allocation/auto_increment_allocator.h
index f026c413d4b6afa6f8cbb1ad32a18dc048566dee..36ddd2b32e5b383f37f851befd7daf5816946323 100644
--- a/paddle/fluid/memory/allocation/auto_increment_allocator.h
+++ b/paddle/fluid/memory/allocation/auto_increment_allocator.h
@@ -17,6 +17,7 @@
 #include <atomic>  // NOLINT
 #include <functional>
 #include <memory>
+#include <mutex>  // NOLINT
 #include <thread>  // NOLINT
 #include <vector>
 #include "paddle/fluid/memory/allocation/allocator.h"
diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc
index 1e0febe10bb3ab49192cd2f7b855df01953702e5..dea87229f9143efb1d0efbe121bf923c6df0810a 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.cc
+++ b/paddle/fluid/memory/allocation/locked_allocator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
+#include <mutex>  // NOLINT
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h
index f092a5bad007ee6d1081c22ffacdbb6190bd4a73..d6b877ba4f7da3e191624b94eb832b7aa8c0069f 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.h
+++ b/paddle/fluid/memory/allocation/locked_allocator.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include <memory>
+#include <mutex>  // NOLINT
 #include <thread>  // NOLINT
 #include "paddle/fluid/memory/allocation/allocator.h"
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc
index dd1f5a3dd0f70b0603b4bd89db54c2fa9373740b..650dab1b27c8095b3bd1c6d33cff20a2d0d7c5de 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -22,9 +22,9 @@ namespace allocation {
 
 std::unique_ptr<Allocation> CPUPinnedAllocator::Allocate(size_t size,
                                                          Allocator::Attr attr) {
-  PADDLE_ENFORCE_EQ(
-      attr, kCrossDevice,
-      "CPUPinnedAllocator should be used for Cross-Device Communication");
+  // PADDLE_ENFORCE_EQ(
+  //     attr, kCrossDevice,
+  //     "CPUPinnedAllocator should be used for Cross-Device Communication");
   void* ptr;
   PADDLE_ENFORCE(cudaMallocHost(&ptr, size));
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h
index 2c9e09cd721bea969fceaa307bcf3ef93be6568c..d001a91d893e759ae838c93f6e104f5ed4b3a00b 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.h
+++ b/paddle/fluid/memory/allocation/pinned_allocator.h
@@ -23,7 +23,7 @@ namespace allocation {
 class CPUPinnedAllocation : public Allocation {
  public:
   CPUPinnedAllocation(void* ptr, size_t size)
-      : Allocation(ptr, size, platform::CPUPlace()) {}
+      : Allocation(ptr, size, platform::CUDAPinnedPlace()) {}
 };
 
 class CPUPinnedAllocator : public UnmanagedAllocator {
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 1b96798d23cec34a1863f56c1e4027ce32b2eec5..2019d1a14f6dd5ed09c251f26c6ca352faa594ae 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -30,12 +30,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
-// If use_pinned_memory is true, CPUAllocator calls mlock, which
-// returns pinned and locked memory as staging areas for data exchange
-// between host and device. Allocates too much would reduce the amount
-// of memory available to the system for paging. So, by default, we
-// should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+DECLARE_bool(use_pinned_memory);
 DECLARE_double(fraction_of_gpu_memory_to_use);
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
index fd81a0a7c6e2fb6b25c9b94c60f75d4bbdde7441..75686df4341aafc238b619c4d859ee001a6218d9 100644
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -98,7 +98,6 @@ size_t Used(const platform::CPUPlace& place) {
 }
 
 #ifdef PADDLE_WITH_CUDA
-
 BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   static std::once_flag init_flag;
   static detail::BuddyAllocator** a_arr = nullptr;
@@ -128,15 +127,21 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   platform::SetDeviceId(gpu_id);
   return a_arr[gpu_id];
 }
+#endif
 
 template <>
 size_t Used<platform::CUDAPlace>(const platform::CUDAPlace& place) {
+#ifdef PADDLE_WITH_CUDA
   return GetGPUBuddyAllocator(place.device)->Used();
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }
 
 template <>
 void* Alloc<platform::CUDAPlace>(const platform::CUDAPlace& place,
                                  size_t size) {
+#ifdef PADDLE_WITH_CUDA
   auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
   auto* ptr = buddy_allocator->Alloc(size);
   if (ptr == nullptr) {
@@ -156,13 +161,21 @@ void* Alloc(const platform::CUDAPlace& place,
     cudaMemset(ptr, 0xEF, size);
   }
   return ptr;
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }
 
 template <>
 void Free<platform::CUDAPlace>(const platform::CUDAPlace& place, void* p) {
+#ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }
 
+#ifdef PADDLE_WITH_CUDA
 BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
   static std::once_flag init_flag;
   static BuddyAllocator* ba = nullptr;
@@ -176,15 +189,21 @@ BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
 
   return ba;
 }
+#endif
 
 template <>
 size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place) {
+#ifdef PADDLE_WITH_CUDA
   return GetCUDAPinnedBuddyAllocator()->Used();
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
 }
 
 template <>
 void* Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
                                        size_t size) {
+#ifdef PADDLE_WITH_CUDA
   auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
   void* ptr = buddy_allocator->Alloc(size);
 
@@ -196,14 +215,20 @@ void* Alloc(const platform::CUDAPinnedPlace& place,
     memset(ptr, 0xEF, size);
   }
   return ptr;
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
 }
 
 template <>
 void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
                                      void* p) {
+#ifdef PADDLE_WITH_CUDA
   GetCUDAPinnedBuddyAllocator()->Free(p);
-}
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
 #endif
+}
 
 struct AllocVisitor : public boost::static_visitor<void*> {
   inline explicit AllocVisitor(size_t size) : size_(size) {}
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index a177d4985fd0e2cca983b6873af89c60f526b811..2a6f70a01e303aa1b608248cbeb8dcfa24837a0c 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -27,6 +27,8 @@ void Copy(platform::CPUPlace, void* dst,
 }
 
 #ifdef PADDLE_WITH_CUDA
+static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
+
 template <>
 void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
@@ -36,6 +38,10 @@ void Copy(
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
   } else {
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
+    // FIXME(zjl): do we really need it?
+    if (num <= kMaxGpuAsyncCopyBytes) {
+      cudaStreamSynchronize(0);
+    }
   }
 }
 
@@ -48,6 +54,10 @@ void Copy(
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
   } else {
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
+    // FIXME(zjl): do we really need it?
+    if (num <= kMaxGpuAsyncCopyBytes) {
+      cudaStreamSynchronize(0);
+    }
   }
 }
 
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index 2880c09263f10e9c624e11b77188171f48d9db28..f12070acf8ba1fc1353fcffa95642a79f3597774 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -56,10 +56,17 @@ DEFINE_double(
     "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
     "reserve the rest for page tables, etc");
 
+// If use_pinned_memory is true, CPUAllocator calls mlock, which
+// returns pinned and locked memory as staging areas for data exchange
+// between host and device. Allocates too much would reduce the amount
+// of memory available to the system for paging. So, by default, we
+// should set false to use_pinned_memory.
+DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+
 namespace paddle {
 namespace platform {
 
-inline size_t CpuTotalPhysicalMemory() {
+size_t CpuTotalPhysicalMemory() {
 #ifdef __APPLE__
   int mib[2];
   mib[0] = CTL_HW;
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 30c8fbcfce92a8b06a175ddf198cde572f72b2a4..e2221414e1a6c29b53b1bc1f64da4e984a98f25f 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -19,6 +19,8 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
+size_t CpuTotalPhysicalMemory();
+
 //! Get the maximum allocation size for a machine.
 size_t CpuMaxAllocSize();
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 6b1d5e297dd3b5a41d63e61d02afd367859d1431..e026ff703dafab1e6ca04de2e01f90c875649f46 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -13,11 +13,11 @@ limitations under the License. */
 #include
 #include
 #include
-#include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/memory/memory.h"
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/framework/rw_lock.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 
 namespace paddle {
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 25a693ab95f1b360a532527e7838dca0b952f294..3d5c4ac2dc0ccd74674cc978dc4fb093ad180f9a 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -19,7 +19,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index e55f734e45b6b40741386979324ac88b1b989fa1..b39323f843f8dbf5a7e4bac841c8cb8ed7efdc07 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -63,8 +63,7 @@ struct CastToPyBufferImpl {
 #ifdef PADDLE_WITH_CUDA
         auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
         auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
-            tensor.dims(), platform::CPUPlace(),
-            memory::Allocator::kCrossDevice));
+            tensor.dims(), platform::CPUPlace()));
 
         paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
                                         sizeof(CUR_TYPE) * tensor.numel(),
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index ea1086cd4d0b71c8075d6895a68935a4f098fbc3..f29b85b3072e5452d1562f916bd05065f7538800 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -110,10 +110,10 @@ def __bootstrap__():
     os.environ['OMP_NUM_THREADS'] = str(num_threads)
 
     read_env_flags = [
-        'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope',
-        'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
-        'paddle_num_threads', "dist_threadpool_size", 'cpu_deterministic',
-        'eager_delete_tensor_gb', 'use_legacy_allocator'
+        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
+        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
+        'init_allocated_mem', 'paddle_num_threads', "dist_threadpool_size",
+        'cpu_deterministic', 'eager_delete_tensor_gb', 'use_legacy_allocator'
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
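Usage sketch (not part of the patch): the facade path added here routes platform::CUDAPinnedPlace to the new CUDAPinnedManagedAllocator, so pinned host memory can be requested through AllocatorFacade without the old kCrossDevice attribute. The snippet below is a minimal illustration assuming only the calls visible in this diff (AllocatorFacade::Instance().Alloc, the Allocation accessors ptr()/size()/place(), and CUDAPinnedPlace); the allocator_facade.h header path is inferred from the .cc file above and may differ in other branches.

    // Illustration only: allocate 1 MB of CUDA pinned host memory via the
    // facade and check that the Allocation now reports CUDAPinnedPlace
    // (before this patch CPUPinnedAllocation reported CPUPlace).
    #include <iostream>

    #include "paddle/fluid/memory/allocation/allocator_facade.h"  // assumed path
    #include "paddle/fluid/platform/place.h"

    int main() {
      namespace allocation = paddle::memory::allocation;
      namespace platform = paddle::platform;

      auto &instance = allocation::AllocatorFacade::Instance();
      platform::Place place = platform::CUDAPinnedPlace();

      // Mirrors the new block in allocator_facade_test.cc.
      auto pinned = instance.Alloc(place, 1 << 20);

      std::cout << "ptr=" << pinned->ptr() << " size=" << pinned->size()
                << " reports_pinned_place=" << (pinned->place() == place)
                << std::endl;
      return 0;
    }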