diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index ad19d729ebde4a9c81c283518f3cb2ac28152443..265a5c6fe2479f1c96569c0c55209063797da881 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -202,6 +202,8 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
 
+cc_test(inlined_vector_test SRCS inlined_vector_test.cc)
+
 if (NOT WIN32)
   cc_test(rw_lock_test SRCS rw_lock_test.cc)
 endif (NOT WIN32)
diff --git a/paddle/fluid/framework/inlined_stack.h b/paddle/fluid/framework/inlined_vector.h
similarity index 71%
rename from paddle/fluid/framework/inlined_stack.h
rename to paddle/fluid/framework/inlined_vector.h
index 1083c9f77c5476dc20a8e0ccf5acd0f718436ef6..0adff9d212161768232ff3133c9f64f77ab467ea 100644
--- a/paddle/fluid/framework/inlined_stack.h
+++ b/paddle/fluid/framework/inlined_vector.h
@@ -14,18 +14,18 @@
 
 #pragma once
 
-#include <deque>
+#include <vector>
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
 
 template <typename T, size_t N>
-class InlinedStack {
+class InlinedVector {
   static_assert(N > 0, "N must be larger than 0");
 
  public:
-  inline void push(const T& item) {
+  inline void push_back(const T& item) {
     if (size_ < N) {
       head_[size_] = item;
     } else {
@@ -34,21 +34,21 @@ class InlinedStack {
     ++size_;
   }
 
-  inline void pop() {
-    PADDLE_ENFORCE(!empty(), "Try to pop element from empty stack.");
+  inline void pop_back() {
+    PADDLE_ENFORCE(!empty(), "Try to pop back element from empty vector.");
     if (size_ > N) {
       tail_.pop_back();
     }
     --size_;
   }
 
-  inline const T& top() const {
-    PADDLE_ENFORCE(!empty(), "Try to get top element of empty stack.");
+  inline const T& back() const {
+    PADDLE_ENFORCE(!empty(), "Try to get back element of empty vector.");
     return size_ <= N ? head_[size_ - 1] : tail_.back();
   }
 
-  inline T& top() {
-    PADDLE_ENFORCE(!empty(), "Try to get top element of empty stack.");
+  inline T& back() {
+    PADDLE_ENFORCE(!empty(), "Try to get back element of empty vector.");
     return size_ <= N ? head_[size_ - 1] : tail_.back();
   }
 
@@ -63,10 +63,19 @@ class InlinedStack {
     return i < N ? head_[i] : tail_[i - N];
   }
 
+  operator std::vector<T>() const {
+    std::vector<T> ret;
+    ret.reserve(size_);
+    for (size_t i = 0; i < size_; ++i) {
+      ret.emplace_back((*this)[i]);
+    }
+    return ret;
+  }
+
  private:
   T head_[N];
   size_t size_{0};
-  std::deque<T> tail_;
+  std::vector<T> tail_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/inlined_vector_test.cc b/paddle/fluid/framework/inlined_vector_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b2b7a95b5e964fc82830dbdb58f050686dcb978f
--- /dev/null
+++ b/paddle/fluid/framework/inlined_vector_test.cc
@@ -0,0 +1,53 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/inlined_vector.h"
+#include <vector>
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace framework {
+
+TEST(inlined_vector, inlined_vector) {
+  size_t max_num = 10;
+
+  InlinedVector<size_t, 10> stack;
+
+  for (size_t i = 0; i < max_num; ++i) {
+    ASSERT_EQ(stack.size(), i);
+    stack.push_back(i);
+    ASSERT_EQ(stack.size(), i + 1);
+  }
+
+  std::vector<size_t> vec = stack;
+
+  ASSERT_EQ(stack.size(), vec.size());
+
+  for (size_t i = 0; i < vec.size(); ++i) {
+    ASSERT_EQ(stack[i], vec[i]);
+  }
+
+  for (size_t i = 0; i < max_num; ++i) {
+    ASSERT_EQ(stack[i], i);
+  }
+
+  for (size_t i = 0; i < max_num; ++i) {
+    ASSERT_EQ(stack.back(), max_num - 1 - i);
+    stack.pop_back();
+    ASSERT_EQ(stack.size(), max_num - 1 - i);
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
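For orientation, a minimal standalone sketch of how the renamed container behaves (hypothetical driver code, not part of this PR; it assumes the size()/empty() accessors that the header keeps from InlinedStack):

    #include "paddle/fluid/framework/inlined_vector.h"

    int main() {
      // With N == 2, the first two elements live in the fixed-size head_
      // array; the third spills into the heap-backed tail_ vector.
      paddle::framework::InlinedVector<int, 2> v;
      v.push_back(1);  // stored in head_[0]
      v.push_back(2);  // stored in head_[1]
      v.push_back(3);  // spills into tail_[0]
      // operator[] dispatches on the index: i < N reads head_, else tail_[i - N].
      bool ok = (v[2] == 3) && (v.back() == 3) && (v.size() == 3);
      v.pop_back();  // shrinks tail_ first, since size_ > N
      return ok && v.back() == 2 ? 0 : 1;
    }

The rename is apt because the old push/pop/top semantics were already vector-like (indexable storage with a fixed inline head), so push_back/pop_back/back describe the actual behavior.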
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 26ae89fe2869cbb0486daa6ce37c9bb86f043d7e..7552eee77e469f803a69d1e7dd28549fe9866c69 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -3,13 +3,18 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
 cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
 cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
-cc_library(multi_bin_buffered_allocator SRCS multi_bin_buffered_allocator.cc DEPS allocator)
+cc_library(multi_bin_buffered_allocator SRCS multi_bin_buffered_allocator.cc DEPS allocator gflags)
 cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
+cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
 cc_test(multi_bin_buffered_allocator_test SRCS multi_bin_buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator multi_bin_buffered_allocator cpu_allocator)
-cc_library(auto_increment_best_fit_allocator SRCS auto_increment_best_fit_allocator.cc DEPS allocator)
-cc_test(auto_increment_best_fit_allocator_test SRCS auto_increment_best_fit_allocator_test.cc DEPS cpu_allocator auto_increment_best_fit_allocator)
+cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator)
+cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
+
+if (NOT WIN32)
+  cc_test(test_multi_bin_buffered_allocator_division_plan SRCS test_multi_bin_buffered_allocator_division_plan.cc DEPS multi_bin_buffered_allocator)
+endif()
 
 if (WITH_GPU)
   nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
@@ -42,30 +47,20 @@ else ()
   set(AllocatorFacadeDeps)
 endif()
 
+list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator multi_bin_buffered_allocator auto_growth_best_fit_allocator legacy_allocator zero_size_allocator)
+
 cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
 cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
-cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
 cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator)
-cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags)
-cc_library(allocator_facade SRCS allocator_facade.cc DEPS
-    ${AllocatorFacadeDeps}
-    cpu_allocator
-    locked_allocator
-    best_fit_allocator
-    aligned_allocator
-    auto_increment_allocator
-    zero_size_allocator
-    conditional_allocator
-    retry_allocator
-    buffered_allocator
-    multi_bin_buffered_allocator
-    auto_increment_best_fit_allocator
-    allocator_strategy
-    legacy_allocator
-    )
+cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
+cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
 
 nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
 
 cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
 
 cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade)
+
+cc_test(naive_best_fit_allocator_facade_test SRCS naive_best_fit_allocator_facade_test.cc DEPS allocator_facade)
+
+cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS allocator_facade)
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h
index 602d85bf9e8811bb7b1effc2292a84be1301b14c..b536d4276e3b6236d0748eee588d345dd15c6954 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.h
+++ b/paddle/fluid/memory/allocation/aligned_allocator.h
@@ -14,6 +14,7 @@
 #pragma once
 
 #include <memory>
+#include <utility>
 #include "paddle/fluid/memory/allocation/allocator.h"
 
 namespace paddle {
diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc
index 15a722730074554e721cd0d887d0a4b740e2b40a..5a5253d911abc722c026730e7e88eb326bb82afd 100644
--- a/paddle/fluid/memory/allocation/allocator.cc
+++ b/paddle/fluid/memory/allocation/allocator.cc
@@ -27,24 +27,24 @@ bool Allocator::IsAllocThreadSafe() const { return false; }
 
 AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
   auto ptr = AllocateImpl(size, attr);
-  ptr->RegisterAllocatorChain(this);
+  ptr->RegisterDecoratedAllocator(this);
   return AllocationPtr(ptr);
 }
 
 void Allocator::FreeImpl(Allocation* allocation) {
-  Allocator* allocator = allocation->TopAllocator();
+  Allocator* allocator = allocation->TopDecoratedAllocator();
   allocator->Free(allocation);
 }
 
 void Allocator::Free(Allocation* allocation) {
-  allocation->PopAllocator();
+  allocation->PopDecoratedAllocator();
   FreeImpl(allocation);
 }
 
 const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
 
 void AllocationDeleter::operator()(Allocation* allocation) const {
-  Allocator* allocator = allocation->TopAllocator();
+  Allocator* allocator = allocation->TopDecoratedAllocator();
   allocator->Free(allocation);
 }
diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h
index fabd1ff57fedc7376bc8d4dd48166607cd73b59a..3497e46516f7cc0897ef697b6fa1abed23a1cd51 100644
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -15,8 +15,9 @@
 #pragma once
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
-#include "paddle/fluid/framework/inlined_stack.h"
+#include "paddle/fluid/framework/inlined_vector.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
@@ -78,29 +79,26 @@ class Allocation {
 
   virtual ~Allocation();
 
-  // This function should only be used in unittest
-  std::vector<Allocator*> GetAllocatorChain() const {
-    std::vector<Allocator*> allocators;
-    for (size_t i = 0; i < allocator_chain_.size(); ++i) {
-      allocators.push_back(allocator_chain_[i]);
-    }
-    return allocators;
+ private:
+  std::vector<Allocator*> DecoratedAllocators() const {
+    return static_cast<std::vector<Allocator*>>(decorated_allocators_);
   }
 
- private:
-  inline void RegisterAllocatorChain(Allocator* allocator) {
-    allocator_chain_.push(allocator);
+  inline void RegisterDecoratedAllocator(Allocator* allocator) {
+    decorated_allocators_.push_back(allocator);
   }
 
-  inline void PopAllocator() { allocator_chain_.pop(); }
+  inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); }
 
-  inline Allocator* TopAllocator() { return allocator_chain_.top(); }
+  inline Allocator* TopDecoratedAllocator() {
+    return decorated_allocators_.back();
+  }
 
  private:
   void* ptr_;
   size_t size_;
   platform::Place place_;
-  framework::InlinedStack allocator_chain_;
+  framework::InlinedVector decorated_allocators_;
 
   friend class Allocator;
   friend class AllocationDeleter;
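To make the renamed bookkeeping concrete: Allocate() registers each decorating allocator on the allocation, and Free() pops the most recently registered decorator before dispatching to it, so the wrappers unwind in reverse wrapping order. A toy model of that protocol (hypothetical types and driver, not Paddle code):

    #include <cstdio>
    #include <vector>

    struct Allocation {
      std::vector<const char*> decorated_allocators_;  // innermost first
    };

    void RegisterDecoratedAllocator(Allocation* a, const char* who) {
      a->decorated_allocators_.push_back(who);
    }

    void Free(Allocation* a) {
      // Pop the outermost decorator and let it free the allocation; it in
      // turn frees via its underlying allocator, unwinding the whole chain.
      while (!a->decorated_allocators_.empty()) {
        std::printf("freed by %s\n", a->decorated_allocators_.back());
        a->decorated_allocators_.pop_back();
      }
    }

    int main() {
      Allocation a;
      // Allocate path: CPUAllocator -> LockedAllocator -> RetryAllocator.
      RegisterDecoratedAllocator(&a, "CPUAllocator");
      RegisterDecoratedAllocator(&a, "LockedAllocator");
      RegisterDecoratedAllocator(&a, "RetryAllocator");
      Free(&a);  // prints RetryAllocator, LockedAllocator, CPUAllocator
      return 0;
    }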
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index b35032fb3c1d4436e94851b7731572834ce0cf37..0f7d5926f1f7ea7fa7d0c6909dea303371cdb78b 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -17,12 +17,13 @@
 #include <map>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/memory/allocation/aligned_allocator.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
+#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/auto_increment_allocator.h"
-#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/conditional_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
@@ -32,6 +33,7 @@
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 #include "paddle/fluid/memory/allocation/zero_size_allocator.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/memory/allocation/cuda_allocator.h"
@@ -51,6 +53,21 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+static inline std::shared_ptr<Allocator> WrapRetryAndBufferedAllocator(
+    std::shared_ptr<Allocator> allocator, int64_t retry_time,
+    bool enable_buffered) {
+  if (retry_time > 0) {
+    auto* retry_allocator =
+        new RetryAllocator(std::move(allocator), retry_time);
+    allocator.reset(retry_allocator);
+  }
+
+  if (enable_buffered) {
+    allocator.reset(new MultiBinBufferedAllocator(allocator));
+  }
+  return allocator;
+}
+
 // TODO(yy): Dirty code here. This class should be configurable in runtime.
 class CPUManagedAllocator : public Allocator {
  public:
@@ -117,17 +134,10 @@ class ChunkedAllocator : public Allocator {
     std::shared_ptr<Allocator> allocator(new LockedAllocator(
         std::shared_ptr<Allocator>(new BestFitAllocator(allocation))));
 
-    if (retry_time_ > 0) {
-      auto* retry_allocator =
-          new RetryAllocator(std::move(allocator), retry_time_);
-      allocator.reset(retry_allocator);
-    }
+    allocator = WrapRetryAndBufferedAllocator(allocator, retry_time_,
+                                              FLAGS_enable_buffered_allocator);
 
-    if (FLAGS_enable_buffered_allocator) {
-      allocator.reset(new MultiBinBufferedAllocator(allocator));
-    }
-
-    return std::make_shared<AlignedAllocator<64u>>(std::move(allocator));
+    return std::make_shared<AlignedAllocator<64u>>(std::move(allocator));
   }
 
   bool IsAllocThreadSafe() const override { return true; }
 
@@ -210,7 +220,7 @@ class AllocatorFacadePrivate {
         break;
       }
       case AllocatorStrategy::kAutoGrowthBestFit: {
-        InitCPUAllocator();
+        InitAutoGrowthCPUAllocator();
         InitAutoGrowthCUDAAllocator();
         InitAutoGrowthCUDAPinnedAllocator();
         WrapZeroSizeAllocator();
@@ -224,15 +234,25 @@ class AllocatorFacadePrivate {
   }
 
  private:
+  void InitAutoGrowthCPUAllocator() {
+    auto cpu_allocator = std::make_shared<AlignedAllocator<4096>>(
+        std::make_shared<CPUAllocator>());
+    allocators_[platform::CPUPlace()] =
+        std::make_shared<AutoGrowthBestFitAllocator>(
+            cpu_allocator, platform::CpuMaxChunkSize(), 4096);
+  }
+
   void InitAutoGrowthCUDAAllocator() {
 #ifdef PADDLE_WITH_CUDA
     int dev_cnt = platform::GetCUDADeviceCount();
     for (int dev_id = 0; dev_id < dev_cnt; ++dev_id) {
       auto cuda_allocator = std::make_shared<AlignedAllocator<4096>>(
           std::make_shared<CUDAAllocator>(platform::CUDAPlace(dev_id)));
-      allocators_[platform::CUDAPlace(dev_id)] =
-          std::make_shared<AutoIncrementBestFitAllocator>(
-              cuda_allocator, platform::GpuMaxChunkSize(), 4096);
+      auto allocator = std::make_shared<AutoGrowthBestFitAllocator>(
+          cuda_allocator, platform::GpuMaxChunkSize(), 4096);
+
+      allocators_[platform::CUDAPlace(dev_id)] = WrapRetryAndBufferedAllocator(
+          allocator, FLAGS_gpu_allocator_retry_time, false);
     }
 #endif
   }
 
@@ -242,7 +262,7 @@ class AllocatorFacadePrivate {
     auto cuda_pinned_allocator = std::make_shared<AlignedAllocator<4096>>(
         std::make_shared<CPUPinnedAllocator>());
     allocators_[platform::CUDAPinnedPlace()] =
-        std::make_shared<AutoIncrementBestFitAllocator>(
+        std::make_shared<AutoGrowthBestFitAllocator>(
            cuda_pinned_allocator, platform::CUDAPinnedMaxChunkSize(), 4096);
 #endif
   }
 
@@ -300,8 +320,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
 
 std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size, Allocator::Attr attr) {
-  return std::shared_ptr<Allocation>(Alloc(place, size, attr).release(),
-                                     AllocationDeleter());
+  return std::shared_ptr<Allocation>(Alloc(place, size, attr));
 }
 
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
Enum in [legacy, naive_best_fit, auto_growth_best_fit]."); namespace paddle { namespace memory { @@ -28,7 +30,7 @@ namespace allocation { static AllocatorStrategy GetStrategyFromFlag() { if (FLAGS_allocator_strategy == "legacy") { return AllocatorStrategy::kLegacy; - } else if (FLAGS_allocator_strategy == "navie_best_fit") { + } else if (FLAGS_allocator_strategy == "naive_best_fit") { return AllocatorStrategy::kNaiveBestFit; } else if (FLAGS_allocator_strategy == "auto_growth_best_fit") { return AllocatorStrategy::kAutoGrowthBestFit; diff --git a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc similarity index 92% rename from paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.cc rename to paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index ee52b10aa613e61544935a485b6d53fed3c903c9..3d901e04d036d7cd10dbfb2f1576636545feafef 100644 --- a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h" +#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" #include #include #include @@ -29,16 +29,14 @@ static size_t align(size_t size, size_t alignment) { return remaining == 0 ? size : size + alignment - remaining; } -AutoIncrementBestFitAllocator::AutoIncrementBestFitAllocator( +AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( const std::shared_ptr &underlying_allocator, size_t chunk_size, size_t alignment) : underlying_allocator_(underlying_allocator), chunk_size_(align(chunk_size, alignment)), alignment_(alignment) {} -Allocation *AutoIncrementBestFitAllocator::AllocateImpl(size_t size, - Attr attr) { - if (size == 0) return nullptr; +Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size, Attr attr) { size = align(size, alignment_); std::lock_guard guard(mtx_); auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr)); @@ -95,7 +93,7 @@ Allocation *AutoIncrementBestFitAllocator::AllocateImpl(size_t size, return new Chunk::BlockAllocation(block_it); } -void AutoIncrementBestFitAllocator::FreeImpl(Allocation *allocation) { +void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { auto &block_it = static_cast(allocation)->block_it_; auto &blocks = block_it->chunk_->blocks_; diff --git a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h similarity index 96% rename from paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h rename to paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index 6e569c2627cf752d89b9d37c3c546215ec3df223..f60dad8112dc149fb38da0e026049e60a1d98b01 100644 --- a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -25,9 +25,9 @@ namespace paddle { namespace memory { namespace allocation { -class AutoIncrementBestFitAllocator : public Allocator { +class AutoGrowthBestFitAllocator : public Allocator { public: - explicit AutoIncrementBestFitAllocator( + explicit AutoGrowthBestFitAllocator( const std::shared_ptr &underlying_allocator, size_t chunk_size, size_t alignment); diff --git 
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8b8fb5d93881271bd684b01319fc6f5de0c3f190
--- /dev/null
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc
@@ -0,0 +1,96 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+
+#ifdef PADDLE_WITH_CUDA
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_int64(gpu_allocator_retry_time);
+#endif
+
+DECLARE_string(allocator_strategy);
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+static inline size_t AlignTo(size_t size, size_t alignment = 4096) {
+  auto remaining = size % alignment;
+  return remaining == 0 ? size : size + alignment - remaining;
+}
+
+TEST(allocator, allocator) {
+#ifdef PADDLE_WITH_CUDA
+  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+  FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
+#endif
+
+  FLAGS_allocator_strategy = "auto_growth_best_fit";
+
+  auto &instance = AllocatorFacade::Instance();
+  platform::Place place;
+  size_t size = 1024;
+
+  {
+    place = platform::CPUPlace();
+    size = 1024;
+    auto cpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(cpu_allocation, nullptr);
+    ASSERT_NE(cpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(cpu_allocation->place(), place);
+    ASSERT_EQ(cpu_allocation->size(), AlignTo(size));
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    place = platform::CUDAPlace(0);
+    size = 1024;
+    auto gpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), AlignTo(size));
+  }
+
+  {
+    // Allocate 2GB gpu memory
+    place = platform::CUDAPlace(0);
+    size = 2 * static_cast<size_t>(1 << 30);
+    auto gpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), AlignTo(size));
+  }
+
+  {
+    place = platform::CUDAPinnedPlace();
+    size = (1 << 20);
+    auto cuda_pinned_allocation =
+        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
+    ASSERT_NE(cuda_pinned_allocation, nullptr);
+    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
+    ASSERT_EQ(cuda_pinned_allocation->place(), place);
+    ASSERT_GE(cuda_pinned_allocation->size(), AlignTo(size));
+  }
+#endif
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
similarity index 85%
rename from paddle/fluid/memory/allocation/auto_increment_best_fit_allocator_test.cc
rename to paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
index c5fb209279652f20f4f72cbf168dd09862ffc55b..087eb8c9cc56c87d481462f109f74088a3263c5c 100644
--- a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
@@ -22,18 +22,18 @@
 
 #include <gtest/gtest.h>
 
-#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
+#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 
 namespace paddle {
 namespace memory {
 namespace allocation {
 
-TEST(allocator, auto_increment_best_fit_allocator) {
+TEST(allocator, auto_growth_best_fit_allocator) {
   auto cpu_allocator = std::make_shared<CPUAllocator>();
 
   auto allocator =
-      std::make_shared<AutoIncrementBestFitAllocator>(cpu_allocator, 0, 4096);
+      std::make_shared<AutoGrowthBestFitAllocator>(cpu_allocator, 0, 4096);
 
   std::mutex mtx;
   std::condition_variable cv;
@@ -60,13 +60,9 @@ TEST(allocator, auto_increment_best_fit_allocator) {
   }
   cv.notify_all();
 
-  thread_main();
-
   for (auto &th : ths) {
     th.join();
   }
-
-  std::cout << "test ends" << std::endl;
 }
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc
index 7b2138cf34ce1dc5358d4494a519b07b09608f6c..854a117b0e7532962d5e0c95fd947527ac3b307a 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/memory/allocation/buffered_allocator.h"
 #include <gtest/gtest.h>
+#include <utility>
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index 1c42994bece1de3f7aa23b4056c3b1d8e4024fe5..0fd68b2a2242ea9c4a9d74571c6ce9cd6f5bf0a7 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -37,8 +37,6 @@ DEFINE_bool(init_allocated_mem, false,
             "that initializing the allocated memory with a small value "
             "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_double(initial_gpu_memory_in_mb);
-DECLARE_double(reallocate_gpu_memory_in_mb);
 DECLARE_bool(benchmark);
 
 namespace paddle {
@@ -72,8 +70,7 @@ BuddyAllocator *GetCPUBuddyAllocator() {
   std::call_once(init_flag, []() {
     a = new detail::BuddyAllocator(
         std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
-        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize(),
-        platform::CpuMaxChunkSize());
+        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
   });
 
   return a;
variable '" - << "FLAGS_fraction_of_gpu_memory_to_use" - << "' or " - "'FLAGS_initial_gpu_memory_in_mb/" - "FLAGS_reallocate_gpu_memory_in_mb' to change the fraction " - "of GPU usage.\n\n"; - VLOG(2) << "Currently, FLAGS_fraction_of_gpu_memory_to_use=" - << FLAGS_fraction_of_gpu_memory_to_use << ", " - << "FLAGS_initial_gpu_memory_in_mb=" - << FLAGS_initial_gpu_memory_in_mb << ", " - << "FLAGS_reallocate_gpu_memory_in_mb=" - << FLAGS_reallocate_gpu_memory_in_mb; + allocators_[dev_id] = new BuddyAllocator( + std::unique_ptr( + new detail::GPUAllocator(dev_id)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + VLOG(10) << "\n\nNOTE: each GPU device use " + << FLAGS_fraction_of_gpu_memory_to_use * 100 + << "% of GPU memory.\n" + << "You can set GFlags environment variable '" + << "FLAGS_fraction_of_gpu_memory_to_use" + << "' to change the fraction of GPU usage.\n\n"; }); return allocators_[dev_id]; } @@ -251,7 +236,6 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() { ba = new BuddyAllocator(std::unique_ptr( new detail::CUDAPinnedAllocator), platform::CUDAPinnedMinChunkSize(), - platform::CUDAPinnedMaxChunkSize(), platform::CUDAPinnedMaxChunkSize()); }); diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc index 03a17814e1a6f5285cec7f12e6840bc4f08d9a27..c43099cc88f839ad92d36774d49aafd7192f916f 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.cc +++ b/paddle/fluid/memory/allocation/locked_allocator.cc @@ -14,8 +14,10 @@ #include "paddle/fluid/memory/allocation/locked_allocator.h" #include // NOLINT +#include #include "paddle/fluid/memory/allocation/allocation_with_underlying.h" #include "paddle/fluid/platform/lock_guard_ptr.h" + namespace paddle { namespace memory { namespace allocation { diff --git a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc index 3acb17e4a0d392ddc9fae6ed500bd00d905b608e..c649a7161e187c99a35d059534cc847715b6e78b 100644 --- a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc +++ b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc @@ -17,20 +17,37 @@ #include #include #include +#include // NOLINT #include #include +#include #include "paddle/fluid/platform/lock_guard_ptr.h" -DEFINE_double(buffered_allocator_excess_times, 2, - "Tolerant memory size times of buffered_allocator"); +DEFINE_double( + buffered_allocator_excess_times, 2, + "Excess memory size times of buffered_allocator. BufferedAllocator" + " would try to reuse memory freed previously, but the size of freed" + " allocation may not be exactly the same as the requested. Here, we" + " use a flag to control the excess times of reused memory size. " + "Not quite sure what is the best excess times value."); -DEFINE_string(division_plan_path, "", "Division plan file path"); +DEFINE_string( + buffered_allocator_division_plan_path, "", + "The file path which " + "determines the memory size division plans of BufferedAllocator." + "If it is empty, use the default division plan. The file must be a " + "text file which each lines indicates the bound of division plan. " + "For example, if the text file has 3 lines, which are '500M', '1G', " + " '2G', the division plan would be [0, 500M), [500M, 1G), [1G, 2G) " + "and [2G, +inf). 
Allocation request whose requested memory size is " + "inside the last interval of division plan would be dispatched to " + " underlying_allocator directly without caching when freed."); namespace paddle { namespace memory { namespace allocation { -std::string TrimStringAndToLowerCase(const std::string &str) { +static std::string TrimStringAndToUpperCase(const std::string &str) { auto not_space = [](char ch) { return std::isspace(ch) == 0; }; auto first_idx = static_cast( std::find_if(str.begin(), str.end(), not_space) - str.begin()); @@ -38,41 +55,69 @@ std::string TrimStringAndToLowerCase(const std::string &str) { std::find_if(str.rbegin(), str.rend(), not_space) - str.rbegin()); if (first_idx == str.size() || last_idx == str.size()) return ""; - last_idx = str.size() - 1 - last_idx; + last_idx = str.size() - last_idx; auto ret = str.substr(first_idx, last_idx - first_idx); std::for_each(ret.begin(), ret.end(), - [](char &ch) { ch = std::tolower(ch); }); + [](char &ch) { ch = std::toupper(ch); }); return ret; } -static size_t ParseStringToBytes(const std::string &str) { - std::string ret = str; - if (ret.back() == 'b') { - ret.pop_back(); +namespace { + +enum DivisionPlanFileStatus { kEOF, kException, kNormal }; + +} // NOLINT + +static size_t ParseStringToBytes(const std::string &original_str, + DivisionPlanFileStatus *ret_code) { + std::string str = TrimStringAndToUpperCase(original_str); + + if (str.empty()) { + *ret_code = kEOF; + return 0; + } + + if (str.back() == 'B') { + str.pop_back(); + if (str.empty()) { + *ret_code = kException; + return 0; + } } - PADDLE_ENFORCE(!ret.empty(), "Wrong format: %s", str); size_t multiples = 1; - switch (ret.back()) { - case 'g': + switch (str.back()) { + case 'G': multiples *= (static_cast(1) << 30); break; - case 'm': + case 'M': multiples *= (static_cast(1) << 20); break; - case 'k': + case 'K': multiples *= (static_cast(1) << 10); break; default: break; } - if (multiples != 1) ret.pop_back(); - ret = TrimStringAndToLowerCase(ret); - double ret_val = 0.0; - std::stringstream ss(ret); - PADDLE_ENFORCE((ss >> ret_val).good(), "Wrong format %s", str); - return static_cast(ret_val * multiples); + if (multiples != 1) { + str.pop_back(); + if (str.empty()) { + *ret_code = kException; + return 0; + } + } + + str = TrimStringAndToUpperCase(str); + double mem_val = -1.0; + std::stringstream ss(str); + if (!(ss >> mem_val) || mem_val < 0) { + *ret_code = kException; + return 0; + } + + *ret_code = kNormal; + return static_cast(mem_val * multiples); } static std::string GetDebugStringOfPlan(const std::vector &plan) { @@ -84,16 +129,27 @@ static std::string GetDebugStringOfPlan(const std::vector &plan) { return ret + "]"; } -static std::vector ReadDivisionPlanFromFile( +std::vector ReadBufferedAllocatorDivisionPlanFromFile( const std::string &filepath) { std::ifstream is(filepath.c_str()); - PADDLE_ENFORCE(is.good(), "File not exist"); + PADDLE_ENFORCE(is.good(), "File %s not exist", filepath); std::string str; std::vector plan; + size_t line_num = 1; while (std::getline(is, str).good()) { - str = TrimStringAndToLowerCase(str); - if (str.empty()) break; - plan.push_back(ParseStringToBytes(str)); + DivisionPlanFileStatus status; + size_t ret = ParseStringToBytes(str, &status); + if (status == kEOF) { + break; + } + if (status == kException) { + PADDLE_THROW( + "Invalid format in line %d of file %s: '%s'. 
Only support B, KB, MB, " + "GB.", + line_num, filepath, str); + } + plan.push_back(ret); + ++line_num; } return plan; } @@ -110,11 +166,12 @@ static void CheckAndModifyMemoryDivisionPlan( } PADDLE_ENFORCE(is_strictly_sorted, "Divison plan must be stricted sorted"); - // Insert 0 and remove MAX to disivion plan for clean binary searching code + // Insert 0 to disivion plan for clean binary searching code if (division_plan->empty() || division_plan->front() != 0) { division_plan->insert(division_plan->begin(), 0); } + // Remove MAX from disivion plan for clean binary searching code constexpr auto kSizeTypeMax = std::numeric_limits::max(); if (division_plan->back() == kSizeTypeMax) { division_plan->pop_back(); @@ -124,21 +181,17 @@ static void CheckAndModifyMemoryDivisionPlan( } static std::vector GetDefaultDivisionPlan() { - if (!FLAGS_division_plan_path.empty()) { - return ReadDivisionPlanFromFile(FLAGS_division_plan_path); + if (!FLAGS_buffered_allocator_division_plan_path.empty()) { + return ReadBufferedAllocatorDivisionPlanFromFile( + FLAGS_buffered_allocator_division_plan_path); } + // Default division plan is 4K, 8K, 16K, ..., 500M, 1G constexpr size_t kMaxLogSize = 30; - std::vector plan; for (size_t i = 12; i <= kMaxLogSize; ++i) { plan.push_back(static_cast(1) << i); } - /* - for (size_t i = 0; i < sizeof(size_t) * 8; ++i) { - plan.push_back(static_cast(1) << i); - } - */ return plan; } @@ -164,6 +217,7 @@ MultiBinBufferedAllocator::MultiBinBufferedAllocator( division_plan_(division_plan) { CheckAndModifyMemoryDivisionPlan(&division_plan_); allocations_.resize(division_plan_.size() - 1); + accumulated_cache_size_.assign(division_plan_.size() - 1, 0UL); mtx_.resize(division_plan_.size() - 1); if (underlying_allocator_->IsAllocThreadSafe()) { for (auto &mtx : mtx_) { @@ -182,28 +236,22 @@ void MultiBinBufferedAllocator::FreeImpl(Allocation *allocation) { platform::LockGuardPtr guard(mtx_[bin_index]); allocations_[bin_index].emplace(allocation->size(), AllocationPtr(allocation)); + accumulated_cache_size_[bin_index] += allocation->size(); } else { underlying_allocator_->Free(allocation); } } -// bin_index is not used currently. // Maybe we can design more flexible FreeCache strategy based on bin_index -size_t MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) { +// and require size. 
+size_t MultiBinBufferedAllocator::ClearCache() {
   size_t accumulated_size = 0;
   // FIXME(zjl): free the largest first when there is no extra
   for (size_t i = allocations_.size() - 1; i != static_cast<size_t>(-1); --i) {
     platform::LockGuardPtr<std::mutex> lock(mtx_[i]);
-    if (allocations_[i].empty()) continue;
-    auto it = --allocations_[i].end();
-    do {
-      accumulated_size += it->second->size();
-      underlying_allocator_->Free(it->second.release());
-      allocations_[i].erase(it--);
-      if (accumulated_size >= size) {
-        return accumulated_size;
-      }
-    } while (!allocations_[i].empty());
+    allocations_[i].clear();
+    accumulated_size += accumulated_cache_size_[i];
+    accumulated_cache_size_[i] = 0;
   }
   return accumulated_size;
 }
 
@@ -212,10 +260,6 @@ Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
   auto bin_index = FindDivisionPlanBinIndex(division_plan_, size);
   auto upper_size = TolerantUpperSize(size);
 
-  // if (bin_index >= allocations_.size()) {
-  //  VLOG(2) << "Allocate " << size << " from underlying directly";
-  //}
-
   for (; bin_index < allocations_.size() &&
          upper_size >= division_plan_[bin_index];
        ++bin_index) {
@@ -226,6 +270,7 @@ Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
       size_t sz = it->second->size();
       auto ret = std::move(it->second);
       allocation.erase(it);
+      accumulated_cache_size_[bin_index] -= sz;
       VLOG(3) << "Allocate " << sz << "(required " << size
               << ") from cache directly";
       return ret.release();
@@ -239,10 +284,7 @@ Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
       VLOG(2) << "Allocate " << size << " from underlying directly";
       return ret;
     } catch (BadAlloc &) {
-      VLOG(1) << retry_time << "-th BadAlloc raises, try to free " << size
-              << " bytes caches";
-      // size_t actual_free_size = FreeCache(size, bin_index);
-      size_t actual_free_size = FreeCache(-1UL, bin_index);
+      size_t actual_free_size = ClearCache();
       VLOG(1) << retry_time << "-th free " << actual_free_size
               << " bytes caches";
       if (actual_free_size == 0) throw;
@@ -251,6 +293,8 @@ Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
   }
 }
 
+void UseMultiBinBufferedAllocatorGFlags() {}
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
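For concreteness, a tiny standalone sketch of the unit arithmetic behind ParseStringToBytes above (a re-implementation for illustration only; the real parser additionally trims whitespace, upper-cases the input, strips an optional trailing 'B', and reports kEOF/kException through ret_code):

    #include <cassert>
    #include <cstddef>

    // Hypothetical mirror of the unit handling: "1.5M" -> 1.5 * 2^20, etc.
    static size_t ToBytes(double value, char unit) {
      switch (unit) {
        case 'G': return static_cast<size_t>(value * (1ULL << 30));
        case 'M': return static_cast<size_t>(value * (1ULL << 20));
        case 'K': return static_cast<size_t>(value * (1ULL << 10));
        default:  return static_cast<size_t>(value);  // plain bytes
      }
    }

    int main() {
      // A plan file containing "500M\n1G\n2G" yields the bins
      // [0, 500M), [500M, 1G), [1G, 2G) and [2G, +inf).
      assert(ToBytes(500, 'M') == 500ULL << 20);
      assert(ToBytes(1, 'G') == 1ULL << 30);
      assert(ToBytes(300.7, 'K') == static_cast<size_t>(300.7 * 1024));
      return 0;
    }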
diff --git a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h
index f550f76e50c3856cab2573210395bef1e26ebd17..b93f4c062b481587aba37d9219ce8a7f516c2062 100644
--- a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h
+++ b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h
@@ -16,6 +16,8 @@
 
 #include <map>
 #include <memory>
+#include <mutex>  // NOLINT
+#include <string>
 #include <vector>
 #include "paddle/fluid/memory/allocation/allocator.h"
 
@@ -24,6 +26,9 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+std::vector<size_t> ReadBufferedAllocatorDivisionPlanFromFile(
+    const std::string& filepath);
+
 class MultiBinBufferedAllocator : public Allocator {
  public:
   explicit MultiBinBufferedAllocator(
@@ -34,21 +39,24 @@ class MultiBinBufferedAllocator : public Allocator {
 
   bool IsAllocThreadSafe() const override { return mtx_.front() != nullptr; }
 
-  void ClearCache() { FreeCache(static_cast<size_t>(-1), 0); }
+  size_t ClearCache();
+
+  const std::vector<size_t>& DivisionPlan() const { return division_plan_; }
 
  protected:
   Allocation* AllocateImpl(size_t size, Attr attr) override;
   void FreeImpl(Allocation* allocation) override;
 
 private:
-  size_t FreeCache(size_t size, size_t bin_index);
-
   std::shared_ptr<Allocator> underlying_allocator_;
   std::vector<std::multimap<size_t, AllocationPtr>> allocations_;
+  std::vector<size_t> accumulated_cache_size_;
   std::vector<size_t> division_plan_;
   std::vector<std::unique_ptr<std::mutex>> mtx_;
 };
 
+extern void UseMultiBinBufferedAllocatorGFlags();
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator_test.cc b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator_test.cc
index 22787a8512338c9e2045f3e482171656811dd0eb..be5dfba6448799fa0011491785988060ed5d2f9c 100644
--- a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator_test.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
 #include <gtest/gtest.h>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
@@ -123,10 +124,31 @@ TEST(buffered_allocator, lazy_free) {
 
   {
     underlying_allocator->ResetCounter();
-    allocator->ClearCache();
+    size_t cache_size = allocator->ClearCache();
+    ASSERT_EQ(cache_size, static_cast<size_t>(alloc_size + 2048));
     ASSERT_EQ(underlying_allocator->GetAllocCount(), kZero);
     ASSERT_EQ(underlying_allocator->GetFreeCount(), kTwo);
   }
+
+  {
+    underlying_allocator->ResetCounter();
+    auto p = allocator->Allocate(allocator->DivisionPlan().back(),
+                                 allocator->kDefault);
+    ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne);
+    ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
+  }
+
+  ASSERT_EQ(underlying_allocator->GetFreeCount(), kOne);
+
+  {
+    underlying_allocator->ResetCounter();
+    auto p = allocator->Allocate(allocator->DivisionPlan().back() - 1,
+                                 allocator->kDefault);
+    ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne);
+    ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
+  }
+
+  ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
 }
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6952c19092843290f98baa7e6fbce623bae05f79
--- /dev/null
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+
+#ifdef PADDLE_WITH_CUDA
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_int64(gpu_allocator_retry_time);
+#endif
+
+DECLARE_bool(enable_buffered_allocator);
+
+DECLARE_string(allocator_strategy);
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+TEST(allocator, allocator) {
+#ifdef PADDLE_WITH_CUDA
+  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+  FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
+#endif
+
+  FLAGS_allocator_strategy = "naive_best_fit";
+  FLAGS_enable_buffered_allocator = true;
+
+  auto &instance = AllocatorFacade::Instance();
+  platform::Place place;
+  size_t size = 1024;
+
+  {
+    place = platform::CPUPlace();
+    size = 1024;
+    auto cpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(cpu_allocation, nullptr);
+    ASSERT_NE(cpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(cpu_allocation->place(), place);
+    ASSERT_EQ(cpu_allocation->size(), size);
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    place = platform::CUDAPlace(0);
+    size = 1024;
+    auto gpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
+  }
+
+  {
+    // Allocate 2GB gpu memory
+    place = platform::CUDAPlace(0);
+    size = 2 * static_cast<size_t>(1 << 30);
+    auto gpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
+  }
+
+  {
+    place = platform::CUDAPinnedPlace();
+    size = (1 << 20);
+    auto cuda_pinned_allocation =
+        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
+    ASSERT_NE(cuda_pinned_allocation, nullptr);
+    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
+    ASSERT_EQ(cuda_pinned_allocation->place(), place);
+    ASSERT_GE(cuda_pinned_allocation->size(), size);
+  }
+#endif
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h
index 70b9c2ba1d6f91525fa87d8f1d442da58b641f2f..379f576d6e1ed8f256a0233b203423a487ee73e4 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.h
+++ b/paddle/fluid/memory/allocation/retry_allocator.h
@@ -18,6 +18,7 @@
 #include <condition_variable>  // NOLINT
 #include <memory>
 #include <mutex>  // NOLINT
+#include <utility>
 #include "paddle/fluid/memory/allocation/allocator.h"
 
 namespace paddle {
diff --git a/paddle/fluid/memory/allocation/test_multi_bin_buffered_allocator_division_plan.cc b/paddle/fluid/memory/allocation/test_multi_bin_buffered_allocator_division_plan.cc
new file mode 100644
index 0000000000000000000000000000000000000000..15daa8413f1f8a2f6aaaeea60dc78ad54ea0e014
--- /dev/null
+++ b/paddle/fluid/memory/allocation/test_multi_bin_buffered_allocator_division_plan.cc
@@ -0,0 +1,56 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fstream>
+#include <string>
+#include <vector>
+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
+
+DECLARE_string(buffered_allocator_division_plan_path);
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+TEST(buffered_allocator, division_plan) {
+  std::string path = "/tmp/buffered_allocator_division_plan";
+  FLAGS_buffered_allocator_division_plan_path = path;
+
+  {
+    std::vector<std::string> plan(
+        {"100b", "300.7K", "500.3m", "1.02gB", "2g", "4G"});
+
+    std::ofstream os(path);
+    for (auto &p : plan) {
+      os << p << std::endl;
+    }
+    os.close();
+  }
+
+  auto plan = ReadBufferedAllocatorDivisionPlanFromFile(
+      FLAGS_buffered_allocator_division_plan_path);
+  ASSERT_EQ(plan.size(), 6UL);
+  ASSERT_EQ(plan[0], 100UL);
+  ASSERT_EQ(plan[1], static_cast<size_t>(300.7 * 1024));
+  ASSERT_EQ(plan[2], static_cast<size_t>(500.3 * 1024 * 1024));
+  ASSERT_EQ(plan[3], static_cast<size_t>(1.02 * 1024 * 1024 * 1024));
+  ASSERT_EQ(plan[4], static_cast<size_t>(2.0 * 1024 * 1024 * 1024));
+  ASSERT_EQ(plan[5], static_cast<size_t>(4.0 * 1024 * 1024 * 1024));
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
However, the // allocation.ptr() is nullptr -class ZeroSizeAllocation : public Allocation { - public: - explicit ZeroSizeAllocation(const platform::Place& p) - : Allocation(nullptr, 0, p) {} -}; - class ZeroSizeAllocator : public Allocator { public: ZeroSizeAllocator(std::shared_ptr underlying_allocator, diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 80d32ba564c3060fe2dc4e1a7eb499eda2c1e1d3..26ef27c3caafadb4801b0ae52133f6175655ce0a 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -25,11 +25,9 @@ namespace detail { BuddyAllocator::BuddyAllocator( std::unique_ptr system_allocator, size_t min_chunk_size, - size_t first_allocate_chunk_size, size_t reallocate_chunk_size) + size_t max_chunk_size) : min_chunk_size_(min_chunk_size), - first_allocate_chunk_size_(first_allocate_chunk_size), - reallocate_chunk_size_(reallocate_chunk_size), - max_chunk_size_(first_allocate_chunk_size), + max_chunk_size_(max_chunk_size), cache_(system_allocator->UseGpu()), system_allocator_(std::move(system_allocator)) {} @@ -38,10 +36,9 @@ BuddyAllocator::~BuddyAllocator() { "have actually been freed"; while (!pool_.empty()) { auto block = static_cast(std::get<2>(*pool_.begin())); - auto desc = cache_.load(block); - VLOG(10) << "Free from block (" << block << ", " << desc.size << ")"; + VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")"; - system_allocator_->Free(block, desc.size, desc.index); + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); pool_.erase(pool_.begin()); } @@ -66,7 +63,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // if the allocation is huge, send directly to the system allocator if (size > max_chunk_size_) { VLOG(10) << "Allocate from system allocator."; - return SystemAlloc(size, false); + return SystemAlloc(size); } // query and allocate from the existing chunk @@ -75,9 +72,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // refill the pool if failure if (it == pool_.end()) { it = RefillPool(); - // if still failure, try to allocate from SystemAllocator + // if still failure, fail fatally if (it == pool_.end()) { - return SystemAlloc(size, false); + return nullptr; } } else { VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it) @@ -101,7 +98,7 @@ void BuddyAllocator::Free(void* p) { VLOG(10) << "Free from address " << block; - if (block->type(cache_) == MemoryBlock::UNMANAGED_HUGE_CHUNK) { + if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { VLOG(10) << "Free directly from system allocator"; system_allocator_->Free(block, block->total_size(cache_), block->index(cache_)); @@ -171,12 +168,9 @@ void BuddyAllocator::Free(void* p) { size_t BuddyAllocator::Used() { return total_used_; } size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; } -size_t BuddyAllocator::GetMaxChunkSize() { - std::lock_guard lock(mutex_); - return max_chunk_size_; -} +size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; } -void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) { +void* BuddyAllocator::SystemAlloc(size_t size) { size_t index = 0; void* p = system_allocator_->Alloc(&index, size); @@ -184,23 +178,25 @@ void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) { if (p == nullptr) return nullptr; - static_cast(p)->init( - &cache_, is_managed ? 
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index 80d32ba564c3060fe2dc4e1a7eb499eda2c1e1d3..26ef27c3caafadb4801b0ae52133f6175655ce0a 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -25,11 +25,9 @@ namespace detail {
 
 BuddyAllocator::BuddyAllocator(
     std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
-    size_t first_allocate_chunk_size, size_t reallocate_chunk_size)
+    size_t max_chunk_size)
     : min_chunk_size_(min_chunk_size),
-      first_allocate_chunk_size_(first_allocate_chunk_size),
-      reallocate_chunk_size_(reallocate_chunk_size),
-      max_chunk_size_(first_allocate_chunk_size),
+      max_chunk_size_(max_chunk_size),
       cache_(system_allocator->UseGpu()),
       system_allocator_(std::move(system_allocator)) {}
 
@@ -38,10 +36,9 @@ BuddyAllocator::~BuddyAllocator() {
                 "have actually been freed";
   while (!pool_.empty()) {
     auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
-    auto desc = cache_.load(block);
-    VLOG(10) << "Free from block (" << block << ", " << desc.size << ")";
+    VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
 
-    system_allocator_->Free(block, desc.size, desc.index);
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
     cache_.invalidate(block);
     pool_.erase(pool_.begin());
   }
@@ -66,7 +63,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
   // if the allocation is huge, send directly to the system allocator
   if (size > max_chunk_size_) {
     VLOG(10) << "Allocate from system allocator.";
-    return SystemAlloc(size, false);
+    return SystemAlloc(size);
   }
 
   // query and allocate from the existing chunk
@@ -75,9 +72,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
   // refill the pool if failure
   if (it == pool_.end()) {
     it = RefillPool();
-    // if still failure, try to allocate from SystemAllocator
+    // if still failure, fail fatally
     if (it == pool_.end()) {
-      return SystemAlloc(size, false);
+      return nullptr;
     }
   } else {
     VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it)
@@ -101,7 +98,7 @@ void BuddyAllocator::Free(void* p) {
 
   VLOG(10) << "Free from address " << block;
 
-  if (block->type(cache_) == MemoryBlock::UNMANAGED_HUGE_CHUNK) {
+  if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
     VLOG(10) << "Free directly from system allocator";
     system_allocator_->Free(block, block->total_size(cache_),
                             block->index(cache_));
@@ -171,12 +168,9 @@ void BuddyAllocator::Free(void* p) {
 
 size_t BuddyAllocator::Used() { return total_used_; }
 size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
-size_t BuddyAllocator::GetMaxChunkSize() {
-  std::lock_guard<std::mutex> lock(mutex_);
-  return max_chunk_size_;
-}
+size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
 
-void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) {
+void* BuddyAllocator::SystemAlloc(size_t size) {
   size_t index = 0;
   void* p = system_allocator_->Alloc(&index, size);
 
@@ -184,23 +178,25 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
 
   if (p == nullptr) return nullptr;
 
-  static_cast<MemoryBlock*>(p)->init(
-      &cache_, is_managed ? MemoryBlock::MANAGED_HUGE_CHUNK
-                          : MemoryBlock::UNMANAGED_HUGE_CHUNK,
-      index, size, nullptr, nullptr);
+  static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index,
+                                     size, nullptr, nullptr);
 
   return static_cast<MemoryBlock*>(p)->data();
 }
 
 BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
-  if (total_used_ + total_free_ > 0) {
-    max_chunk_size_ = reallocate_chunk_size_;
+#ifdef PADDLE_WITH_CUDA
+  if (system_allocator_->UseGpu()) {
+    if ((total_used_ + total_free_) == 0) {
+      // Compute the maximum allocation size for the first allocation.
+      max_chunk_size_ = platform::GpuMaxChunkSize();
+    }
   }
+#endif
 
   // Allocate a new maximum sized block
   size_t index = 0;
-  size_t chunk_size = max_chunk_size_;
-  void* p = system_allocator_->Alloc(&index, chunk_size);
+  void* p = system_allocator_->Alloc(&index, max_chunk_size_);
 
   if (p == nullptr) return pool_.end();
 
@@ -208,7 +204,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
            << " from system allocator";
 
   static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
-                                     chunk_size, nullptr, nullptr);
+                                     max_chunk_size_, nullptr, nullptr);
 
   // gpu fallback allocation
   if (system_allocator_->UseGpu() &&
@@ -216,10 +212,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
     fallback_alloc_count_++;
   }
 
-  total_free_ += chunk_size;
+  total_free_ += max_chunk_size_;
 
   // dump the block into pool
-  return pool_.insert(IndexSizeAddress(index, chunk_size, p)).first;
+  return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
 }
 
 BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
@@ -275,24 +271,27 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
 
 void BuddyAllocator::CleanIdleFallBackAlloc() {
   // If fallback allocation does not exist, return directly
-  if (!fallback_alloc_count_ || !system_allocator_->UseGpu()) return;
+  if (!fallback_alloc_count_) return;
 
   for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
+    // If free memory block less than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
+
     MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
-    auto desc = cache_.load(block);
-    if (desc.index == 0) {
+    // If no GPU fallback allocator, return
+    if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
       return;
     }
 
     VLOG(10) << "Return block " << block << " to fallback allocator.";
 
-    system_allocator_->Free(block, desc.size, block->index(cache_));
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
     cache_.invalidate(block);
 
     pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
 
-    total_free_ -= desc.size;
+    total_free_ -= max_chunk_size_;
     fallback_alloc_count_--;
 
     // If no fall allocation exists, return directly
@@ -316,21 +315,19 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
   if (!shall_free_alloc()) return;
 
   for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
-    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
-    auto desc = cache_.load(block);
+    // If free memory block less than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
 
-    if (desc.type != MemoryBlock::MANAGED_HUGE_CHUNK) {
-      return;
-    }
+    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
 
     VLOG(10) << "Return block " << block << " to base allocator.";
 
-    system_allocator_->Free(block, desc.size, desc.index);
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
     cache_.invalidate(block);
 
     pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
-    total_free_ -= desc.size;
+    total_free_ -= max_chunk_size_;
 
     if (!shall_free_alloc()) return;
   }
 }
diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h
index 88d6f736a8f10e4481a061e22e6c75450c3038f9..3f86a51f0d0b8504bbc4b0477f123093b343e9cf 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -34,8 +34,7 @@ namespace detail {
 class BuddyAllocator {
  public:
   BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
-                 size_t min_chunk_size, size_t first_allocate_chunk_size,
-                 size_t reallocate_chunk_size);
+                 size_t min_chunk_size, size_t max_chunk_size);
 
   ~BuddyAllocator();
 
@@ -58,7 +57,7 @@ class BuddyAllocator {
   using PoolSet = std::set<IndexSizeAddress>;
 
   /*! \brief Allocate fixed-size memory from system */
-  void* SystemAlloc(size_t size, bool is_managed = true);
+  void* SystemAlloc(size_t size);
 
   /*! \brief If existing chunks are not suitable, refill pool */
   PoolSet::iterator RefillPool();
@@ -88,11 +87,7 @@ class BuddyAllocator {
   size_t total_free_ = 0;  // the total size of free memory
 
   size_t min_chunk_size_;  // the minimum size of each chunk
-
-  size_t first_allocate_chunk_size_;
-  size_t reallocate_chunk_size_;
-
-  size_t max_chunk_size_;
+  size_t max_chunk_size_;  // the maximum size of each chunk
 
 private:
  /**
diff --git a/paddle/fluid/memory/detail/memory_block.h b/paddle/fluid/memory/detail/memory_block.h
index 5e5ff5b849d795f36e3b53ae626617ae7eea2751..5cceba659beeec1b3c986dc43229f6725e3e11de 100644
--- a/paddle/fluid/memory/detail/memory_block.h
+++ b/paddle/fluid/memory/detail/memory_block.h
@@ -27,11 +27,10 @@ class MetadataCache;
 // MemoryBlock::Desc and the payload.
 struct MemoryBlock {
   enum Type {
-    FREE_CHUNK,            // memory is free and idle
-    ARENA_CHUNK,           // memory is being occupied
-    MANAGED_HUGE_CHUNK,    // memory is huge and out of management
-    UNMANAGED_HUGE_CHUNK,  // memory is huge and managed by allocator
-    INVALID_CHUNK          // memory is invalid
+    FREE_CHUNK,    // memory is free and idle
+    ARENA_CHUNK,   // memory is being occupied
+    HUGE_CHUNK,    // memory is out of management
+    INVALID_CHUNK  // memory is invalid
   };
 
   // init saves the MemoryBlock::Desc of the memory block in a MetadataCache.
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 9553298d5e6b315d510b6cdcc8ab30dd33a5b2c9..400a6d7bfa5912774c4bbb2a5868dd9a471afd00 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -38,22 +38,6 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
               "additional trunks of the same size will be requested from gpu "
               "until the gpu has no memory left for another trunk.");
 
-DEFINE_double(
-    initial_gpu_memory_in_mb, -1.0,
-    "GPU memory chunk size in MB."
-    "Allocator would allocate FLAGS_initial_gpu_memory_in_mb size "
-    "chunk first and reallocate FLAGS_reallocate_gpu_memory_in_mb size "
-    "chunk when the first chunk is not enough. This flag has higher priority "
-    "than FLAGS_fraction_of_gpu_memory_to_use. Disable when less than 0.");
-
-DEFINE_double(reallocate_gpu_memory_in_mb, -1.0,
-              "GPU memory chunk size in MB."
-              "If FLAGS_initial_gpu_memory_in_mb is set and "
-              "FLAGS_reallocate_gpu_memory_in_mb "
-              "is less than 0, it would be replaced by "
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 9553298d5e6b315d510b6cdcc8ab30dd33a5b2c9..400a6d7bfa5912774c4bbb2a5868dd9a471afd00 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -38,22 +38,6 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
               "additional trunks of the same size will be requested from gpu "
               "until the gpu has no memory left for another trunk.");
 
-DEFINE_double(
-    initial_gpu_memory_in_mb, -1.0,
-    "GPU memory chunk size in MB."
-    "Allocator would allocate FLAGS_initial_gpu_memory_in_mb size "
-    "chunk first and reallocate FLAGS_reallocate_gpu_memory_in_mb size "
-    "chunk when the first chunk is not enough. This flag has higher priority "
-    "than FLAGS_fraction_of_gpu_memory_to_use. Disable when less than 0.");
-
-DEFINE_double(reallocate_gpu_memory_in_mb, -1.0,
-              "GPU memory chunk size in MB."
-              "If FLAGS_initial_gpu_memory_in_mb is set and "
-              "FLAGS_reallocate_gpu_memory_in_mb "
-              "is less than 0, it would be replaced by "
-              "FLAGS_initial_gpu_memory_in_mb. Disable "
-              "when FLAGS_initial_gpu_memory_in_mb is less than 0.");
-
 DEFINE_bool(
     enable_cublas_tensor_op_math, false,
     "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
@@ -227,54 +211,13 @@ size_t GpuMaxChunkSize() {
 
   size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
                                           (total - reserving));
+
   PADDLE_ENFORCE_LE(allocating, available,
                     "Insufficient GPU memory to allocation.");
 
   return allocating;
 }
 
-size_t GpuFirstAllocateChunkSize() {
-  if (FLAGS_initial_gpu_memory_in_mb <= 0) {
-    return GpuMaxChunkSize();
-  }
-
-  size_t total = 0;
-  size_t available = 0;
-
-  GpuMemoryUsage(&available, &total);
-  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
-           << total / 1024 / 1024 << "M";
-
-  size_t initial_mem =
-      static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb * (1 << 20));
-  PADDLE_ENFORCE_LE(initial_mem, available,
-                    "Insufficient GPU memory to allocation.");
-  return initial_mem;
-}
-
-size_t GpuReAllocateChunkSize() {
-  if (FLAGS_initial_gpu_memory_in_mb <= 0) {
-    return GpuMaxChunkSize();
-  }
-
-  double reallocate_mem = FLAGS_reallocate_gpu_memory_in_mb;
-  if (reallocate_mem < 0) {
-    PADDLE_ENFORCE(FLAGS_initial_gpu_memory_in_mb > 0,
-                   "FLAGS_init_gpu_memory_to_use_mb must be larger than 0");
-    reallocate_mem = FLAGS_initial_gpu_memory_in_mb;
-  }
-
-  size_t total = 0;
-  size_t available = 0;
-  GpuMemoryUsage(&available, &total);
-  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
-           << total / 1024 / 1024 << "M";
-  size_t realloc_mem = static_cast<size_t>(reallocate_mem * (1 << 20));
-  PADDLE_ENFORCE_LE(realloc_mem, available,
-                    "Insufficient GPU memory to allocation.");
-  return realloc_mem;
-}
-
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                     enum cudaMemcpyKind kind, cudaStream_t stream) {
   PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index 7c05658851d0ea1118d706ed3810809e68593df4..1e1ab2503f53fe20bbe62c48f65d8535947f1aa8 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -66,12 +66,6 @@ size_t GpuMinChunkSize();
 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
 
-//! Get init chunk size for GPU buddy allocator.
-size_t GpuFirstAllocateChunkSize();
-
-//! Get reallocate chunk size for GPU buddy allocator.
-size_t GpuReAllocateChunkSize();
-
 //! Copy memory from address src to dst asynchronously.
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                     enum cudaMemcpyKind kind, cudaStream_t stream);
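With `GpuFirstAllocateChunkSize()` and `GpuReAllocateChunkSize()` removed, `GpuMaxChunkSize()` is once more the single source of the refill size. A standalone sketch of the arithmetic it performs follows; the 0.92 default for `FLAGS_fraction_of_gpu_memory_to_use` and the reserving figure below are illustrative assumptions, not values shown in the hunk above:

```cpp
#include <cstddef>
#include <cstdio>

// One chunk is a fixed fraction of the non-reserved device memory. Callers
// are expected to enforce that the result does not exceed the currently
// available bytes (PADDLE_ENFORCE_LE in the real code).
std::size_t MaxChunkSize(std::size_t total_bytes, std::size_t reserving_bytes,
                         double fraction_of_gpu_memory_to_use) {
  return static_cast<std::size_t>(fraction_of_gpu_memory_to_use *
                                  (total_bytes - reserving_bytes));
}

int main() {
  // e.g. a 16 GiB card with 512 MiB reserved:
  std::size_t chunk = MaxChunkSize(16ull << 30, 512ull << 20, 0.92);
  std::printf("%zu MiB\n", chunk >> 20);  // prints "14602 MiB"
  return 0;
}
```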
diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc
index 4e1056cfb9e3d8c50139db50b67491ce3b839fd3..ddde7baf4cf3b44ac5d8a22fcc98acef50294575 100644
--- a/paddle/fluid/platform/temporary_allocator.cc
+++ b/paddle/fluid/platform/temporary_allocator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/temporary_allocator.h"
+#include
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 
 DEFINE_int64(limit_of_tmp_allocation, -1,
diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h
index cead316ed94135f66eeb81d3bf911986660d3f43..912d45eaf17fe8c05840995275dd3e2e688b38ef 100644
--- a/paddle/fluid/platform/temporary_allocator.h
+++ b/paddle/fluid/platform/temporary_allocator.h
@@ -16,6 +16,7 @@
 #include  // NOLINT
 #include
 #include
+#include
 #include  // NOLINT
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 9e6b89f7459af7fbbef28f3d6395a00a0c1b1f1e..6f2e41c159942249a24ef97a485e02e534e9b35f 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -39,6 +39,7 @@ limitations under the License. */
 #include "paddle/fluid/imperative/profiler.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/memory/allocation/legacy_allocator.h"
+#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/py_func_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
@@ -133,6 +134,9 @@ PYBIND11_MODULE(core, m) {
   paddle::platform::CpuTotalPhysicalMemory();
 
   paddle::memory::allocation::UseAllocatorStrategyGFlag();
+
+  paddle::memory::allocation::UseMultiBinBufferedAllocatorGFlags();
+
   m.doc() = "C++ core of PaddlePaddle";
 
   // using framework in this function. Since it is inside a function, it will
diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h
index 16bb3771f2e9bcc07028ef2039fed8691f9aab97..66b768665b6d0b97b4ca1470020132bfc9576bbb 100644
--- a/paddle/fluid/string/printf.h
+++ b/paddle/fluid/string/printf.h
@@ -105,14 +105,12 @@ void Printf(const char* fmt, const Args&... args) {
   Fprintf(std::cout, fmt, args...);
 }
 
-template <typename T>
-std::string HumanReadableSize(T size) {
+inline std::string HumanReadableSize(double f_size) {
   size_t i = 0;
-  double f_size = static_cast<double>(size);
   double orig = f_size;
   const std::vector<std::string> units(
       {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"});
-  while (f_size > 1024) {
+  while (f_size >= 1024) {
     f_size /= 1024;
     i++;
   }
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 83003fc68b809b94d4f2362e55b3792d78d28ad8..ad2ce30ab5394d40a50e4e776d64cfd5feb64d36 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -130,7 +130,8 @@ def __bootstrap__():
         'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb',
         'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion',
         'allocator_strategy', 'enable_buffered_allocator',
-        'buffered_allocator_excess_times', 'reader_queue_speed_test_mode',
+        'buffered_allocator_excess_times',
+        'buffered_allocator_division_plan_path', 'reader_queue_speed_test_mode',
         'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
         'inner_op_parallelism', 'enable_parallel_graph',
         'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize',
@@ -163,7 +164,6 @@ def __bootstrap__():
 
     if core.is_compiled_with_cuda():
         read_env_flags += [
-            'initial_gpu_memory_in_mb', 'reallocate_gpu_memory_in_mb',
             'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
             'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
             'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',