diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index ddd9fe809853a830ca676cc98f1819f683866def..b534de4a9c07f05cc1ae7b2cabfd6785c482f174 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -168,7 +168,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
     startup_exe = fluid.Executor(place)
     startup_exe.run(startup_prog)
     strategy = fluid.ExecutionStrategy()
-    strategy.num_threads = args.cpus
+    strategy.num_threads = 0  # args.cpus
     strategy.allow_op_delay = False
     build_strategy = fluid.BuildStrategy()
     if args.reduce_strategy == "reduce":
@@ -187,6 +187,8 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
         num_trainers = 1
         trainer_id = 0

+    print('Use parallel_executor')
+    strategy.type = 2
     exe = fluid.ParallelExecutor(
         True,
         avg_loss.name,
diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py
index f692e7722a1c9a54a4509ce7c78cc68e1f28da74..947c497ce2b9becd1282b62187f743008985a17f 100644
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -172,7 +172,7 @@ def get_model(args, is_train, main_prog, startup_prog):
     reader, dshape, class_dim = _model_reader_dshape_classdim(args, is_train)

     pyreader = None
-    trainer_count = int(os.getenv("PADDLE_TRAINERS"))
+    trainer_count = int(os.getenv("PADDLE_TRAINERS", 1))
     with fluid.program_guard(main_prog, startup_prog):
         with fluid.unique_name.guard():
             if args.use_reader_op:
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 664b3460252aeb711cdab2a5940cd9987a26ed6a..5620b30f5a6466a23263c716663dfb7a84869636 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -48,8 +48,11 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS
         auto_increment_allocator
         zero_size_allocator
         conditional_allocator
+        retry_allocator
         cuda_device_guard)

 nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)

 cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator naive_managed_allocator best_fit_allocator locked_allocator cpu_allocator)
+
+cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade)
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc
index 98b4b035861fb3bfe1531ecbf780aef395789606..ffaeadcbdc6a8c8e390934355ee82a00e80c098b 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.cc
+++ b/paddle/fluid/memory/allocation/aligned_allocator.cc
@@ -26,6 +26,11 @@ std::shared_ptr<Allocation> ThinAlignedAllocator::AllocateShared(
     size_t size, Allocator::Attr attr) {
   return std::shared_ptr<Allocation>(Allocate(size, attr).release());
 }
+
+bool ThinAlignedAllocator::IsAllocThreadSafe() const {
+  return underlying_allocator_->IsAllocThreadSafe();
+}
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h
index 13c69c153a2155750fcf4edee151ba277d7a29b4..529943dc3da89187a0a360968f16292e62739f9a 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.h
+++ b/paddle/fluid/memory/allocation/aligned_allocator.h
@@ -77,6 +77,8 @@ class ThinAlignedAllocator : public ManagedAllocator {

   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override;

+  bool IsAllocThreadSafe() const;
+
  protected:
   std::shared_ptr<ManagedAllocator> underlying_allocator_;
 };
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 4f07c1610dc3ef45e5bff6df32a71d4af9c55243..02ea5d7e783d1bbfa0f9a2005dfc9e473ef8f4c6 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -13,7 +13,9 @@
 // limitations under the License.

 #include "paddle/fluid/memory/allocation/allocator.h"
+#include <gflags/gflags.h>
 #include <map>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/memory/allocation/aligned_allocator.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
@@ -24,6 +26,7 @@
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
 #include "paddle/fluid/memory/allocation/pinned_allocator.h"
+#include "paddle/fluid/memory/allocation/retry_allocator.h"
 #include "paddle/fluid/memory/allocation/zero_size_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/gpu_info.h"
@@ -32,6 +35,11 @@
 #include "paddle/fluid/memory/allocation/cuda_allocator.h"
 #endif

+DEFINE_int32(
+    gpu_allocator_retry_time, 0,
+    "The retry time (milliseconds) when allocator fails "
+    "to allocate memory. No retry if this value is not greater than 0");
+
 namespace paddle {
 namespace memory {
 namespace allocation {
@@ -60,6 +68,7 @@ class CPUManagedAllocator : public ManagedAllocator {
       return normal_allocator_->AllocateShared(size, attr);
     }
   }
+
   bool IsAllocThreadSafe() const override { return true; }

  private:
@@ -86,8 +95,12 @@ class CUDAManagedAllocator : public ManagedAllocator {
     size_t capacity = available / max_chunk_size_;

     if (capacity == 1) {
+      VLOG(10) << "Create BestFitAllocator with chunk_size "
+               << max_chunk_size_;
       default_allocator_ = BestFitAllocatorCreator();
     } else {
+      VLOG(10) << "Create AutoIncrementAllocator with chunk_size "
+               << max_chunk_size_ << " and capacity " << capacity;
       default_allocator_ = std::make_shared<AutoIncrementAllocator>(
           [this] { return std::move(BestFitAllocatorCreator()); }, capacity);
     }
@@ -116,6 +129,7 @@ class CUDAManagedAllocator : public ManagedAllocator {
   std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
     return default_allocator_->Allocate(size, attr);
   }
+
   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
     return default_allocator_->AllocateShared(size, attr);
   }
@@ -123,10 +137,20 @@ class CUDAManagedAllocator : public ManagedAllocator {
   std::shared_ptr<ManagedAllocator> BestFitAllocatorCreator() {
     chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_));
     auto* allocation = chunks_.back().get();
-    return std::make_shared<AlignedAllocator<64u>>(
-        NaiveManagedAllocator::Create(std::unique_ptr<Allocator>(
-            new LockedAllocator(std::unique_ptr<Allocator>(
-                new BestFitAllocator(allocation))))));
+    std::unique_ptr<Allocator> unmanaged_allocator(new LockedAllocator(
+        std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));
+
+    if (FLAGS_gpu_allocator_retry_time <= 0) {
+      VLOG(10) << "Create NaiveManagedAllocator without retry";
+      return std::make_shared<AlignedAllocator<64u>>(
+          NaiveManagedAllocator::Create(std::move(unmanaged_allocator)));
+    } else {
+      VLOG(10) << "Create RetryAllocator with retry_time "
+               << FLAGS_gpu_allocator_retry_time << "ms";
+      return std::make_shared<AlignedAllocator<64u>>(RetryAllocator::Create(
+          std::move(unmanaged_allocator),
+          static_cast<size_t>(FLAGS_gpu_allocator_retry_time)));
+    }
   }

   bool IsAllocThreadSafe() const override { return true; }
@@ -141,7 +165,8 @@

 class AllocatorFacadePrivate {
  public:
-  std::map<platform::Place, std::shared_ptr<ManagedAllocator>> allocators_;
+  std::unordered_map<platform::Place, std::shared_ptr<ManagedAllocator>>
+      allocators_;

   ~AllocatorFacadePrivate() = default;
@@ -184,13 +209,13 @@ AllocatorFacade& AllocatorFacade::Instance() {

 std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size, Allocator::Attr attr) {
-  return m_->allocators_[place]->AllocateShared(size, attr);
+  return m_->allocators_.at(place)->AllocateShared(size, attr);
 }

 std::unique_ptr<Allocation> AllocatorFacade::Alloc(const platform::Place& place,
                                                    size_t size,
                                                    Allocator::Attr attr) {
-  return m_->allocators_[place]->Allocate(size, attr);
+  return m_->allocators_.at(place)->Allocate(size, attr);
 }

 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/allocator_facade_test.cc b/paddle/fluid/memory/allocation/allocator_facade_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5185bf944460f6b36677bdd77c33048b79b6e6f2
--- /dev/null
+++ b/paddle/fluid/memory/allocation/allocator_facade_test.cc
@@ -0,0 +1,54 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_int32(gpu_allocator_retry_time);
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+TEST(allocator, allocator) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+  FLAGS_gpu_allocator_retry_time = 500;
+
+  auto &instance = AllocatorFacade::Instance();
+
+  {
+    auto cpu_allocation = instance.Alloc(platform::CPUPlace(), 1024);
+    ASSERT_NE(cpu_allocation, nullptr);
+  }
+
+  {
+    auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0), 1024);
+    ASSERT_NE(gpu_allocation, nullptr);
+  }
+
+  {
+    // Allocate 2GB gpu memory
+    auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0),
+                                         2 * static_cast<size_t>(1 << 30));
+    ASSERT_NE(gpu_allocation, nullptr);
+  }
+
+  {}
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h
index e3ee504f3d042d6a99036e34507c4c8bee306750..745a79014a7aa9fe16e4e8d637d944e971422f97 100644
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
@@ -13,6 +13,7 @@
 See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <functional>
 #include <iostream>
 #include <vector>
@@ -130,5 +131,65 @@ typename Visitor::result_type VisitPlace(const Place &place,
   return boost::apply_visitor(PlaceVisitorWrapper(visitor), place);
 }

+struct PlaceHashVisitor : public boost::static_visitor<size_t> {
+  template <typename Place>
+  inline size_t operator()(const Place &place) const {
+    return place.hash();
+  }
+};
+
 }  // namespace platform
 }  // namespace paddle
+
+namespace std {
+
+template <>
+struct hash<::paddle::platform::CPUPlace> {
+  using argument_type = ::paddle::platform::CPUPlace;
+  using result_type = size_t;
+
+  constexpr inline result_type operator()(const argument_type &place) const {
+    return static_cast<size_t>(-1);
+  }
+};
+
+template <>
+struct hash<::paddle::platform::CUDAPlace> {
+  using argument_type = ::paddle::platform::CUDAPlace;
+  using result_type = size_t;
+
+  inline result_type operator()(const argument_type &place) const {
+    return static_cast<size_t>(place.device);
+  }
+};
+
+template <>
+struct hash<::paddle::platform::CUDAPinnedPlace> {
+  using argument_type = ::paddle::platform::CUDAPinnedPlace;
+  using result_type = size_t;
+
+  constexpr inline result_type operator()(const argument_type &place) const {
+    return static_cast<size_t>(-2);
+  }
+};
+
+namespace {  // NOLINT
+struct PlaceHashVisitor : public boost::static_visitor<size_t> {
+  template <typename Place>
+  inline size_t operator()(const Place &place) const {
+    return std::hash<Place>()(place);
+  }
+};
+}  // namespace
+
+template <>
+struct hash<::paddle::platform::Place> {
+  using argument_type = ::paddle::platform::Place;
+  using result_type = size_t;
+
+  inline result_type operator()(const argument_type &place) const {
+    return boost::apply_visitor(PlaceHashVisitor(), place);
+  }
+};
+
+}  // namespace std
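
Reviewer note (not part of the patch): the std::hash specializations above are what allow AllocatorFacadePrivate to switch its allocator table from std::map to std::unordered_map keyed by platform::Place. Hashing a Place visits the concrete place type via boost::apply_visitor and reuses its per-type hash, while boost::variant already supplies the operator== needed for key comparison. A minimal sketch of the idea, assuming it is built inside the Paddle tree; main() and the names map here are purely illustrative:

    #include <string>
    #include <unordered_map>
    #include "paddle/fluid/platform/place.h"

    int main() {
      namespace plat = paddle::platform;
      // std::hash<plat::Place> dispatches through boost::apply_visitor to the
      // per-type hashes: CPUPlace -> size_t(-1), CUDAPlace -> its device id,
      // CUDAPinnedPlace -> size_t(-2).
      std::unordered_map<plat::Place, std::string> names;
      names[plat::CPUPlace()] = "cpu";
      names[plat::CUDAPlace(0)] = "cuda:0";
      // boost::variant provides operator==, so lookups behave as expected.
      return names.count(plat::CUDAPlace(0)) == 1 ? 0 : 1;
    }

Similarly, the new gpu_allocator_retry_time flag can be exercised the way allocator_facade_test.cc does: set FLAGS_gpu_allocator_retry_time (or pass --gpu_allocator_retry_time on the command line) before the first allocation, and BestFitAllocatorCreator wraps each chunk allocator in a RetryAllocator instead of a plain NaiveManagedAllocator.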