From d2584a70828dccd561ab6ded2a417d74a1b77b21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com>
Date: Tue, 21 Apr 2020 10:54:44 +0800
Subject: [PATCH] New feature: thread local allocator, test=develop (#23989)

* add the thread_local_allocator, test=develop

* refactor the thread_local_allocator, test=develop

* provides option setting strategy, test=develop
---
 paddle/fluid/memory/allocation/CMakeLists.txt |   4 +-
 .../memory/allocation/allocator_facade.cc     |  17 +++
 .../memory/allocation/allocator_strategy.cc   |   4 +
 .../memory/allocation/allocator_strategy.h    |   2 +-
 .../allocation/thread_local_allocator.cc      |  76 +++++++++++++
 .../allocation/thread_local_allocator.h       | 100 ++++++++++++++++++
 .../allocation/thread_local_allocator_test.cc |  93 ++++++++++++++++
 paddle/fluid/platform/flags.cc                |   3 +-
 8 files changed, 296 insertions(+), 3 deletions(-)
 create mode 100644 paddle/fluid/memory/allocation/thread_local_allocator.cc
 create mode 100644 paddle/fluid/memory/allocation/thread_local_allocator.h
 create mode 100644 paddle/fluid/memory/allocation/thread_local_allocator_test.cc

diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index dc26c19cbc8..fdd6923a67b 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -14,13 +14,15 @@ endif()
 if (WITH_GPU)
   nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
+  nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
+  cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
 endif()
 
 cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
 
 nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
 
 if (WITH_GPU)
-  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard)
+  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator)
 else ()
   set(AllocatorFacadeDeps)
 endif()
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 63763acb64c..c851f1b10c9 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -32,6 +32,7 @@
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/memory/allocation/cuda_allocator.h"
 #include "paddle/fluid/memory/allocation/pinned_allocator.h"
+#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
@@ -80,6 +81,18 @@ class AllocatorFacadePrivate {
         break;
       }
 
+      case AllocatorStrategy::kThreadLocal: {
+        InitNaiveBestFitCPUAllocator();
+#ifdef PADDLE_WITH_CUDA
+        for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
+             ++dev_id) {
+          InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
+        }
+        InitNaiveBestFitCUDAPinnedAllocator();
+#endif
+        break;
+      }
+
       default: {
         PADDLE_THROW("Unsupported allocator strategy: %d",
                      static_cast<int>(strategy));
@@ -136,6 +149,10 @@ class AllocatorFacadePrivate {
     allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
   }
 
+  void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) {
+    allocators_[p] = std::make_shared<ThreadLocalCUDAAllocator>(p);
+  }
+
   void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p) {
     auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
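The facade hunk above wires the new strategy into AllocatorFacadePrivate: under kThreadLocal, one ThreadLocalCUDAAllocator is registered per visible GPU, while CPU and CUDA-pinned memory keep the naive best-fit allocators. A minimal standalone sketch of that dispatch pattern, with all Paddle types replaced by hypothetical stand-ins (Allocator, GetCUDADeviceCount, BuildAllocators are mocks, not the real API):

#include <map>
#include <memory>
#include <stdexcept>

enum class AllocatorStrategy { kNaiveBestFit, kAutoGrowth, kThreadLocal };

struct Allocator { virtual ~Allocator() = default; };
struct ThreadLocalCUDAAllocator : Allocator {
  explicit ThreadLocalCUDAAllocator(int dev) : gpu_id(dev) {}
  int gpu_id;
};

int GetCUDADeviceCount() { return 2; }  // mocked device query

// One allocator per device place, chosen once from the strategy flag.
std::map<int, std::shared_ptr<Allocator>> BuildAllocators(
    AllocatorStrategy strategy) {
  std::map<int, std::shared_ptr<Allocator>> allocators;
  switch (strategy) {
    case AllocatorStrategy::kThreadLocal:
      for (int dev_id = 0; dev_id < GetCUDADeviceCount(); ++dev_id) {
        allocators[dev_id] =
            std::make_shared<ThreadLocalCUDAAllocator>(dev_id);
      }
      break;
    default:
      throw std::runtime_error("unsupported allocator strategy");
  }
  return allocators;
}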
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc
index 19b1380612b..74757439fd4 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
@@ -32,6 +32,10 @@ static AllocatorStrategy GetStrategyFromFlag() {
     return AllocatorStrategy::kAutoGrowth;
   }
 
+  if (FLAGS_allocator_strategy == "thread_local") {
+    return AllocatorStrategy::kThreadLocal;
+  }
+
   PADDLE_THROW("Unsupported allocator strategy: %s", FLAGS_allocator_strategy);
 }
 
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.h b/paddle/fluid/memory/allocation/allocator_strategy.h
index ff6e7839ff7..0db9d93e3e6 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.h
+++ b/paddle/fluid/memory/allocation/allocator_strategy.h
@@ -18,7 +18,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-enum class AllocatorStrategy { kNaiveBestFit, kAutoGrowth };
+enum class AllocatorStrategy { kNaiveBestFit, kAutoGrowth, kThreadLocal };
 
 extern AllocatorStrategy GetAllocatorStrategy();
 
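With the parser change above, the strategy is selected through FLAGS_allocator_strategy. The unit test later in this patch flips the flag directly in C++; launching via the usual gflags environment-variable convention should work as well. A short sketch (assumes the flag is exported by the build in the ordinary gflags way):

#include "gflags/gflags.h"

DECLARE_string(allocator_strategy);

void SelectThreadLocalStrategy() {
  // Must run before the first allocation: AllocatorFacade reads the flag
  // once, when its singleton is constructed.
  FLAGS_allocator_strategy = "thread_local";
}

// Shell equivalent (gflags environment-variable convention, assumed
// supported by the build):
//   FLAGS_allocator_strategy=thread_local ./your_program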
diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc
new file mode 100644
index 00000000000..96f22530135
--- /dev/null
+++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc
@@ -0,0 +1,76 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p)
+    : place_(p) {
+  if (platform::is_gpu_place(place_)) {
+    buddy_allocator_.reset(new memory::detail::BuddyAllocator(
+        std::unique_ptr<memory::detail::SystemAllocator>(
+            new memory::detail::GPUAllocator(
+                boost::get<platform::CUDAPlace>(place_).device)),
+        platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
+  } else {
+    LOG(FATAL) << "Thread local allocator only supports CUDAPlace now.";
+  }
+}
+
+std::shared_ptr<ThreadLocalAllocatorImpl> ThreadLocalCUDAAllocatorPool::Get(
+    int gpu_id) {
+  auto pos = std::distance(
+      devices_.begin(), std::find(devices_.begin(), devices_.end(), gpu_id));
+  PADDLE_ENFORCE_LT(
+      pos, devices_.size(),
+      platform::errors::InvalidArgument(
+          "The position of device should be less than the size of devices."));
+  std::call_once(*init_flags_[pos], [this, pos, gpu_id] {
+    platform::SetDeviceId(devices_[pos]);
+    allocators_[pos].reset(
+        new ThreadLocalAllocatorImpl(platform::CUDAPlace(gpu_id)));
+  });
+  return allocators_[pos];
+}
+
+ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
+    : devices_(platform::GetSelectedDevices()) {
+  auto gpu_num = devices_.size();
+  allocators_.resize(gpu_num);
+  init_flags_.reserve(gpu_num);
+  for (size_t i = 0; i < gpu_num; ++i) {
+    init_flags_.emplace_back(new std::once_flag());
+  }
+}
+
+ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
+  VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size;
+  void* ptr = buddy_allocator_->Alloc(size);
+  auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_);
+  tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this());
+  return tl_allocation;
+}
+
+void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
+  VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation;
+  buddy_allocator_->Free(allocation->ptr());
+  delete allocation;
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
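The implementation above leans on two idioms: a thread_local singleton pool (one pool per thread, so the hot allocation path takes no lock) and std::call_once per device slot; std::once_flag is neither copyable nor movable, which is why each flag sits behind a unique_ptr inside a resizable vector. A self-contained, runnable sketch of the same pattern, with the per-device allocator replaced by a plain int slot:

#include <iostream>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>

class Pool {
 public:
  static Pool& Instance() {
    static thread_local Pool pool;  // one pool per thread
    return pool;
  }

  std::shared_ptr<int> Get(size_t dev) {
    // Lazily build this thread's slot for device `dev`, exactly once.
    std::call_once(*flags_[dev], [this, dev] {
      slots_[dev] = std::make_shared<int>(static_cast<int>(dev));
    });
    return slots_[dev];
  }

 private:
  Pool() : flags_(2), slots_(2) {  // pretend there are two devices
    for (auto& f : flags_) f.reset(new std::once_flag());
  }
  std::vector<std::unique_ptr<std::once_flag>> flags_;
  std::vector<std::shared_ptr<int>> slots_;
};

int main() {
  std::thread worker([] {
    std::cout << "worker slot @ " << Pool::Instance().Get(0).get() << "\n";
  });
  worker.join();
  // Prints a different address: each thread constructed its own slot.
  std::cout << "main slot   @ " << Pool::Instance().Get(0).get() << "\n";
}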
diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h
new file mode 100644
index 00000000000..bc07ad0c4dc
--- /dev/null
+++ b/paddle/fluid/memory/allocation/thread_local_allocator.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/memory/detail/buddy_allocator.h"
+#include "paddle/fluid/memory/detail/system_allocator.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class ThreadLocalAllocatorImpl;
+
+class ThreadLocalAllocation : public Allocation {
+ public:
+  ThreadLocalAllocation(void* ptr, size_t size, platform::Place place)
+      : Allocation(ptr, size, place) {}
+
+  void SetThreadLocalAllocatorImpl(
+      std::shared_ptr<ThreadLocalAllocatorImpl> allocator) {
+    allocator_ = allocator;
+  }
+
+  std::shared_ptr<ThreadLocalAllocatorImpl> GetAllocator() {
+    return allocator_;
+  }
+
+ private:
+  std::shared_ptr<ThreadLocalAllocatorImpl> allocator_;
+};
+
+class ThreadLocalAllocatorImpl
+    : public std::enable_shared_from_this<ThreadLocalAllocatorImpl> {
+ public:
+  explicit ThreadLocalAllocatorImpl(const platform::Place& p);
+  ThreadLocalAllocation* AllocateImpl(size_t size);
+  void FreeImpl(ThreadLocalAllocation* allocation);
+
+ private:
+  std::unique_ptr<memory::detail::BuddyAllocator> buddy_allocator_;
+  platform::Place place_;
+};
+
+class ThreadLocalCUDAAllocatorPool {
+ public:
+  static ThreadLocalCUDAAllocatorPool& Instance() {
+    static thread_local ThreadLocalCUDAAllocatorPool pool;
+    return pool;
+  }
+
+  std::shared_ptr<ThreadLocalAllocatorImpl> Get(int gpu_id);
+
+ private:
+  ThreadLocalCUDAAllocatorPool();
+  std::vector<int> devices_;
+  std::vector<std::unique_ptr<std::once_flag>> init_flags_;
+  std::vector<std::shared_ptr<ThreadLocalAllocatorImpl>> allocators_;
+};
+
+class ThreadLocalCUDAAllocator : public Allocator {
+ public:
+  explicit ThreadLocalCUDAAllocator(const platform::CUDAPlace& p)
+      : gpu_id_(p.device) {}
+
+  bool IsAllocThreadSafe() const override { return true; }
+
+ protected:
+  Allocation* AllocateImpl(size_t size) override {
+    return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->AllocateImpl(
+        size);
+  }
+  void FreeImpl(Allocation* allocation) override {
+    auto* tl_allocation = static_cast<ThreadLocalAllocation*>(allocation);
+    auto allocator_impl = tl_allocation->GetAllocator();
+    allocator_impl->FreeImpl(tl_allocation);
+  }
+
+ private:
+  int gpu_id_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
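One design detail in the header is worth calling out: ThreadLocalAllocation carries a shared_ptr back to the ThreadLocalAllocatorImpl that produced it (set via shared_from_this in AllocateImpl). That back-reference is what makes cross-thread release safe — the buddy allocator lives in thread-local storage, yet an allocation may be freed after its creating thread has exited, and the shared_ptr keeps the allocator alive until the last allocation is returned. A simplified standalone sketch of the lifetime relationship (Arena and Block are illustrative stand-ins, not Paddle types):

#include <iostream>
#include <memory>
#include <thread>

struct Arena : std::enable_shared_from_this<Arena> {
  ~Arena() { std::cout << "arena destroyed\n"; }
};

struct Block {
  std::shared_ptr<Arena> owner;  // back-reference, as in ThreadLocalAllocation
};

int main() {
  Block block;
  std::thread([&block] {
    auto arena = std::make_shared<Arena>();  // thread-local in the real code
    block.owner = arena->shared_from_this();
  }).join();  // the creating thread is gone, but the arena is still alive
  std::cout << "freeing from main thread\n";
  block.owner.reset();  // only now is the arena destroyed
}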
diff --git a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc
new file mode 100644
index 00000000000..f9e2ea8c27a
--- /dev/null
+++ b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc
@@ -0,0 +1,93 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
+#include <algorithm>
+#include <condition_variable>  // NOLINT
+#include <functional>
+#include <iostream>
+#include <thread>  // NOLINT
+#include <utility>
+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_string(allocator_strategy);
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+TEST(ThreadLocalAllocator, cross_scope_release) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.1;
+  FLAGS_allocator_strategy = "thread_local";
+
+  const size_t thread_num = 5;
+  const std::vector<int> devices = platform::GetSelectedDevices();
+
+  std::vector<std::vector<void *>> allocator_addresses(devices.size());
+  std::vector<std::vector<memory::AllocationPtr>> thread_allocations(
+      devices.size());
+
+  for (size_t i = 0; i < devices.size(); ++i) {
+    allocator_addresses[i].resize(thread_num);
+    thread_allocations[i].resize(thread_num);
+  }
+
+  std::vector<std::thread> threads(thread_num);
+  std::mutex mutex;
+  std::condition_variable cv;
+  bool flag = false;
+
+  for (size_t i = 0; i < threads.size(); ++i) {
+    threads[i] = std::thread([&, i]() {
+      {
+        std::unique_lock<std::mutex> lock(mutex);
+        cv.wait(lock, [&] { return flag; });
+      }
+      for (size_t j = 0; j < devices.size(); ++j) {
+        thread_allocations[j][i] =
+            memory::Alloc(platform::CUDAPlace(devices[j]), 10);
+        auto tl_allocator_impl =
+            ThreadLocalCUDAAllocatorPool::Instance().Get(devices[j]);
+        allocator_addresses[j][i] = tl_allocator_impl.get();
+      }
+    });
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(mutex);
+    flag = true;
+    cv.notify_all();
+  }
+
+  for (auto &th : threads) {
+    th.join();
+  }
+
+  for (auto &addresses : allocator_addresses) {
+    std::sort(addresses.begin(), addresses.end());
+    ASSERT_EQ(std::adjacent_find(addresses.begin(), addresses.end(),
+                                 std::equal_to<void *>()),
+              addresses.end());
+  }
+
+  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+  ASSERT_EXIT(([&]() { thread_allocations.clear(); }(), exit(0)),
+              ::testing::ExitedWithCode(0), ".*");
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index a5dd3401007..c2af3d0e982 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -303,7 +303,8 @@ DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
  * Allocator related FLAG
  * Name: FLAGS_allocator_strategy
  * Since Version: 1.2
- * Value Range: string, {naive_best_fit, auto_growth}, default=auto_growth
+ * Value Range: string, {naive_best_fit, auto_growth, thread_local},
+ * default=auto_growth
  * Example:
  * Note: For selecting allocator policy of PaddlePaddle.
  */
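A note on the test's closing ASSERT_EXIT: the sorted/adjacent_find check first proves that no two threads received the same allocator instance, and the death-test then re-runs the teardown path in a forked child and asserts a clean exit code, proving that releasing allocations whose creating threads have already exited does not crash in a destructor. The bare pattern, reduced to a standalone gtest case:

#include <cstdlib>
#include "gtest/gtest.h"

TEST(DeathStyle, clean_teardown) {
  // "threadsafe" re-executes the test binary for the child process, which
  // is the safer mode once threads have been spawned.
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  // The lambda runs in the forked child; we assert it reaches exit(0)
  // instead of aborting inside a destructor.
  ASSERT_EXIT(([]() { /* drop cross-thread resources here */ }(), exit(0)),
              ::testing::ExitedWithCode(0), ".*");
}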