From d2584a70828dccd561ab6ded2a417d74a1b77b21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com>
Date: Tue, 21 Apr 2020 10:54:44 +0800
Subject: [PATCH] New feature: thread local allocator, test=develop (#23989)

* add the thread_local_allocator, test=develop

* refactor the thread_local_allocator, test=develop

* provides option setting strategy, test=develop
---
 paddle/fluid/memory/allocation/CMakeLists.txt |   4 +-
 .../memory/allocation/allocator_facade.cc     |  17 +++
 .../memory/allocation/allocator_strategy.cc   |   4 +
 .../memory/allocation/allocator_strategy.h    |   2 +-
 .../allocation/thread_local_allocator.cc      |  76 +++++++++++++
 .../allocation/thread_local_allocator.h       | 100 ++++++++++++++++++
 .../allocation/thread_local_allocator_test.cc |  93 ++++++++++++++++
 paddle/fluid/platform/flags.cc                |   3 +-
 8 files changed, 296 insertions(+), 3 deletions(-)
 create mode 100644 paddle/fluid/memory/allocation/thread_local_allocator.cc
 create mode 100644 paddle/fluid/memory/allocation/thread_local_allocator.h
 create mode 100644 paddle/fluid/memory/allocation/thread_local_allocator_test.cc

diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index dc26c19cbc8..fdd6923a67b 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -14,13 +14,15 @@ endif()
 if (WITH_GPU)
   nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
+  nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
+  cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
 endif()
 
 cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
 
 nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
 
 if (WITH_GPU)
-  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard)
+  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator)
 else ()
   set(AllocatorFacadeDeps)
 endif()
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 63763acb64c..c851f1b10c9 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -32,6 +32,7 @@
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/memory/allocation/cuda_allocator.h"
 #include "paddle/fluid/memory/allocation/pinned_allocator.h"
+#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
@@ -80,6 +81,18 @@ class AllocatorFacadePrivate {
         break;
       }
 
+      case AllocatorStrategy::kThreadLocal: {
+        InitNaiveBestFitCPUAllocator();
+#ifdef PADDLE_WITH_CUDA
+        for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
+             ++dev_id) {
+          InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
+        }
+        InitNaiveBestFitCUDAPinnedAllocator();
+#endif
+        break;
+      }
+
       default: {
         PADDLE_THROW("Unsupported allocator strategy: %d",
                      static_cast<int>(strategy));
@@ -136,6 +149,10 @@ class AllocatorFacadePrivate {
     allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
   }
 
+  void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) {
+    allocators_[p] = std::make_shared<ThreadLocalCUDAAllocator>(p);
+  }
+
   void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p) {
     auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
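The facade hunk above wires the new strategy into AllocatorFacadePrivate: under kThreadLocal, one ThreadLocalCUDAAllocator is registered per visible GPU, while CPU and CUDA-pinned memory keep the naive best-fit allocators. A minimal standalone sketch of that dispatch pattern, with all Paddle types replaced by hypothetical stand-ins (Allocator, GetCUDADeviceCount, BuildAllocators are mocks, not the real API):

#include <map>
#include <memory>
#include <stdexcept>

enum class AllocatorStrategy { kNaiveBestFit, kAutoGrowth, kThreadLocal };

struct Allocator { virtual ~Allocator() = default; };
struct ThreadLocalCUDAAllocator : Allocator {
  explicit ThreadLocalCUDAAllocator(int dev) : gpu_id(dev) {}
  int gpu_id;
};

int GetCUDADeviceCount() { return 2; }  // mocked device query

// One allocator per device place, chosen once from the strategy flag.
std::map<int, std::shared_ptr<Allocator>> BuildAllocators(
    AllocatorStrategy strategy) {
  std::map<int, std::shared_ptr<Allocator>> allocators;
  switch (strategy) {
    case AllocatorStrategy::kThreadLocal:
      for (int dev_id = 0; dev_id < GetCUDADeviceCount(); ++dev_id) {
        allocators[dev_id] =
            std::make_shared<ThreadLocalCUDAAllocator>(dev_id);
      }
      break;
    default:
      throw std::runtime_error("unsupported allocator strategy");
  }
  return allocators;
}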
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc
index 19b1380612b..74757439fd4 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
@@ -32,6 +32,10 @@ static AllocatorStrategy GetStrategyFromFlag() {
     return AllocatorStrategy::kAutoGrowth;
   }
 
+  if (FLAGS_allocator_strategy == "thread_local") {
+    return AllocatorStrategy::kThreadLocal;
+  }
+
   PADDLE_THROW("Unsupported allocator strategy: %s", FLAGS_allocator_strategy);
 }
 
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.h b/paddle/fluid/memory/allocation/allocator_strategy.h
index ff6e7839ff7..0db9d93e3e6 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.h
+++ b/paddle/fluid/memory/allocation/allocator_strategy.h
@@ -18,7 +18,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-enum class AllocatorStrategy { kNaiveBestFit, kAutoGrowth };
+enum class AllocatorStrategy { kNaiveBestFit, kAutoGrowth, kThreadLocal };
 
 extern AllocatorStrategy GetAllocatorStrategy();
 
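With the parser change above, the strategy is selected through FLAGS_allocator_strategy. The unit test later in this patch flips the flag directly in C++; launching via the usual gflags environment-variable convention should work as well. A short sketch (assumes the flag is exported by the build in the ordinary gflags way):

#include "gflags/gflags.h"

DECLARE_string(allocator_strategy);

void SelectThreadLocalStrategy() {
  // Must run before the first allocation: AllocatorFacade reads the flag
  // once, when its singleton is constructed.
  FLAGS_allocator_strategy = "thread_local";
}

// Shell equivalent (gflags environment-variable convention, assumed
// supported by the build):
//   FLAGS_allocator_strategy=thread_local ./your_program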
diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc
new file mode 100644
index 00000000000..96f22530135
--- /dev/null
+++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc
@@ -0,0 +1,76 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p)
+    : place_(p) {
+  if (platform::is_gpu_place(place_)) {
+    buddy_allocator_.reset(new memory::detail::BuddyAllocator(
+        std::unique_ptr<memory::detail::SystemAllocator>(
+            new memory::detail::GPUAllocator(
+                boost::get<platform::CUDAPlace>(place_).device)),
+        platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
+  } else {
+    LOG(FATAL) << "Thread local allocator only supports CUDAPlace now.";
+  }
+}
+
+std::shared_ptr<ThreadLocalAllocatorImpl> ThreadLocalCUDAAllocatorPool::Get(
+    int gpu_id) {
+  auto pos = std::distance(
+      devices_.begin(), std::find(devices_.begin(), devices_.end(), gpu_id));
+  PADDLE_ENFORCE_LT(
+      pos, devices_.size(),
+      platform::errors::InvalidArgument(
+          "The position of device should be less than the size of devices."));
+  std::call_once(*init_flags_[pos], [this, pos, gpu_id] {
+    platform::SetDeviceId(devices_[pos]);
+    allocators_[pos].reset(
+        new ThreadLocalAllocatorImpl(platform::CUDAPlace(gpu_id)));
+  });
+  return allocators_[pos];
+}
+
+ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
+    : devices_(platform::GetSelectedDevices()) {
+  auto gpu_num = devices_.size();
+  allocators_.resize(gpu_num);
+  init_flags_.reserve(gpu_num);
+  for (size_t i = 0; i < gpu_num; ++i) {
+    init_flags_.emplace_back(new std::once_flag());
+  }
+}
+
+ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
+  VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size;
+  void* ptr = buddy_allocator_->Alloc(size);
+  auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_);
+  tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this());
+  return tl_allocation;
+}
+
+void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
+  VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation;
+  buddy_allocator_->Free(allocation->ptr());
+  delete allocation;
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
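The implementation above leans on two idioms: a thread_local singleton pool (one pool per thread, so the hot allocation path takes no lock) and std::call_once per device slot; std::once_flag is neither copyable nor movable, which is why each flag sits behind a unique_ptr inside a resizable vector. A self-contained, runnable sketch of the same pattern, with the per-device allocator replaced by a plain int slot:

#include <iostream>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>

class Pool {
 public:
  static Pool& Instance() {
    static thread_local Pool pool;  // one pool per thread
    return pool;
  }

  std::shared_ptr<int> Get(size_t dev) {
    // Lazily build this thread's slot for device `dev`, exactly once.
    std::call_once(*flags_[dev], [this, dev] {
      slots_[dev] = std::make_shared<int>(static_cast<int>(dev));
    });
    return slots_[dev];
  }

 private:
  Pool() : flags_(2), slots_(2) {  // pretend there are two devices
    for (auto& f : flags_) f.reset(new std::once_flag());
  }
  std::vector<std::unique_ptr<std::once_flag>> flags_;
  std::vector<std::shared_ptr<int>> slots_;
};

int main() {
  std::thread worker([] {
    std::cout << "worker slot @ " << Pool::Instance().Get(0).get() << "\n";
  });
  worker.join();
  // Prints a different address: each thread constructed its own slot.
  std::cout << "main slot   @ " << Pool::Instance().Get(0).get() << "\n";
}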
diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h
new file mode 100644
index 00000000000..bc07ad0c4dc
--- /dev/null
+++ b/paddle/fluid/memory/allocation/thread_local_allocator.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/memory/detail/buddy_allocator.h"
+#include "paddle/fluid/memory/detail/system_allocator.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class ThreadLocalAllocatorImpl;
+
+class ThreadLocalAllocation : public Allocation {
+ public:
+  ThreadLocalAllocation(void* ptr, size_t size, platform::Place place)
+      : Allocation(ptr, size, place) {}
+
+  void SetThreadLocalAllocatorImpl(
+      std::shared_ptr<ThreadLocalAllocatorImpl> allocator) {
+    allocator_ = allocator;
+  }
+
+  std::shared_ptr<ThreadLocalAllocatorImpl> GetAllocator() {
+    return allocator_;
+  }
+
+ private:
+  std::shared_ptr<ThreadLocalAllocatorImpl> allocator_;
+};
+
+class ThreadLocalAllocatorImpl
+    : public std::enable_shared_from_this<ThreadLocalAllocatorImpl> {
+ public:
+  explicit ThreadLocalAllocatorImpl(const platform::Place& p);
+  ThreadLocalAllocation* AllocateImpl(size_t size);
+  void FreeImpl(ThreadLocalAllocation* allocation);
+
+ private:
+  std::unique_ptr<memory::detail::BuddyAllocator> buddy_allocator_;
+  platform::Place place_;
+};
+
+class ThreadLocalCUDAAllocatorPool {
+ public:
+  static ThreadLocalCUDAAllocatorPool& Instance() {
+    static thread_local ThreadLocalCUDAAllocatorPool pool;
+    return pool;
+  }
+
+  std::shared_ptr<ThreadLocalAllocatorImpl> Get(int gpu_id);
+
+ private:
+  ThreadLocalCUDAAllocatorPool();
+  std::vector<int> devices_;
+  std::vector<std::unique_ptr<std::once_flag>> init_flags_;
+  std::vector<std::shared_ptr<ThreadLocalAllocatorImpl>> allocators_;
+};
+
+class ThreadLocalCUDAAllocator : public Allocator {
+ public:
+  explicit ThreadLocalCUDAAllocator(const platform::CUDAPlace& p)
+      : gpu_id_(p.device) {}
+
+  bool IsAllocThreadSafe() const override { return true; }
+
+ protected:
+  Allocation* AllocateImpl(size_t size) override {
+    return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->AllocateImpl(
+        size);
+  }
+  void FreeImpl(Allocation* allocation) override {
+    auto* tl_allocation = static_cast<ThreadLocalAllocation*>(allocation);
+    auto allocator_impl = tl_allocation->GetAllocator();
+    allocator_impl->FreeImpl(tl_allocation);
+  }
+
+ private:
+  int gpu_id_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
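One design detail in the header is worth calling out: ThreadLocalAllocation carries a shared_ptr back to the ThreadLocalAllocatorImpl that produced it (set via shared_from_this in AllocateImpl). That back-reference is what makes cross-thread release safe — the buddy allocator lives in thread-local storage, yet an allocation may be freed after its creating thread has exited, and the shared_ptr keeps the allocator alive until the last allocation is returned. A simplified standalone sketch of the lifetime relationship (Arena and Block are illustrative stand-ins, not Paddle types):

#include <iostream>
#include <memory>
#include <thread>

struct Arena : std::enable_shared_from_this<Arena> {
  ~Arena() { std::cout << "arena destroyed\n"; }
};

struct Block {
  std::shared_ptr<Arena> owner;  // back-reference, as in ThreadLocalAllocation
};

int main() {
  Block block;
  std::thread([&block] {
    auto arena = std::make_shared<Arena>();  // thread-local in the real code
    block.owner = arena->shared_from_this();
  }).join();  // the creating thread is gone, but the arena is still alive
  std::cout << "freeing from main thread\n";
  block.owner.reset();  // only now is the arena destroyed
}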
diff --git a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc
new file mode 100644
index 00000000000..f9e2ea8c27a
--- /dev/null
+++ b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc
@@ -0,0 +1,93 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
+#include <algorithm>
+#include <condition_variable>  // NOLINT
+#include <functional>
+#include <iostream>
+#include <thread>  // NOLINT
+#include <utility>
+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_string(allocator_strategy);
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+TEST(ThreadLocalAllocator, cross_scope_release) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.1;
+  FLAGS_allocator_strategy = "thread_local";
+
+  const size_t thread_num = 5;
+  const std::vector<int> devices = platform::GetSelectedDevices();
+
+  std::vector<std::vector<void *>> allocator_addresses(devices.size());
+  std::vector<std::vector<memory::AllocationPtr>> thread_allocations(
+      devices.size());
+
+  for (size_t i = 0; i < devices.size(); ++i) {
+    allocator_addresses[i].resize(thread_num);
+    thread_allocations[i].resize(thread_num);
+  }
+
+  std::vector<std::thread> threads(thread_num);
+  std::mutex mutex;
+  std::condition_variable cv;
+  bool flag = false;
+
+  for (size_t i = 0; i < threads.size(); ++i) {
+    threads[i] = std::thread([&, i]() {
+      {
+        std::unique_lock<std::mutex> lock(mutex);
+        cv.wait(lock, [&] { return flag; });
+      }
+      for (size_t j = 0; j < devices.size(); ++j) {
+        thread_allocations[j][i] =
+            memory::Alloc(platform::CUDAPlace(devices[j]), 10);
+        auto tl_allocator_impl =
+            ThreadLocalCUDAAllocatorPool::Instance().Get(devices[j]);
+        allocator_addresses[j][i] = tl_allocator_impl.get();
+      }
+    });
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(mutex);
+    flag = true;
+    cv.notify_all();
+  }
+
+  for (auto &th : threads) {
+    th.join();
+  }
+
+  for (auto &addresses : allocator_addresses) {
+    std::sort(addresses.begin(), addresses.end());
+    ASSERT_EQ(std::adjacent_find(addresses.begin(), addresses.end(),
+                                 std::equal_to<void *>()),
+              addresses.end());
+  }
+
+  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+  ASSERT_EXIT(([&]() { thread_allocations.clear(); }(), exit(0)),
+              ::testing::ExitedWithCode(0), ".*");
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index a5dd3401007..c2af3d0e982 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -303,7 +303,8 @@ DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
  * Allocator related FLAG
  * Name: FLAGS_allocator_strategy
  * Since Version: 1.2
- * Value Range: string, {naive_best_fit, auto_growth}, default=auto_growth
+ * Value Range: string, {naive_best_fit, auto_growth, thread_local},
+ * default=auto_growth
  * Example:
  * Note: For selecting allocator policy of PaddlePaddle.
  */
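A note on the test's closing ASSERT_EXIT: the sorted/adjacent_find check first proves that no two threads received the same allocator instance, and the death-test then re-runs the teardown path in a forked child and asserts a clean exit code, proving that releasing allocations whose creating threads have already exited does not crash in a destructor. The bare pattern, reduced to a standalone gtest case:

#include <cstdlib>
#include "gtest/gtest.h"

TEST(DeathStyle, clean_teardown) {
  // "threadsafe" re-executes the test binary for the child process, which
  // is the safer mode once threads have been spawned.
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  // The lambda runs in the forked child; we assert it reaches exit(0)
  // instead of aborting inside a destructor.
  ASSERT_EXIT(([]() { /* drop cross-thread resources here */ }(), exit(0)),
              ::testing::ExitedWithCode(0), ".*");
}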