From d8bff9883edd7e08bb68c8d174b586605e9d407b Mon Sep 17 00:00:00 2001
From: From00
Date: Wed, 23 Mar 2022 12:45:07 +0800
Subject: [PATCH] Performance optimization for StreamSafeCudaAllocator (#40718)

* Performance optimization

* Optimize GetAllocator, RWLock and ProcessUnfreedAllocation

* Remove test file

* Fix CI error

* Fix CI errors

* Fix CI errors
---
 .../memory/allocation/allocator_facade.cc     | 158 ++++++++++++------
 .../memory/allocation/allocator_facade.h      |   9 +-
 .../allocation/stream_safe_cuda_allocator.cc  |  22 ++-
 .../allocation/stream_safe_cuda_allocator.h   |   3 +
 paddle/fluid/platform/device_context.cc       |  14 +-
 5 files changed, 136 insertions(+), 70 deletions(-)

diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 4a44448dc84..abf72564753 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -34,6 +34,7 @@
 #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
@@ -210,13 +211,28 @@ class AllocatorFacadePrivate {
       InitNaiveBestFitCPUAllocator();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       allow_free_idle_chunk_ = allow_free_idle_chunk;
-      if (!FLAGS_use_stream_safe_cuda_allocator) {
-        for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount();
-             ++dev_id) {
-          InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
-                                      allow_free_idle_chunk_);
-        }
+      for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
+        InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
+                                    allow_free_idle_chunk_);
+      }
+
+      // Note(Ruibiao): In the GPU multi-stream case, the 'allocators_' map
+      // (place -> Allocator) holds the StreamSafeCUDAAllocator related to the
+      // default stream (i.e., the stream obtained directly from the
+      // DeviceContext), while the 'cuda_allocators_' map (place ->
+      // map(stream -> Allocator)) holds the StreamSafeCUDAAllocators related
+      // to non-default streams (i.e., streams that users pass in). The
+      // default-stream Allocator is built when AllocatorFacadePrivate is
+      // constructed, while the non-default-stream Allocators are built
+      // lazily in the GetAllocator function with
+      // 'create_if_not_found = true'. We treat the default stream specially
+      // for performance reasons: since most Alloc calls in applications are
+      // for the default stream, handling it separately avoids much of the
+      // overhead of acquiring the default stream and taking the read-write
+      // lock.
+      if (FLAGS_use_stream_safe_cuda_allocator) {
+        WrapStreamSafeCUDAAllocatorForDefault();
       }
+
       InitNaiveBestFitCUDAPinnedAllocator();
 #endif
 
 #ifdef PADDLE_WITH_ASCEND_CL
@@ -301,7 +317,8 @@ class AllocatorFacadePrivate {
     CheckAllocThreadSafe();
 
 #ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    if (FLAGS_use_stream_safe_cuda_allocator == false &&
+        UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
       WrapCUDAGraphAllocator();
     }
 #endif
@@ -341,7 +358,12 @@ class AllocatorFacadePrivate {
   const std::shared_ptr<Allocator>& GetAllocator(
       const platform::CUDAPlace& place, const gpuStream_t& stream,
       bool create_if_not_found = false) {
-    {  // shared_lock_guard
+    if (stream == GetDefaultStream(place)) {
+      VLOG(7) << "Get Allocator by passing in a default stream";
+      return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+    }
+
+    /* shared_lock_guard */ {
       std::shared_lock<std::shared_timed_mutex> lock_guard(
           cuda_allocator_mutex_);
       if (LIKELY(HasCUDAAllocator(place, stream))) {
@@ -355,7 +377,7 @@ class AllocatorFacadePrivate {
       }
     }
 
-    {  // unique_lock_guard
+    /* unique_lock_guard */ {
       std::unique_lock<std::shared_timed_mutex> lock_guard(
           cuda_allocator_mutex_);
       InitStreamSafeCUDAAllocator(place, stream);
@@ -363,9 +385,40 @@ class AllocatorFacadePrivate {
     }
   }
 
-  gpuStream_t GetDefaultStream(const platform::CUDAPlace& place) {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    return static_cast<platform::CUDADeviceContext*>(pool.Get(place))->stream();
+  const std::shared_ptr<StreamSafeCUDAAllocator>
+  GetDefaultStreamSafeCUDAAllocator(const platform::CUDAPlace& place) const {
+    const auto iter = default_stream_safe_cuda_allocators_.find(place);
+    PADDLE_ENFORCE_NE(
+        iter, default_stream_safe_cuda_allocators_.end(),
+        platform::errors::NotFound(
+            "No StreamSafeCUDAAllocator found for the place, %s", place));
+    return iter->second;
+  }
+
+  const gpuStream_t& GetDefaultStream(const platform::CUDAPlace& place) const {
+    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
+        GetDefaultStreamSafeCUDAAllocator(place);
+    return allocator->GetDefaultStream();
+  }
+
+  void SetDefaultStream(const platform::CUDAPlace& place,
+                        const gpuStream_t& stream) {
+    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
+        GetDefaultStreamSafeCUDAAllocator(place);
+    allocator->SetDefaultStream(stream);
+    VLOG(8) << "Set default stream to " << stream
+            << " for StreamSafeCUDAAllocator(" << allocator.get() << ") in "
+            << place;
+  }
+
+  void SetDefaultStreamFromDeviceContext() {
+    VLOG(8) << "Set default stream from DeviceContext";
+    for (auto& pair : default_stream_safe_cuda_allocators_) {
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      pair.second->SetDefaultStream(
+          static_cast<phi::GPUContext*>(pool.Get(pair.first))->stream());
+    }
   }
 
   void RecordStream(std::shared_ptr<phi::Allocation> allocation,
@@ -635,6 +688,26 @@ class AllocatorFacadePrivate {
         /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
   }
 
+  void WrapStreamSafeCUDAAllocatorForDefault() {
+    for (auto& pair : allocators_) {
+      auto& place = pair.first;
+      if (platform::is_gpu_place(place)) {
+        std::shared_ptr<StreamSafeCUDAAllocator>&& allocator =
+            std::make_shared<StreamSafeCUDAAllocator>(
+                pair.second, place, /* default_stream = */ nullptr,
+                /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
+        pair.second = allocator;
+
+        // NOTE(Ruibiao): A tricky implementation that gives
+        // StreamSafeCUDAAllocator the ability to interact with the outside
+        // world, i.e., to have its default stream changed from outside
+        default_stream_safe_cuda_allocators_[place] = allocator;
+        VLOG(8) << "WrapStreamSafeCUDAAllocator for " << place
+                << ", allocator address = " << pair.second.get();
+      }
+    }
+  }
+
   void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,
                               size_t retry_time) {
     PADDLE_ENFORCE_GT(
@@ -813,7 +886,6 @@ class AllocatorFacadePrivate {
 #endif
   }
 
-  // NOTE(Ruibiao): Old single-stream version, will be removed later
   void WrapCUDARetryAllocator(size_t retry_time) {
     PADDLE_ENFORCE_GT(
         retry_time, 0,
@@ -828,6 +900,8 @@ class AllocatorFacadePrivate {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // a standalone CUDA allocator to support multi-stream GC in new executor
+  std::map<platform::Place, std::shared_ptr<StreamSafeCUDAAllocator>>
+      default_stream_safe_cuda_allocators_;
   CUDAAllocatorMap cuda_allocators_;
   std::shared_timed_mutex cuda_allocator_mutex_;
 #endif
@@ -870,15 +944,6 @@ AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
 
 const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
     const platform::Place& place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    AllocatorFacadePrivate* m = GetPrivate();
-    platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place));
-  }
-#endif
-
   return GetPrivate()->GetAllocator(
       place, /* A non-zero num to choose allocator_ */ 1);
 }
@@ -898,19 +963,6 @@ void* AllocatorFacade::GetBasePtr(
   return GetPrivate()->GetBasePtr(allocation);
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
-    const platform::Place& place, const gpuStream_t& stream) {
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    return GetPrivate()->GetAllocator(place, stream,
-                                      /*create_if_not_found=*/true);
-  }
-  return GetPrivate()->GetAllocator(
-      place, /* A non-zero num to choose allocator_ */ 1);
-}
-#endif
-
 const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
     const platform::Place& place) {
   return GetPrivate()->GetAllocator(place, /* zero size */ 0);
 }
@@ -923,26 +975,10 @@ std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
 
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
                                      size_t size) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      size > 0 && FLAGS_use_system_allocator == false) {
-    platform::CUDAPlace cuda_place(place.GetDeviceId());
-    phi::Stream default_stream = phi::Stream(reinterpret_cast<phi::StreamId>(
-        GetPrivate()->GetDefaultStream(cuda_place)));
-    return Alloc(cuda_place, size, default_stream);
-  }
-#endif
   return GetPrivate()->GetAllocator(place, size)->Allocate(size);
 }
 
 uint64_t AllocatorFacade::Release(const platform::Place& place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place));
-  }
-#endif
   return GetPrivate()
       ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
       ->Release(place);
@@ -1028,6 +1064,17 @@ void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
   GetPrivate()->RecordStream(allocation, stream);
 }
 
+const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
+    const platform::Place& place, const gpuStream_t& stream) {
+  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
+      FLAGS_use_system_allocator == false) {
+    return GetPrivate()->GetAllocator(place, stream,
+                                      /*create_if_not_found=*/true);
+  }
+  return GetPrivate()->GetAllocator(
+      place, /* A non-zero num to choose allocator_ */ 1);
+}
+
 const gpuStream_t& AllocatorFacade::GetStream(
     const std::shared_ptr<phi::Allocation>& allocation) const {
   PADDLE_ENFORCE_EQ(
@@ -1040,6 +1087,13 @@ const gpuStream_t& AllocatorFacade::GetStream(
   return GetPrivate()->GetStream(allocation);
 }
 
+void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
+                                       const gpuStream_t& stream) {
+  if (FLAGS_use_stream_safe_cuda_allocator) {
+    GetPrivate()->SetDefaultStream(place, stream);
+  }
+}
+
 #ifdef PADDLE_WITH_CUDA
 void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
   PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
@@ -1055,6 +1109,8 @@ void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
           "The memory pool of the CUDA Graph with ID %d have been prepared.",
           id));
   allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
+  allocator->SetDefaultStreamFromDeviceContext();
+
   VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
 }
 
diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h
index 9066bb284e2..1ea872f7eca 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
@@ -55,11 +55,6 @@ class AllocatorFacade {
 
   void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
-                                                 const gpuStream_t& stream);
-#endif
-
   const std::shared_ptr<Allocator>& GetZeroAllocator(
       const platform::Place& place);
 
@@ -86,8 +81,12 @@ class AllocatorFacade {
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
   void RecordStream(std::shared_ptr<Allocation> allocation,
                     const gpuStream_t& stream);
+  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
+                                                 const gpuStream_t& stream);
   const gpuStream_t& GetStream(
       const std::shared_ptr<Allocation>& allocation) const;
+  void SetDefaultStream(const platform::CUDAPlace& place,
+                        const gpuStream_t& stream);
 #endif
 
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
index 072c4dee3bc..7e47d35176b 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -154,6 +154,14 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
 
 bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }
 
+const gpuStream_t& StreamSafeCUDAAllocator::GetDefaultStream() const {
+  return default_stream_;
+}
+
+void StreamSafeCUDAAllocator::SetDefaultStream(const gpuStream_t& stream) {
+  default_stream_ = stream;
+}
+
 phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
   platform::RecordEvent("StreamSafeCUDAAllocator::Allocate",
                         platform::TracerEventType::UserDefined, 9 /*level*/);
@@ -187,12 +195,8 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) {
   platform::RecordEvent("StreamSafeCUDAAllocator::Free",
                         platform::TracerEventType::UserDefined, 9 /*level*/);
   StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
-      dynamic_cast<StreamSafeCUDAAllocation*>(allocation);
-  PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
-                          platform::errors::InvalidArgument(
-                              "Failed to dynamic cast %p from Allocation* to "
-                              "StreamSafeCUDAAllocation*",
-                              allocation));
+      static_cast<StreamSafeCUDAAllocation*>(allocation);
+
   VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr();
   if (stream_safe_cuda_allocation->CanBeFreed()) {
     VLOG(9) << "Directly delete allocation";
@@ -221,6 +225,12 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
 }
 
 void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() {
+  // NOTE(Ruibiao): This condition is a heuristic to reduce lock contention.
+  // It does not need to be thread-safe, since occasional misjudgments here
+  // are permissible.
+  if (unfreed_allocations_.empty()) {
+    return;
+  }
+
   std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
   for (auto it = unfreed_allocations_.begin();
        it != unfreed_allocations_.end();) {
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
index ecddff97c20..65af32c701b 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
@@ -64,7 +64,10 @@ class StreamSafeCUDAAllocator
                           platform::CUDAPlace place, gpuStream_t default_stream,
                           bool in_cuda_graph_capturing = false);
   ~StreamSafeCUDAAllocator();
+
   bool IsAllocThreadSafe() const override;
+  const gpuStream_t &GetDefaultStream() const;
+  void SetDefaultStream(const gpuStream_t &stream);
 
  protected:
   phi::Allocation *AllocateImpl(size_t size) override;
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 18ac979b48e..5605d326f2c 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -159,10 +159,8 @@ inline void EmplaceDeviceContext(
             cuda_ctx,
             platform::errors::InvalidArgument(
                 "Failed to dynamic_cast dev_ctx into CUDADeviceContext."));
-        // Note: A trick method to init context, why GetAllocator interface
-        // needs a stream parameter?
         dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
-                                  .GetAllocator(p, cuda_ctx->stream())
+                                  .GetAllocator(p)
                                  .get());
         cuda_ctx->PartialInitWithAllocator();
         dev_ctx->SetGenerator(
@@ -517,10 +515,10 @@ CUDAContext::~CUDAContext() {
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) {
   phi::GPUContext::PartialInitWithoutAllocator();
   cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place));
-  workspace_.reset(new phi::DnnWorkspaceHandle(
-      memory::allocation::AllocatorFacade::Instance()
-          .GetAllocator(place, phi::GPUContext::stream())
-          .get()));
+  auto& instance = memory::allocation::AllocatorFacade::Instance();
+  instance.SetDefaultStream(place, phi::GPUContext::stream());
+  workspace_.reset(
+      new phi::DnnWorkspaceHandle(instance.GetAllocator(place).get()));
 }
 
 CUDADeviceContext::~CUDADeviceContext() = default;
@@ -618,7 +616,7 @@ phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
     // return workspace_.get();
     return phi::DnnWorkspaceHandle(
         memory::allocation::AllocatorFacade::Instance()
-            .GetAllocator(GetPlace(), phi::GPUContext::stream())
+            .GetAllocator(GetPlace())
            .get());
   }
  return phi::GPUContext::cudnn_workspace_handle();
-- 
GitLab
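
A note on the GetAllocator locking scheme above: the patch combines two
techniques, a fast path that returns the default-stream allocator without
touching the per-stream map, and read-mostly double-checked locking (a shared
lock for lookups, upgraded to a unique lock only when a per-stream allocator
must be created). The standalone C++ sketch below illustrates the same
pattern; StreamId, Allocator, and Registry are hypothetical stand-ins, not
Paddle's actual types.

    #include <map>
    #include <memory>
    #include <mutex>
    #include <shared_mutex>

    using StreamId = int;   // stand-in for gpuStream_t
    struct Allocator {};    // stand-in for StreamSafeCUDAAllocator

    class Registry {
     public:
      explicit Registry(StreamId default_stream)
          : default_stream_(default_stream),
            default_allocator_(std::make_shared<Allocator>()) {}

      std::shared_ptr<Allocator> Get(StreamId stream) {
        // Fast path: most Alloc calls target the default stream, so they
        // never touch the map or the read-write lock at all.
        if (stream == default_stream_) {
          return default_allocator_;
        }

        /* shared_lock_guard */ {
          std::shared_lock<std::shared_timed_mutex> guard(mutex_);
          auto it = allocators_.find(stream);
          if (it != allocators_.end()) {
            return it->second;
          }
        }

        /* unique_lock_guard */ {
          std::unique_lock<std::shared_timed_mutex> guard(mutex_);
          // Re-check under the exclusive lock: another thread may have
          // created the entry between the two locking phases.
          auto it = allocators_.find(stream);
          if (it == allocators_.end()) {
            it = allocators_.emplace(stream,
                                     std::make_shared<Allocator>()).first;
          }
          return it->second;
        }
      }

     private:
      StreamId default_stream_;
      std::shared_ptr<Allocator> default_allocator_;
      std::map<StreamId, std::shared_ptr<Allocator>> allocators_;
      std::shared_timed_mutex mutex_;
    };

As in the patch, the re-check under the unique lock is what makes the lazy
creation safe: two threads can both miss during the shared-lock phase, but
only the first one to acquire the exclusive lock creates the allocator.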
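The ProcessUnfreedAllocations change uses a different contention-reduction
trick: read the (possibly stale) empty state before taking the lock at all.
Below is a minimal sketch of that shape, assuming a hypothetical
DeferredFreeList type with std::mutex standing in for Paddle's SpinLock.

    #include <list>
    #include <mutex>

    class DeferredFreeList {
     public:
      void Add(void* ptr) {
        std::lock_guard<std::mutex> guard(lock_);
        unfreed_.push_back(ptr);
      }

      void ProcessUnfreed() {
        // Unlocked fast path: most calls see an empty list and return
        // without touching the lock. A stale read is tolerated here; a
        // missed entry is simply handled by a later call.
        if (unfreed_.empty()) {
          return;
        }
        std::lock_guard<std::mutex> guard(lock_);
        for (auto it = unfreed_.begin(); it != unfreed_.end();) {
          if (CanBeFreed(*it)) {
            it = unfreed_.erase(it);  // drop entries whose GPU work finished
          } else {
            ++it;
          }
        }
      }

     private:
      // Stand-in for the real per-allocation readiness check (in the patch,
      // StreamSafeCUDAAllocation::CanBeFreed queries recorded GPU events).
      static bool CanBeFreed(void* /*ptr*/) { return true; }

      std::list<void*> unfreed_;
      std::mutex lock_;
    };

Strictly speaking, the unlocked empty() read is a data race in the C++ memory
model; the patch tolerates it because, as its NOTE says, a misjudgment only
defers cleanup to a later call and the list is only mutated under the lock.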