From 0ad2e192649d909a348195859aaa2e3135e47ae6 Mon Sep 17 00:00:00 2001
From: From00 <chenruibiao@baidu.com>
Date: Sun, 27 Mar 2022 10:03:19 +0800
Subject: [PATCH] Make StreamSafeCUDAAllocator compatible with NaiveBestFit
 strategy (#40886)

* Make StreamSafeCUDAAllocator compatible with NaiveBestFit strategy

* Set FLAGS_use_stream_safe_cuda_allocator to false

* Update

* Remove unnecessary code

* Fix CI errors

* Add UT
---
 .../framework/new_executor/interpretercore.cc |   5 +-
 .../memory/allocation/allocator_facade.cc     | 212 ++++++++----------
 .../memory/allocation/allocator_facade.h      |   4 +-
 paddle/fluid/memory/malloc.cc                 |   2 +-
 paddle/fluid/memory/malloc.h                  |   2 +-
 paddle/fluid/memory/malloc_test.cu            |  24 +-
 .../memory/stream_safe_cuda_alloc_test.cu     |  13 ++
 7 files changed, 130 insertions(+), 132 deletions(-)
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 6e73aaef15..b36ff519ce 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -32,7 +32,6 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true,
 DECLARE_bool(check_nan_inf);
 DECLARE_bool(benchmark);
 DECLARE_bool(fast_eager_deletion_mode);
-DECLARE_bool(use_stream_safe_cuda_allocator);
 
 constexpr const char* kExceptionCaught = "ExceptionCaught";
 constexpr const char* kTaskCompletion = "TaskCompletion";
@@ -44,7 +43,9 @@ static constexpr size_t kHostNumThreads = 4;
 static constexpr size_t kDeviceNumThreads = 1;
 
 bool IsInterpretercoreFastGCEnabled() {
-  return FLAGS_fast_eager_deletion_mode && FLAGS_use_stream_safe_cuda_allocator;
+  return memory::allocation::AllocatorFacade::Instance()
+             .IsStreamSafeCUDAAllocatorUsed() &&
+         FLAGS_fast_eager_deletion_mode;
 }
 
 InterpreterCore::InterpreterCore(const platform::Place& place,
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index abf7256475..88bbe339f8 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -145,6 +145,14 @@ class CUDAGraphAllocator
 };
 #endif
 
+static bool IsCUDAGraphCapturing() {
+#ifdef PADDLE_WITH_CUDA
+  return UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing());
+#else
+  return false;
+#endif
+}
+
 class AllocatorFacadePrivate {
  public:
   using AllocatorMap = std::map<platform::Place, std::shared_ptr<Allocator>>;
@@ -157,6 +165,8 @@ class AllocatorFacadePrivate {
 
   explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) {
     strategy_ = GetAllocatorStrategy();
+    is_stream_safe_cuda_allocator_used_ = false;
+
     switch (strategy_) {
       case AllocatorStrategy::kNaiveBestFit: {
         InitNaiveBestFitCPUAllocator();
@@ -166,12 +176,6 @@ class AllocatorFacadePrivate {
         }
 #endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-        PADDLE_ENFORCE_EQ(
-            FLAGS_use_stream_safe_cuda_allocator, false,
-            paddle::platform::errors::Unimplemented(
-                "StreamSafeCUDAAllocator is only implemented for auto_growth "
-                "strategy, not support naive_best_fit strategy"));
-
         for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
           InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
         }
@@ -216,21 +220,24 @@ class AllocatorFacadePrivate {
                                       allow_free_idle_chunk_);
         }
 
-        // Note(Ruibiao): For GPU multi-stream case, the 'allocators_' map(place
-        // -> Allocator) hold the StreamSafeCUDAAllocator releate to default
-        // stream (i.e., the stream directly got from DeviceContex), while the
-        // 'cuda_allocators_' map(place -> map(stream -> Allocator)) hold the
-        // StreamSafeCUDAAllocator releate to non-default stream (i.e., the
-        // stream users pass in). The default stream Allocator is built in the
-        // structure of AllocatorFacadePrivate, while the non-default stream is
-        // build in a delayed manner in GetAllocator function with
-        // 'create_if_not_found = ture'. We make special treatment for the
-        // default stream for performance reasons. Since most Alloc calls are
-        // for default stream in application, treating it separately can avoid
-        // lots of overhead of acquiring default stream and applying read-write
-        // lock.
+        // Note(Ruibiao): For the GPU multi-stream case without CUDA graph
+        // capturing, the 'allocators_' map (place -> Allocator) holds the
+        // StreamSafeCUDAAllocator related to the default stream (i.e., the
+        // stream obtained directly from DeviceContext), while the
+        // 'cuda_allocators_' map (place -> map(stream -> Allocator)) holds
+        // the StreamSafeCUDAAllocators related to non-default streams (i.e.,
+        // streams that users pass in). The default-stream Allocator is built
+        // in the constructor of AllocatorFacadePrivate, while non-default
+        // stream Allocators are built lazily in GetAllocator when
+        // 'create_if_not_found = true'. The default stream is special-cased
+        // for performance: most Alloc calls in applications target the default
+        // stream, so treating it separately avoids much of the overhead of
+        // acquiring the default stream and taking the read-write lock.
         if (FLAGS_use_stream_safe_cuda_allocator) {
-          WrapStreamSafeCUDAAllocatorForDefault();
+          if (LIKELY(!IsCUDAGraphCapturing())) {
+            WrapStreamSafeCUDAAllocatorForDefault();
+          }
+          is_stream_safe_cuda_allocator_used_ = true;
         }
 
         InitNaiveBestFitCUDAPinnedAllocator();
@@ -283,12 +290,6 @@ class AllocatorFacadePrivate {
         }
 #endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-        PADDLE_ENFORCE_EQ(
-            FLAGS_use_stream_safe_cuda_allocator, false,
-            paddle::platform::errors::Unimplemented(
-                "StreamSafeCUDAAllocator is only implemented for auto_growth "
-                "strategy, not support thread_local strategy"));
-
         for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
           InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
         }
@@ -317,8 +318,9 @@ class AllocatorFacadePrivate {
     CheckAllocThreadSafe();
 
 #ifdef PADDLE_WITH_CUDA
-    if (FLAGS_use_stream_safe_cuda_allocator == false &&
-        UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    // No need to wrap CUDAGraphAllocator for StreamSafeCUDAAllocator
+    if (!is_stream_safe_cuda_allocator_used_ &&
+        UNLIKELY(IsCUDAGraphCapturing())) {
       WrapCUDAGraphAllocator();
     }
 #endif
@@ -343,6 +345,11 @@ class AllocatorFacadePrivate {
     return static_cast<Allocation*>(allocation.get())->base_ptr();
   }
 
+  bool IsStreamSafeCUDAAllocatorUsed() {
+    return is_stream_safe_cuda_allocator_used_ &&
+           LIKELY(FLAGS_use_system_allocator == false);
+  }
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   bool HasCUDAAllocator(const platform::CUDAPlace& place,
                         const gpuStream_t& stream) {
@@ -358,9 +365,11 @@ class AllocatorFacadePrivate {
   const std::shared_ptr<Allocator>& GetAllocator(
       const platform::CUDAPlace& place, const gpuStream_t& stream,
       bool create_if_not_found = false) {
-    if (stream == GetDefaultStream(place)) {
-      VLOG(7) << "Get Allocator by passing in a default stream";
-      return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+    if (LIKELY(!IsCUDAGraphCapturing())) {
+      if (stream == GetDefaultStream(place)) {
+        VLOG(7) << "Get Allocator by passing in a default stream";
+        return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+      }
     }
 
     /* shared_lock_guard */ {
@@ -411,42 +420,30 @@ class AllocatorFacadePrivate {
             << place;
   }
 
-  void SetDefaultStreamFromDeviceContext() {
-    VLOG(8) << "Set default stream from DeviceContex";
-    for (auto& pair : default_stream_safe_cuda_allocators_) {
-      platform::DeviceContextPool& pool =
-          platform::DeviceContextPool::Instance();
-      pair.second->SetDefaultStream(
-          static_cast<phi::GPUContext*>(pool.Get(pair.first))->stream());
-    }
-  }
-
   void RecordStream(std::shared_ptr<phi::Allocation> allocation,
                     const gpuStream_t& stream) {
-    if (allocation->size() == 0) {
-      return;
+    std::shared_ptr<StreamSafeCUDAAllocation> stream_safe_cuda_allocation =
+        std::dynamic_pointer_cast<StreamSafeCUDAAllocation>(allocation);
+    if (stream_safe_cuda_allocation != nullptr) {
+      stream_safe_cuda_allocation->RecordStream(stream);
+    } else {
+      VLOG(6) << "RecordStream for a non-StreamSafeCUDAAllocation";
     }
-
-    StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
-        dynamic_cast<StreamSafeCUDAAllocation*>(allocation.get());
-    PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
-                            platform::errors::InvalidArgument(
-                                "Failed to dynamic cast %p from Allocation* to "
-                                "StreamSafeCUDAAllocation*",
-                                allocation.get()));
-    stream_safe_cuda_allocation->RecordStream(stream);
   }
 
-  const gpuStream_t& GetStream(
+  const gpuStream_t GetStream(
       const std::shared_ptr<phi::Allocation>& allocation) const {
-    const StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
-        dynamic_cast<const StreamSafeCUDAAllocation*>(allocation.get());
-    PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
-                            platform::errors::InvalidArgument(
-                                "Failed to dynamic cast %p from Allocation* to "
-                                "StreamSafeCUDAAllocation*",
-                                allocation.get()));
-    return stream_safe_cuda_allocation->GetOwningStream();
+    const std::shared_ptr<StreamSafeCUDAAllocation>
+        stream_safe_cuda_allocation =
+            std::dynamic_pointer_cast<StreamSafeCUDAAllocation>(allocation);
+    if (stream_safe_cuda_allocation != nullptr) {
+      return stream_safe_cuda_allocation->GetOwningStream();
+    }
+
+    VLOG(6) << "GetStream for a non-StreamSafeCUDAAllocation";
+    return static_cast<phi::GPUContext*>(
+               platform::DeviceContextPool::Instance().Get(allocation->place()))
+        ->stream();
   }
 #endif
 
@@ -880,7 +877,7 @@ class AllocatorFacadePrivate {
     CheckAllocThreadSafe(zero_size_allocators_);
     CheckAllocThreadSafe(system_allocators_);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    if (FLAGS_use_stream_safe_cuda_allocator) {
+    if (is_stream_safe_cuda_allocator_used_) {
       CheckCUDAAllocThreadSafe(cuda_allocators_);
     }
 #endif
@@ -910,6 +907,7 @@ class AllocatorFacadePrivate {
   static AllocatorMap zero_size_allocators_;
   static AllocatorMap system_allocators_;
   bool allow_free_idle_chunk_;
+  bool is_stream_safe_cuda_allocator_used_;
 };
 AllocatorFacadePrivate::AllocatorMap
     AllocatorFacadePrivate::zero_size_allocators_;
@@ -928,7 +926,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
 
 AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
 #ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+  if (UNLIKELY(IsCUDAGraphCapturing())) {
     auto id = platform::CUDAGraph::CapturingID();
     auto iter = cuda_graph_map_.find(id);
     PADDLE_ENFORCE_NE(
@@ -986,35 +984,25 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
 
 std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size, const phi::Stream& stream) {
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'AllocaShared' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
   return std::shared_ptr<phi::Allocation>(Alloc(place, size, stream));
 }
 
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
                                      const phi::Stream& stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'Alloc' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
+  AllocatorFacadePrivate* m = GetPrivate();
+  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
+    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
+    return Alloc(place, size);
+  }
 
   platform::CUDAPlace p(place.GetDeviceId());
   if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
     gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
-    return GetPrivate()
-        ->GetAllocator(p, s, /* create_if_not_found = */ true)
+    return m->GetAllocator(p, s, /* create_if_not_found = */ true)
         ->Allocate(size);
   } else {
-    return GetPrivate()->GetAllocator(p, size)->Allocate(size);
+    return m->GetAllocator(p, size)->Allocate(size);
   }
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
@@ -1025,13 +1013,6 @@ bool AllocatorFacade::InSameStream(
     const std::shared_ptr<phi::Allocation>& allocation,
     const phi::Stream& stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'InSameStream' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
   gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
   return s == GetStream(allocation);
 #else
@@ -1039,58 +1020,52 @@ bool AllocatorFacade::InSameStream(
 #endif
 }
 
+bool AllocatorFacade::IsStreamSafeCUDAAllocatorUsed() {
+  return GetPrivate()->IsStreamSafeCUDAAllocatorUsed();
+}
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
                                   const gpuStream_t& stream) {
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'Release' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
-  return GetPrivate()->GetAllocator(place, stream)->Release(place);
+  AllocatorFacadePrivate* m = GetPrivate();
+  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
+    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
+    return Release(place);
+  }
+
+  return m->GetAllocator(place, stream)->Release(place);
 }
 
 void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
                                    const gpuStream_t& stream) {
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "'RecordStream' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
   GetPrivate()->RecordStream(allocation, stream);
 }
 
 const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
     const platform::Place& place, const gpuStream_t& stream) {
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    return GetPrivate()->GetAllocator(place, stream,
-                                      /*create_if_not_found=*/true);
+  AllocatorFacadePrivate* m = GetPrivate();
+
+  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
+    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
+    return GetAllocator(place);
   }
-  return GetPrivate()->GetAllocator(
-      place, /* A non-zero num to choose allocator_ */ 1);
+
+  if (platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) {
+    return m->GetAllocator(place, stream,
+                           /*create_if_not_found=*/true);
+  }
+  return m->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
 }
 
-const gpuStream_t& AllocatorFacade::GetStream(
+const gpuStream_t AllocatorFacade::GetStream(
     const std::shared_ptr<phi::Allocation>& allocation) const {
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "'GetStream' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
   return GetPrivate()->GetStream(allocation);
 }
 
 void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
                                        const gpuStream_t& stream) {
-  if (FLAGS_use_stream_safe_cuda_allocator) {
-    GetPrivate()->SetDefaultStream(place, stream);
+  if (m_->IsStreamSafeCUDAAllocatorUsed()) {
+    m_->SetDefaultStream(place, stream);
   }
 }
 
@@ -1109,7 +1084,6 @@ void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
           "The memory pool of the CUDA Graph with ID %d have been prepared.",
           id));
   allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
-  allocator->SetDefaultStreamFromDeviceContext();
 
   VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
 }
diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h
index 1ea872f7ec..d5c1e7c908 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
@@ -76,6 +76,8 @@ class AllocatorFacade {
   bool InSameStream(const std::shared_ptr<Allocation>& allocation,
                     const phi::Stream& stream);
 
+  bool IsStreamSafeCUDAAllocatorUsed();
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed.
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
@@ -83,7 +85,7 @@ class AllocatorFacade {
                     const gpuStream_t& stream);
   const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
                                                  const gpuStream_t& stream);
-  const gpuStream_t& GetStream(
+  const gpuStream_t GetStream(
       const std::shared_ptr<Allocation>& allocation) const;
   void SetDefaultStream(const platform::CUDAPlace& place,
                         const gpuStream_t& stream);
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
index 2bca2c388a..f3de317dd1 100644
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -67,7 +67,7 @@ void RecordStream(std::shared_ptr<Allocation> allocation,
                                                               stream);
 }
 
-const gpuStream_t& GetStream(const std::shared_ptr<Allocation>& allocation) {
+const gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation) {
   return allocation::AllocatorFacade::Instance().GetStream(allocation);
 }
 
diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h
index 601fe3f2a4..e6d910579b 100644
--- a/paddle/fluid/memory/malloc.h
+++ b/paddle/fluid/memory/malloc.h
@@ -56,7 +56,7 @@ extern uint64_t Release(const platform::CUDAPlace& place,
 void RecordStream(std::shared_ptr<Allocation> allocation,
                   const gpuStream_t& stream);
 
-const gpuStream_t& GetStream(const std::shared_ptr<Allocation>& allocation);
+const gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation);
 #endif
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu
index 07577531d6..9837d3e4fa 100644
--- a/paddle/fluid/memory/malloc_test.cu
+++ b/paddle/fluid/memory/malloc_test.cu
@@ -12,6 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/core/stream.h"
+
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -21,14 +30,6 @@
 #include <hip/hip_runtime.h>
 #endif
 
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/device_context.h"
-
 namespace paddle {
 namespace memory {
 
@@ -196,5 +197,12 @@ TEST(Malloc, AllocZero) {
   AllocationPtr allocation_ptr = Alloc(place, 0);
   EXPECT_GE(allocation_ptr->size(), 0);
 }
+
+TEST(Malloc, AllocWithStream) {
+  size_t size = 1024;
+  AllocationPtr allocation = Alloc(platform::CUDAPlace(), size, phi::Stream(0));
+  EXPECT_EQ(allocation->size(), 1024);
+}
+
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
index 5e4a4234bb..3bf873bcfc 100644
--- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
+++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
@@ -101,6 +101,19 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) {
   CheckMemLeak(place);
 }
 
+TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorWithDefaultStreamTest) {
+  auto &instance = allocation::AllocatorFacade::Instance();
+  platform::CUDAPlace place = platform::CUDAPlace();
+  const std::shared_ptr<Allocator> allocator_implicit_stream =
+      instance.GetAllocator(place);
+  const std::shared_ptr<Allocator> allocator_default_stream =
+      instance.GetAllocator(
+          place, static_cast<phi::GPUContext *>(
+                     platform::DeviceContextPool::Instance().Get(place))
+                     ->stream());
+  EXPECT_EQ(allocator_implicit_stream.get(), allocator_default_stream.get());
+}
+
 TEST(StreamSafeCUDAAllocInterfaceTest, ZeroSizeRecordStreamTest) {
   platform::CUDAPlace place = platform::CUDAPlace();
   std::shared_ptr<Allocation> zero_size_allocation = AllocShared(place, 0);
-- 
GitLab