Unverified commit 0ad2e192, authored by From00, committed by GitHub

Make StreamSafeCUDAAllocator compatible with NaiveBestFit strategy (#40886)

* Make StreamSafeCUDAAllocator compatible with NaiveBestFit strategy

* Set FLAGS_use_stream_safe_cuda_allocator to false

* Update

* Remove unnecessary code

* Fix CI errors

* Add UT
Parent f6b6b057
@@ -32,7 +32,6 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true,
 DECLARE_bool(check_nan_inf);
 DECLARE_bool(benchmark);
 DECLARE_bool(fast_eager_deletion_mode);
-DECLARE_bool(use_stream_safe_cuda_allocator);
 
 constexpr const char* kExceptionCaught = "ExceptionCaught";
 constexpr const char* kTaskCompletion = "TaskCompletion";
@@ -44,7 +43,9 @@ static constexpr size_t kHostNumThreads = 4;
 static constexpr size_t kDeviceNumThreads = 1;
 
 bool IsInterpretercoreFastGCEnabled() {
-  return FLAGS_fast_eager_deletion_mode && FLAGS_use_stream_safe_cuda_allocator;
+  return memory::allocation::AllocatorFacade::Instance()
+             .IsStreamSafeCUDAAllocatorUsed() &&
+         FLAGS_fast_eager_deletion_mode;
 }
 
 InterpreterCore::InterpreterCore(const platform::Place& place,
......
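The point of this hunk is that fast GC no longer trusts the raw flag: FLAGS_use_stream_safe_cuda_allocator may be set while no stream-safe allocator was actually built (e.g., under the naive_best_fit strategy), so the interpreter now asks the AllocatorFacade about its runtime state. A minimal sketch of the pattern, assuming only the IsStreamSafeCUDAAllocatorUsed() interface added by this commit; the function name here is hypothetical and mirrors IsInterpretercoreFastGCEnabled():

#include "paddle/fluid/memory/allocation/allocator_facade.h"

// Gate a feature on the facade's runtime state, not on a gflag that the
// active allocator strategy may silently ignore.
bool FastGCEnabled(bool fast_eager_deletion_mode) {
  return paddle::memory::allocation::AllocatorFacade::Instance()
             .IsStreamSafeCUDAAllocatorUsed() &&
         fast_eager_deletion_mode;
}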
@@ -145,6 +145,14 @@ class CUDAGraphAllocator
 };
 #endif
 
+static bool IsCUDAGraphCapturing() {
+#ifdef PADDLE_WITH_CUDA
+  return UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing());
+#else
+  return false;
+#endif
+}
+
 class AllocatorFacadePrivate {
  public:
  using AllocatorMap = std::map<platform::Place, std::shared_ptr<Allocator>>;
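The new static IsCUDAGraphCapturing() helper folds the PADDLE_WITH_CUDA guard into one place: on HIP or CPU-only builds it is a constant false, so call sites inside the shared GPU code paths can use a plain runtime check instead of a per-site #ifdef. A sketch of the call-site pattern this enables (the branch body is hypothetical):

if (LIKELY(!IsCUDAGraphCapturing())) {
  // Fast path taken whenever no CUDA graph is being captured; on
  // non-CUDA builds the condition constant-folds to true.
}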
@@ -157,6 +165,8 @@ class AllocatorFacadePrivate {
   explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) {
     strategy_ = GetAllocatorStrategy();
+    is_stream_safe_cuda_allocator_used_ = false;
+
     switch (strategy_) {
       case AllocatorStrategy::kNaiveBestFit: {
         InitNaiveBestFitCPUAllocator();
@@ -166,12 +176,6 @@ class AllocatorFacadePrivate {
       }
 #endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-      PADDLE_ENFORCE_EQ(
-          FLAGS_use_stream_safe_cuda_allocator, false,
-          paddle::platform::errors::Unimplemented(
-              "StreamSafeCUDAAllocator is only implemented for auto_growth "
-              "strategy, not support naive_best_fit strategy"));
-
       for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
         InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
       }
@@ -216,21 +220,24 @@ class AllocatorFacadePrivate {
                                              allow_free_idle_chunk_);
       }
 
-      // Note(Ruibiao): For GPU multi-stream case, the 'allocators_' map(place
-      // -> Allocator) hold the StreamSafeCUDAAllocator releate to default
-      // stream (i.e., the stream directly got from DeviceContex), while the
-      // 'cuda_allocators_' map(place -> map(stream -> Allocator)) hold the
-      // StreamSafeCUDAAllocator releate to non-default stream (i.e., the
-      // stream users pass in). The default stream Allocator is built in the
-      // structure of AllocatorFacadePrivate, while the non-default stream is
-      // build in a delayed manner in GetAllocator function with
-      // 'create_if_not_found = ture'. We make special treatment for the
-      // default stream for performance reasons. Since most Alloc calls are
-      // for default stream in application, treating it separately can avoid
-      // lots of overhead of acquiring default stream and applying read-write
-      // lock.
+      // Note(Ruibiao): For the GPU multi-stream case without CUDA graph
+      // capturing, the 'allocators_' map (place -> Allocator) holds the
+      // StreamSafeCUDAAllocator related to the default stream (i.e., the
+      // stream directly got from the DeviceContext), while the
+      // 'cuda_allocators_' map (place -> map(stream -> Allocator)) holds
+      // those related to non-default streams (i.e., the streams users pass
+      // in). The default-stream Allocator is built into the structure of
+      // AllocatorFacadePrivate, while the non-default-stream ones are built
+      // in a delayed manner in the GetAllocator function with
+      // 'create_if_not_found = true'. We treat the default stream specially
+      // for performance reasons: since most Alloc calls in an application
+      // target the default stream, handling it separately avoids the
+      // overhead of acquiring the default stream and the read-write lock.
       if (FLAGS_use_stream_safe_cuda_allocator) {
-        WrapStreamSafeCUDAAllocatorForDefault();
+        if (LIKELY(!IsCUDAGraphCapturing())) {
+          WrapStreamSafeCUDAAllocatorForDefault();
+        }
+        is_stream_safe_cuda_allocator_used_ = true;
       }
 
       InitNaiveBestFitCUDAPinnedAllocator();
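To make the comment above concrete, here is a hedged, self-contained sketch of the two-level lookup it describes. All names are simplified stand-ins, not the real AllocatorFacadePrivate code:

#include <map>
#include <memory>
#include <mutex>
#include <shared_mutex>

// Stand-ins for the real Paddle types; everything here is hypothetical.
using Place = int;      // e.g., a device id
using StreamT = void*;  // e.g., cudaStream_t
struct Allocator {};

static StreamT DefaultStream(Place) { return nullptr; }
static std::shared_ptr<Allocator> MakeStreamSafeAllocator(Place, StreamT) {
  return std::make_shared<Allocator>();
}

struct DemoFacade {
  // Default-stream allocators: looked up without taking the lock.
  std::map<Place, std::shared_ptr<Allocator>> allocators_;
  // Non-default-stream allocators, created lazily per (place, stream).
  std::map<Place, std::map<StreamT, std::shared_ptr<Allocator>>>
      cuda_allocators_;
  std::shared_mutex rw_lock_;

  std::shared_ptr<Allocator> Get(Place p, StreamT s,
                                 bool create_if_not_found) {
    if (s == DefaultStream(p)) {
      return allocators_[p];  // hot path: no read-write lock
    }
    {  // read path under a shared lock
      std::shared_lock<std::shared_mutex> read_guard(rw_lock_);
      auto place_it = cuda_allocators_.find(p);
      if (place_it != cuda_allocators_.end()) {
        auto stream_it = place_it->second.find(s);
        if (stream_it != place_it->second.end()) return stream_it->second;
      }
    }
    if (!create_if_not_found) return nullptr;
    // write path: build the allocator for this (place, stream) once
    std::unique_lock<std::shared_mutex> write_guard(rw_lock_);
    auto& per_stream = cuda_allocators_[p];
    auto it = per_stream.find(s);
    if (it == per_stream.end()) {
      it = per_stream.emplace(s, MakeStreamSafeAllocator(p, s)).first;
    }
    return it->second;
  }
};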
@@ -283,12 +290,6 @@ class AllocatorFacadePrivate {
       }
 #endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-      PADDLE_ENFORCE_EQ(
-          FLAGS_use_stream_safe_cuda_allocator, false,
-          paddle::platform::errors::Unimplemented(
-              "StreamSafeCUDAAllocator is only implemented for auto_growth "
-              "strategy, not support thread_local strategy"));
-
       for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
         InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
       }
@@ -317,8 +318,9 @@ class AllocatorFacadePrivate {
     CheckAllocThreadSafe();
 
 #ifdef PADDLE_WITH_CUDA
-    if (FLAGS_use_stream_safe_cuda_allocator == false &&
-        UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    // No need to wrap CUDAGraphAllocator for StreamSafeCUDAAllocator
+    if (!is_stream_safe_cuda_allocator_used_ &&
+        UNLIKELY(IsCUDAGraphCapturing())) {
       WrapCUDAGraphAllocator();
     }
 #endif
@@ -343,6 +345,11 @@ class AllocatorFacadePrivate {
     return static_cast<Allocation*>(allocation.get())->base_ptr();
   }
 
+  bool IsStreamSafeCUDAAllocatorUsed() {
+    return is_stream_safe_cuda_allocator_used_ &&
+           LIKELY(FLAGS_use_system_allocator == false);
+  }
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   bool HasCUDAAllocator(const platform::CUDAPlace& place,
                         const gpuStream_t& stream) {
@@ -358,9 +365,11 @@ class AllocatorFacadePrivate {
   const std::shared_ptr<Allocator>& GetAllocator(
       const platform::CUDAPlace& place, const gpuStream_t& stream,
       bool create_if_not_found = false) {
-    if (stream == GetDefaultStream(place)) {
-      VLOG(7) << "Get Allocator by passing in a default stream";
-      return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+    if (LIKELY(!IsCUDAGraphCapturing())) {
+      if (stream == GetDefaultStream(place)) {
+        VLOG(7) << "Get Allocator by passing in a default stream";
+        return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+      }
     }
 
     /* shared_lock_guard */ {
@@ -411,42 +420,30 @@ class AllocatorFacadePrivate {
             << place;
   }
 
-  void SetDefaultStreamFromDeviceContext() {
-    VLOG(8) << "Set default stream from DeviceContex";
-    for (auto& pair : default_stream_safe_cuda_allocators_) {
-      platform::DeviceContextPool& pool =
-          platform::DeviceContextPool::Instance();
-      pair.second->SetDefaultStream(
-          static_cast<phi::GPUContext*>(pool.Get(pair.first))->stream());
-    }
-  }
-
   void RecordStream(std::shared_ptr<phi::Allocation> allocation,
                     const gpuStream_t& stream) {
-    if (allocation->size() == 0) {
-      return;
-    }
-
-    StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
-        dynamic_cast<StreamSafeCUDAAllocation*>(allocation.get());
-    PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
-                            platform::errors::InvalidArgument(
-                                "Failed to dynamic cast %p from Allocation* to "
-                                "StreamSafeCUDAAllocation*",
-                                allocation.get()));
-    stream_safe_cuda_allocation->RecordStream(stream);
+    std::shared_ptr<StreamSafeCUDAAllocation> stream_safe_cuda_allocation =
+        std::dynamic_pointer_cast<StreamSafeCUDAAllocation>(allocation);
+    if (stream_safe_cuda_allocation != nullptr) {
+      stream_safe_cuda_allocation->RecordStream(stream);
+    } else {
+      VLOG(6) << "RecordStream for a non-StreamSafeCUDAAllocation";
+    }
   }
 
-  const gpuStream_t& GetStream(
+  const gpuStream_t GetStream(
       const std::shared_ptr<phi::Allocation>& allocation) const {
-    const StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
-        dynamic_cast<const StreamSafeCUDAAllocation*>(allocation.get());
-    PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
-                            platform::errors::InvalidArgument(
-                                "Failed to dynamic cast %p from Allocation* to "
-                                "StreamSafeCUDAAllocation*",
-                                allocation.get()));
-    return stream_safe_cuda_allocation->GetOwningStream();
+    const std::shared_ptr<StreamSafeCUDAAllocation>
+        stream_safe_cuda_allocation =
+            std::dynamic_pointer_cast<StreamSafeCUDAAllocation>(allocation);
+    if (stream_safe_cuda_allocation != nullptr) {
+      return stream_safe_cuda_allocation->GetOwningStream();
+    }
+
+    VLOG(6) << "GetStream for a non-StreamSafeCUDAAllocation";
+    return static_cast<phi::GPUContext*>(
+               platform::DeviceContextPool::Instance().Get(allocation->place()))
+        ->stream();
   }
 
 #endif
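With the PADDLE_ENFORCE_NOT_NULL replaced by a dynamic_pointer_cast null check, RecordStream and GetStream now degrade gracefully when an allocation did not come from a StreamSafeCUDAAllocator (e.g., under naive_best_fit): recording becomes a VLOG(6) no-op, and GetStream falls back to the stream of the allocation's place taken from the DeviceContext. A hedged usage sketch of the cross-stream pattern this protects, assuming the memory::AllocShared overload that takes a phi::Stream; stream creation is elided and the names are hypothetical:

// `buf` is allocated for one stream; before letting another stream read
// it, record that stream so the memory is not reused too early. After
// this commit the call is safe even if `buf` is an ordinary Allocation.
platform::CUDAPlace place;
gpuStream_t consumer_stream = /* a user-created stream */ nullptr;
std::shared_ptr<phi::Allocation> buf = paddle::memory::AllocShared(
    place, /*size=*/1024, phi::Stream(0));  // 0: stream id, as in the new UT
// ... enqueue kernels reading `buf` on consumer_stream ...
paddle::memory::RecordStream(buf, consumer_stream);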
@@ -880,7 +877,7 @@ class AllocatorFacadePrivate {
     CheckAllocThreadSafe(zero_size_allocators_);
     CheckAllocThreadSafe(system_allocators_);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    if (FLAGS_use_stream_safe_cuda_allocator) {
+    if (is_stream_safe_cuda_allocator_used_) {
       CheckCUDAAllocThreadSafe(cuda_allocators_);
     }
 #endif
@@ -910,6 +907,7 @@ class AllocatorFacadePrivate {
   static AllocatorMap zero_size_allocators_;
   static AllocatorMap system_allocators_;
   bool allow_free_idle_chunk_;
+  bool is_stream_safe_cuda_allocator_used_;
 };
 
 AllocatorFacadePrivate::AllocatorMap
     AllocatorFacadePrivate::zero_size_allocators_;
@@ -928,7 +926,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
 AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
 #ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+  if (UNLIKELY(IsCUDAGraphCapturing())) {
     auto id = platform::CUDAGraph::CapturingID();
     auto iter = cuda_graph_map_.find(id);
     PADDLE_ENFORCE_NE(
@@ -986,35 +984,25 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
 
 std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size, const phi::Stream& stream) {
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'AllocaShared' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
   return std::shared_ptr<phi::Allocation>(Alloc(place, size, stream));
 }
 
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
                                      const phi::Stream& stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'Alloc' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
+  AllocatorFacadePrivate* m = GetPrivate();
+  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
+    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
+    return Alloc(place, size);
+  }
+
   platform::CUDAPlace p(place.GetDeviceId());
   if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
     gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
-    return GetPrivate()
-        ->GetAllocator(p, s, /* create_if_not_found = */ true)
+    return m->GetAllocator(p, s, /* create_if_not_found = */ true)
         ->Allocate(size);
   } else {
-    return GetPrivate()->GetAllocator(p, size)->Allocate(size);
+    return m->GetAllocator(p, size)->Allocate(size);
   }
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
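Note the new fallback semantics: instead of hard-failing when the stream-safe allocator is disabled, the stream-aware Alloc logs at VLOG(6) and forwards to the stream-agnostic overload, so callers need no strategy check of their own. A short sketch, assuming a CUDA build:

// Works under auto_growth with stream-safe allocation (routed to the
// per-stream allocator) and under naive_best_fit (falls back to the
// plain Alloc(place, size) path).
paddle::memory::AllocationPtr a =
    paddle::memory::Alloc(platform::CUDAPlace(0), 1024, phi::Stream(0));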
@@ -1025,13 +1013,6 @@ bool AllocatorFacade::InSameStream(
     const std::shared_ptr<phi::Allocation>& allocation,
     const phi::Stream& stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'InSameStream' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
   gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
   return s == GetStream(allocation);
 #else
@@ -1039,58 +1020,52 @@ bool AllocatorFacade::InSameStream(
 #endif
 }
 
+bool AllocatorFacade::IsStreamSafeCUDAAllocatorUsed() {
+  return GetPrivate()->IsStreamSafeCUDAAllocatorUsed();
+}
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
                                   const gpuStream_t& stream) {
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'Release' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
-  return GetPrivate()->GetAllocator(place, stream)->Release(place);
+  AllocatorFacadePrivate* m = GetPrivate();
+  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
+    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
+    return Release(place);
+  }
+
+  return m->GetAllocator(place, stream)->Release(place);
 }
 
 void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
                                    const gpuStream_t& stream) {
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "'RecordStream' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
   GetPrivate()->RecordStream(allocation, stream);
 }
 
 const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
     const platform::Place& place, const gpuStream_t& stream) {
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    return GetPrivate()->GetAllocator(place, stream,
-                                      /*create_if_not_found=*/true);
+  AllocatorFacadePrivate* m = GetPrivate();
+
+  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
+    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
+    return GetAllocator(place);
   }
-  return GetPrivate()->GetAllocator(
-      place, /* A non-zero num to choose allocator_ */ 1);
+
+  if (platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) {
+    return m->GetAllocator(place, stream,
+                           /*create_if_not_found=*/true);
+  }
+  return m->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
 }
 
-const gpuStream_t& AllocatorFacade::GetStream(
+const gpuStream_t AllocatorFacade::GetStream(
     const std::shared_ptr<phi::Allocation>& allocation) const {
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "'GetStream' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
   return GetPrivate()->GetStream(allocation);
 }
 
 void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
                                        const gpuStream_t& stream) {
-  if (FLAGS_use_stream_safe_cuda_allocator) {
-    GetPrivate()->SetDefaultStream(place, stream);
+  if (m_->IsStreamSafeCUDAAllocatorUsed()) {
+    m_->SetDefaultStream(place, stream);
   }
 }
@@ -1109,7 +1084,6 @@ void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
           "The memory pool of the CUDA Graph with ID %d have been prepared.",
           id));
   allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
-  allocator->SetDefaultStreamFromDeviceContext();
 
   VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
 }
......
@@ -76,6 +76,8 @@ class AllocatorFacade {
   bool InSameStream(const std::shared_ptr<Allocation>& allocation,
                     const phi::Stream& stream);
 
+  bool IsStreamSafeCUDAAllocatorUsed();
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed.
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
@@ -83,7 +85,7 @@ class AllocatorFacade {
                     const gpuStream_t& stream);
   const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
                                                  const gpuStream_t& stream);
-  const gpuStream_t& GetStream(
+  const gpuStream_t GetStream(
       const std::shared_ptr<Allocation>& allocation) const;
   void SetDefaultStream(const platform::CUDAPlace& place,
                         const gpuStream_t& stream);
......
@@ -67,7 +67,7 @@ void RecordStream(std::shared_ptr<Allocation> allocation,
                   stream);
 }
 
-const gpuStream_t& GetStream(const std::shared_ptr<Allocation>& allocation) {
+const gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation) {
   return allocation::AllocatorFacade::Instance().GetStream(allocation);
 }
......
@@ -56,7 +56,7 @@ extern uint64_t Release(const platform::CUDAPlace& place,
 void RecordStream(std::shared_ptr<Allocation> allocation,
                   const gpuStream_t& stream);
 
-const gpuStream_t& GetStream(const std::shared_ptr<Allocation>& allocation);
+const gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation);
 #endif
 }  // namespace memory
 }  // namespace paddle
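The signature change from 'const gpuStream_t&' to 'const gpuStream_t' is forced by the new fallback path: for a non-StreamSafeCUDAAllocation the stream is now fetched from the DeviceContext inside the call, so there is no long-lived member to return a reference to. Since gpuStream_t is just a handle (a pointer typedef), returning it by value is cheap. A minimal illustration with hypothetical helper names:

gpuStream_t LookUpStreamFromContext();  // hypothetical: queries a context

const gpuStream_t& BrokenGetStream() {
  gpuStream_t s = LookUpStreamFromContext();
  return s;  // dangling: binds the returned reference to a local
}

gpuStream_t SafeGetStream() {
  return LookUpStreamFromContext();  // copies the handle; always valid
}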
@@ -12,6 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/core/stream.h"
+
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -21,14 +30,6 @@
 #include <hip/hip_runtime.h>
 #endif
 
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/device_context.h"
-
 namespace paddle {
 namespace memory {
@@ -196,5 +197,12 @@ TEST(Malloc, AllocZero) {
   AllocationPtr allocation_ptr = Alloc(place, 0);
   EXPECT_GE(allocation_ptr->size(), 0);
 }
+
+TEST(Malloc, AllocWithStream) {
+  size_t size = 1024;
+  AllocationPtr allocation = Alloc(platform::CUDAPlace(), size, phi::Stream(0));
+  EXPECT_EQ(allocation->size(), 1024);
+}
+
 }  // namespace memory
 }  // namespace paddle
@@ -101,6 +101,19 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) {
   CheckMemLeak(place);
 }
 
+TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorWithDefaultStreamTest) {
+  auto &instance = allocation::AllocatorFacade::Instance();
+  platform::CUDAPlace place = platform::CUDAPlace();
+
+  const std::shared_ptr<Allocator> allocator_implicit_stream =
+      instance.GetAllocator(place);
+  const std::shared_ptr<Allocator> allocator_default_stream =
+      instance.GetAllocator(
+          place, static_cast<phi::GPUContext *>(
+                     platform::DeviceContextPool::Instance().Get(place))
+                     ->stream());
+  EXPECT_EQ(allocator_implicit_stream.get(), allocator_default_stream.get());
+}
+
 TEST(StreamSafeCUDAAllocInterfaceTest, ZeroSizeRecordStreamTest) {
   platform::CUDAPlace place = platform::CUDAPlace();
   std::shared_ptr<Allocation> zero_size_allocation = AllocShared(place, 0);
......