Unverified commit b9c464c3, authored by F From00, committed by GitHub

Support multi-stream allocation for CUDA place (#37290)

* Support multi-stream allocation for CUDA place

* Do not notify the retrying from other streams when free CUDA allocation

* Fix compile error for CPU

* Fix compile error for HIP

* Release memory for StreamSafeCUDAAllocRetry in malloc_test

* Add FLAGS_use_stream_safe_cuda_allocator

* Fix CI error for 'set_tests_properties'

* Invalidate stream safe CUDA allocator for naive_best_fit and thread_local strategy

* Performance improvement: insert allocation pair to outstanding_events_map when free but not alloc; replace recursive_mutex with SpinLock

* FLAGS priority changes: FLAGS_use_system_allocator > FLAGS_use_stream_safe_cuda_allocator

* Performance improvement: directly delete allocation when the recorded_streams is empty in FreeImpl of StreamSafeCUDAAllocator

* Add UT for alloc interface

* Changes multi-stream interface; move retry code from AllocatorFacadePrivate to StreamSafeCUDAAllocator
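Taken together, the new interface consists of stream-aware variants of AllocShared / Alloc / Release plus RecordStream. The sketch below is illustrative only and not part of this diff; it assumes a CUDA build, the auto_growth strategy, and FLAGS_use_stream_safe_cuda_allocator left at its default of true.

#include <cuda_runtime.h>
#include "paddle/fluid/memory/malloc.h"

void MultiStreamAllocSketch() {
  paddle::platform::CUDAPlace place(0);
  gpuStream_t stream;
  cudaStreamCreate(&stream);
  // Allocate 1 KB owned by `stream`; it is served by the
  // StreamSafeCUDAAllocator bound to (place, stream).
  auto buf = paddle::memory::AllocShared(place, 1024, stream);
  // If a kernel running on another stream uses `buf`, record that stream so
  // the memory is not reused before that kernel finishes:
  //   paddle::memory::RecordStream(buf.get(), other_stream);
  buf.reset();  // freed immediately here; deferred via events if other streams were recorded
  paddle::memory::Release(place, stream);  // return idle chunks to the device
  cudaStreamDestroy(stream);
}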
Parent adb54eb0
@@ -17,6 +17,16 @@ if (WITH_GPU)
nv_test(malloc_test
SRCS malloc_test.cu
DEPS device_context malloc)
nv_test(stream_safe_cuda_alloc_test
SRCS stream_safe_cuda_alloc_test.cu
DEPS malloc)
if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test)
set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES
ENVIRONMENT "FLAGS_use_system_allocator=false"
ENVIRONMENT "FLAGS_enable_stream_safe_cuda_allocator=true"
ENVIRONMENT "FLAGS_allocator_strategy=auto_growth")
endif()
endif()
if (WITH_ROCM)
...
@@ -15,8 +15,10 @@ endif()
if (WITH_GPU)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
if(CUDA_VERSION GREATER_EQUAL 10.2)
nv_library(cuda_virtual_mem_allocator SRCS cuda_virtual_mem_allocator.cc DEPS dynload_cuda)
@@ -25,8 +27,10 @@ endif()
if (WITH_ROCM)
hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
hip_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
hip_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
endif()
@@ -38,7 +42,7 @@ endif()
cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
if (WITH_GPU OR WITH_ROCM)
set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator)
if(CUDA_VERSION GREATER_EQUAL 10.2)
list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator)
endif()
...
@@ -15,36 +15,45 @@
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "gflags/gflags.h"
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/platform/place.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#include "paddle/fluid/platform/cuda_graph.h"
#else
#include <hip/hip_runtime.h>
#endif
#if CUDA_VERSION >= 10020
#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h"
#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h"
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_graph.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif
#include "paddle/fluid/platform/npu_info.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
PADDLE_DEFINE_EXPORTED_int64(
gpu_allocator_retry_time, 10000,
@@ -59,6 +68,12 @@ PADDLE_DEFINE_EXPORTED_bool(
PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false,
"Use VirtualMemoryAutoGrowthBestFitAllocator.");
// NOTE(Ruibiao): This flag exists only for compatibility with
// the old single-stream CUDA allocator. It will be removed
// after StreamSafeCUDAAllocator has been fully tested.
PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, true,
"Enable StreamSafeCUDAAllocator");
DECLARE_string(allocator_strategy);
namespace paddle {
@@ -114,23 +129,34 @@ class AllocatorFacadePrivate {
public:
using AllocatorMap = std::map<platform::Place, std::shared_ptr<Allocator>>;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
using CUDAAllocatorMap =
std::map<platform::CUDAPlace,
std::map<gpuStream_t, std::shared_ptr<Allocator>>>;
#endif
explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) {
strategy_ = GetAllocatorStrategy();
switch (strategy_) {
case AllocatorStrategy::kNaiveBestFit: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator) {
LOG(WARNING) << "FLAGS_use_stream_safe_cuda_allocator is invalid for "
"naive_best_fit strategy";
FLAGS_use_stream_safe_cuda_allocator = false;
}
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
@@ -142,18 +168,29 @@ class AllocatorFacadePrivate {
case AllocatorStrategy::kAutoGrowth: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
allow_free_idle_chunk_ = allow_free_idle_chunk;
if (FLAGS_use_stream_safe_cuda_allocator) {
// TODO(Ruibiao): Support multi-stream allocator for other strategies
default_stream_ = nullptr;
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id),
default_stream_);
}
} else {
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
allow_free_idle_chunk_);
}
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
break;
}
@@ -166,6 +203,12 @@ class AllocatorFacadePrivate {
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator) {
LOG(WARNING) << "FLAGS_use_stream_safe_cuda_allocator is invalid for "
"thread_local strategy";
FLAGS_use_stream_safe_cuda_allocator = false;
}
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
@@ -190,24 +233,6 @@ class AllocatorFacadePrivate {
CheckAllocThreadSafe();
}
inline const AllocatorMap& GetAllocatorMap() {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
auto id = platform::CUDAGraph::CapturingID();
auto iter = cuda_graph_allocator_map_.find(id);
PADDLE_ENFORCE_NE(
iter, cuda_graph_allocator_map_.end(),
platform::errors::PermissionDenied(
"No memory pool is prepared for CUDA Graph capturing."));
return iter->second->allocators_;
} else {
return allocators_;
}
#else
return allocators_;
#endif
}
inline const std::shared_ptr<Allocator>& GetAllocator(
const platform::Place& place, size_t size) {
VLOG(6) << "GetAllocator"
@@ -223,25 +248,106 @@ class AllocatorFacadePrivate {
return iter->second;
}
private:
void InitSystemAllocators() {
if (!system_allocators_.empty()) return;
system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
int device_count = platform::GetXPUDeviceCount();
for (int i = 0; i < device_count; ++i) {
platform::XPUPlace p(i);
system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
system_allocators_[platform::CUDAPinnedPlace()] =
std::make_shared<CPUPinnedAllocator>();
int device_count = platform::GetCUDADeviceCount();
for (int i = 0; i < device_count; ++i) {
platform::CUDAPlace p(i);
system_allocators_[p] = std::make_shared<CUDAAllocator>(p);
}
#endif
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
const std::shared_ptr<Allocator>& GetAllocator(
const platform::CUDAPlace& place, const gpuStream_t& stream,
bool create_if_not_found = false) {
auto place_it = cuda_allocators_.find(place);
PADDLE_ENFORCE_NE(place_it, cuda_allocators_.end(),
platform::errors::NotFound(
"No allocator found for the place %s", place));
const std::map<gpuStream_t, std::shared_ptr<Allocator>>& allocator_map =
place_it->second;
auto stream_it = allocator_map.find(stream);
if (stream_it == allocator_map.end()) {
if (create_if_not_found) {
InitStreamSafeCUDAAllocator(place, stream);
return cuda_allocators_[place][stream];
} else {
PADDLE_THROW(platform::errors::NotFound(
"No allocator found for stream %s in place %s", stream, place));
}
}
return stream_it->second;
}
gpuStream_t GetDefaultStream() { return default_stream_; }
void RecordStream(Allocation* allocation, const gpuStream_t& stream) {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(allocation->place()), true,
platform::errors::InvalidArgument(
"Not allow to record stream for an allocation with place %s",
allocation->place()));
dynamic_cast<StreamSafeCUDAAllocation*>(allocation)->RecordStream(stream);
}
#ifdef PADDLE_WITH_CUDA
void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth,
platform::errors::InvalidArgument(
"CUDA Graph is only supported when the "
"FLAGS_allocator_strategy=\"auto_growth\", but got "
"FLAGS_allocator_strategy=\"%s\"",
FLAGS_allocator_strategy));
auto& allocator = cuda_graph_allocator_map_[id];
PADDLE_ENFORCE_EQ(
allocator.get(), nullptr,
platform::errors::InvalidArgument(
"The memory pool of the CUDA Graph with ID %d have been prepared.",
id));
allocator.reset(
new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
for (auto& item : allocator->allocators_) {
auto& old_allocator = item.second;
old_allocator = CUDAGraphAllocator::Create(old_allocator);
}
VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
}
void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
auto iter = cuda_graph_allocator_map_.find(id);
PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(),
platform::errors::InvalidArgument(
"Cannot find CUDA Graph with ID = %d", id));
cuda_graph_allocator_map_.erase(iter);
VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
}
#endif
#endif
private:
class ZeroSizeAllocator : public Allocator {
public:
explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
bool IsAllocThreadSafe() const override { return true; }
protected:
Allocation* AllocateImpl(size_t size) override {
return new Allocation(nullptr, 0, place_);
}
void FreeImpl(Allocation* allocation) override { delete allocation; }
private:
platform::Place place_;
};
const AllocatorMap& GetAllocatorMap() {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
auto id = platform::CUDAGraph::CapturingID();
auto iter = cuda_graph_allocator_map_.find(id);
PADDLE_ENFORCE_NE(
iter, cuda_graph_allocator_map_.end(),
platform::errors::PermissionDenied(
"No memory pool is prepared for CUDA Graph capturing."));
return iter->second->allocators_;
} else {
return allocators_;
}
#else
return allocators_;
#endif
}
@@ -256,14 +362,108 @@ class AllocatorFacadePrivate {
std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
}
void InitStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
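// Only the auto_growth strategy is supported. GetAllocator(p, stream) throws
// EnforceNotMet when no allocator exists yet for this (place, stream) pair;
// in that case build the auto-growth allocator, wrap it with
// StreamSafeCUDAAllocator, and add the retry wrapper. Otherwise another
// thread has already created it and there is nothing to do.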
PADDLE_ENFORCE_EQ(
strategy_, AllocatorStrategy::kAutoGrowth,
platform::errors::Unimplemented(
"Only support auto-growth strategey for StreamSafeCUDAAllocator, "
"the allocator strategy %d is unsupported for multi-stream",
static_cast<int>(strategy_)));
VLOG(9) << "Init CUDA allocator for stream " << stream << " in place " << p;
std::lock_guard<SpinLock> lock_guard(cuda_allocators_lock_);
try {
GetAllocator(p, stream);
VLOG(9) << "Other thread had build a allocator for stream " << stream
<< " in place " << p;
} catch (platform::EnforceNotMet&) {
InitAutoGrowthCUDAAllocator(p, stream);
WrapStreamSafeCUDAAllocator(p, stream);
WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time);
} catch (...) {
throw;
}
}
void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) {
allocators_[p] = std::make_shared<ThreadLocalCUDAAllocator>(p);
}
void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk_);
#endif
#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 10020
CUdevice device;
int val;
try {
PADDLE_ENFORCE_CUDA_SUCCESS(
paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId()));
PADDLE_ENFORCE_CUDA_SUCCESS(
paddle::platform::dynload::cuDeviceGetAttribute(
&val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
device));
} catch (...) {
val = 0;
}
if (val > 0 && FLAGS_use_virtual_memory_auto_growth) {
auto cuda_allocator = std::make_shared<CUDAVirtualMemAllocator>(p);
cuda_allocators_[p][stream] =
std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), p);
} else {
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(),
allow_free_idle_chunk_);
}
#else
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
auto alignment = platform::GpuMinChunkSize();
bool need_addr_align = true;
// NOTE: Since the CUDA runtime cannot be forked, calling any CUDA API in a
// forked process may return cuda error(3), i.e.
// cudaErrorInitializationError, even though the CUDAAllocator is only
// initialized but not really used.
// The try-catch block handles the case where GetDeviceProperties() fails in
// a multi-process setting (for example, a DataLoader with num_workers > 0).
try {
const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
need_addr_align = prop.textureAlignment < alignment;
VLOG(4) << "GetDeviceProperties ok, textureAlignment: "
<< prop.textureAlignment
<< ", set need_addr_align=" << need_addr_align;
} catch (...) {
need_addr_align = true;
VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
}
// The address returned is aligned already,
// ref:
// https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
std::shared_ptr<Allocator> underlying_allocator{nullptr};
if (need_addr_align) {
VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
underlying_allocator =
std::make_shared<AlignedAllocator>(underlying_allocator, alignment);
} else {
VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
underlying_allocator = cuda_allocator;
}
cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
underlying_allocator, alignment, 0, allow_free_idle_chunk_);
#endif
#endif
} }
// NOTE(Ruibiao): Old single-stream version, will be removed later
void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
bool allow_free_idle_chunk) {
#if defined(PADDLE_WITH_HIP)
@@ -337,6 +537,37 @@ class AllocatorFacadePrivate {
#endif
#endif
}
void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) {
allocators_[p] = std::make_shared<ThreadLocalCUDAAllocator>(p);
}
void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
const std::shared_ptr<Allocator>& underlying_allocator =
GetAllocator(p, stream);
cuda_allocators_[p][stream] = std::make_shared<StreamSafeCUDAAllocator>(
underlying_allocator, p, stream);
}
void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,
size_t retry_time) {
PADDLE_ENFORCE_GT(
retry_time, 0,
platform::errors::InvalidArgument(
"Retry time should be larger than 0, but got %d", retry_time));
std::shared_ptr<Allocator> allocator = GetAllocator(p, stream);
allocator = std::make_shared<RetryAllocator>(allocator, retry_time);
}
static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) {
for (auto& place_pair : allocators) {
for (auto& stream_pair : place_pair.second) {
PADDLE_ENFORCE_EQ(stream_pair.second->IsAllocThreadSafe(), true,
platform::errors::InvalidArgument(
"Public allocators must be thread safe"));
}
}
}
#endif
#ifdef PADDLE_WITH_XPU
@@ -354,25 +585,28 @@ class AllocatorFacadePrivate {
allocators_[platform::NPUPinnedPlace()] =
std::make_shared<paddle::memory::allocation::NPUPinnedAllocator>();
}
#endif
class ZeroSizeAllocator : public Allocator {
public:
explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
bool IsAllocThreadSafe() const override { return true; }
protected:
Allocation* AllocateImpl(size_t size) override {
return new Allocation(nullptr, 0, place_);
}
void FreeImpl(Allocation* allocation) override { delete allocation; }
private:
platform::Place place_;
};
void InitSystemAllocators() {
if (!system_allocators_.empty()) return;
system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
int device_count = platform::GetXPUDeviceCount();
for (int i = 0; i < device_count; ++i) {
platform::XPUPlace p(i);
system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
system_allocators_[platform::CUDAPinnedPlace()] =
std::make_shared<CPUPinnedAllocator>();
int device_count = platform::GetCUDADeviceCount();
for (int i = 0; i < device_count; ++i) {
platform::CUDAPlace p(i);
system_allocators_[p] = std::make_shared<CUDAAllocator>(p);
}
#endif
}
void InitZeroSizeAllocators() {
if (!zero_size_allocators_.empty()) return;
@@ -415,8 +649,14 @@ class AllocatorFacadePrivate {
CheckAllocThreadSafe(allocators_);
CheckAllocThreadSafe(zero_size_allocators_);
CheckAllocThreadSafe(system_allocators_);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator) {
CheckCUDAAllocThreadSafe(cuda_allocators_);
}
#endif
}
// NOTE(Ruibiao): Old single-stream version, will be removed later
void WrapCUDARetryAllocator(size_t retry_time) {
PADDLE_ENFORCE_GT(
retry_time, 0,
@@ -429,53 +669,22 @@ class AllocatorFacadePrivate {
}
}
#ifdef PADDLE_WITH_CUDA
public:
void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth,
platform::errors::InvalidArgument(
"CUDA Graph is only supported when the "
"FLAGS_allocator_strategy=\"auto_growth\", but got "
"FLAGS_allocator_strategy=\"%s\"",
FLAGS_allocator_strategy));
auto& allocator = cuda_graph_allocator_map_[id];
PADDLE_ENFORCE_EQ(
allocator.get(), nullptr,
platform::errors::InvalidArgument(
"The memory pool of the CUDA Graph with ID %d have been prepared.",
id));
allocator.reset(
new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
for (auto& item : allocator->allocators_) {
auto& old_allocator = item.second;
old_allocator = CUDAGraphAllocator::Create(old_allocator);
}
VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
}
void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
auto iter = cuda_graph_allocator_map_.find(id);
PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(),
platform::errors::InvalidArgument(
"Cannot find CUDA Graph with ID = %d", id));
cuda_graph_allocator_map_.erase(iter);
VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
}
#endif
private:
AllocatorMap allocators_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// a standalone CUDA allocator to support multi-stream GC in new executor
CUDAAllocatorMap cuda_allocators_;
gpuStream_t default_stream_;
SpinLock cuda_allocators_lock_;
#ifdef PADDLE_WITH_CUDA
std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
cuda_graph_allocator_map_;
#endif
#endif
AllocatorStrategy strategy_;
AllocatorMap allocators_;
static AllocatorMap zero_size_allocators_;
static AllocatorMap system_allocators_;
bool allow_free_idle_chunk_;
};
AllocatorFacadePrivate::AllocatorMap
AllocatorFacadePrivate::zero_size_allocators_;
AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_;
@@ -491,6 +700,18 @@ AllocatorFacade& AllocatorFacade::Instance() {
return instance;
}
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
const platform::Place& place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) {
return m_->GetAllocator(BOOST_GET_CONST(platform::CUDAPlace, place),
m_->GetDefaultStream());
}
#endif
return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}
std::shared_ptr<Allocation> AllocatorFacade::AllocShared( std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
const platform::Place& place, size_t size) { const platform::Place& place, size_t size) {
return std::shared_ptr<Allocation>(Alloc(place, size)); return std::shared_ptr<Allocation>(Alloc(place, size));
...@@ -498,17 +719,80 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared( ...@@ -498,17 +719,80 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
size_t size) { size_t size) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
size > 0 && FLAGS_use_system_allocator == false) {
return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), size,
m_->GetDefaultStream());
}
#endif
return m_->GetAllocator(place, size)->Allocate(size); return m_->GetAllocator(place, size)->Allocate(size);
} }
uint64_t AllocatorFacade::Release(const platform::Place& place) { uint64_t AllocatorFacade::Release(const platform::Place& place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) {
return Release(BOOST_GET_CONST(platform::CUDAPlace, place),
m_->GetDefaultStream());
}
#endif
return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
->Release(place); ->Release(place);
} }
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
const platform::Place& place) { std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); const platform::CUDAPlace& place, size_t size, const gpuStream_t& stream) {
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, true,
platform::errors::Unimplemented(
"StreamSafeCUDAAllocator is disabled, you should not call this "
"multi-stream 'AllocaShared' function. "
"To enable it, you can enter 'export "
"FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
return std::shared_ptr<Allocation>(Alloc(place, size, stream));
}
AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place,
size_t size, const gpuStream_t& stream) {
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, true,
platform::errors::Unimplemented(
"StreamSafeCUDAAllocator is disabled, you should not call this "
"multi-stream 'Alloca' function. "
"To enable it, you can enter 'export "
"FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
return m_->GetAllocator(place, stream, /* creat_if_not_found = */ true)
->Allocate(size);
} else {
return m_->GetAllocator(place, size)->Allocate(size);
}
}
uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
const gpuStream_t& stream) {
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, true,
platform::errors::Unimplemented(
"StreamSafeCUDAAllocator is disabled, you should not call this "
"multi-stream 'Release' function. "
"To enable it, you can enter 'export "
"FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
return m_->GetAllocator(place, stream)->Release(place);
}
void AllocatorFacade::RecordStream(Allocation* allocation,
const gpuStream_t& stream) {
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, true,
platform::errors::Unimplemented(
"StreamSafeCUDAAllocator is disabled, you should not call this "
"'RecordStream' function. "
"To enable it, you can enter 'export "
"FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
m_->RecordStream(allocation, stream);
}
#ifdef PADDLE_WITH_CUDA
@@ -520,7 +804,7 @@ void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
return m_->RemoveMemoryPoolOfCUDAGraph(id);
}
#endif
#endif
} // namespace allocation
} // namespace memory
} // namespace paddle
@@ -26,6 +26,7 @@
namespace paddle {
namespace memory {
namespace allocation {
#ifdef PADDLE_WITH_ASCEND_CL
using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator;
#endif
@@ -40,26 +41,34 @@ using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator;
class AllocatorFacadePrivate;
class AllocatorFacade {
public:
~AllocatorFacade();
AllocatorFacade(const AllocatorFacade& o) = delete; AllocatorFacade(const AllocatorFacade& o) = delete;
const AllocatorFacade& operator=(const AllocatorFacade& o) = delete; const AllocatorFacade& operator=(const AllocatorFacade& o) = delete;
~AllocatorFacade();
static AllocatorFacade& Instance(); static AllocatorFacade& Instance();
const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);
// Allocate a shared allocation. // Allocate a shared allocation.
std::shared_ptr<Allocation> AllocShared(const platform::Place& place, std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
size_t size); size_t size);
// Allocate a unique allocation. // Allocate a unique allocation.
AllocationPtr Alloc(const platform::Place& place, size_t size); AllocationPtr Alloc(const platform::Place& place, size_t size);
// Release unused memory pool. // Release unused memory pool.
uint64_t Release(const platform::Place& place); uint64_t Release(const platform::Place& place);
const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::shared_ptr<Allocation> AllocShared(const platform::CUDAPlace& place,
size_t size,
const gpuStream_t& stream);
AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
const gpuStream_t& stream);
uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
void RecordStream(Allocation* allocation, const gpuStream_t& stream);
#ifdef PADDLE_WITH_CUDA
void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id);
void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id);
#endif
#endif
// TODO(yy): Allocate a Copy-On-Write allocation?
...
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace memory {
namespace allocation {
StreamSafeCUDAAllocation::StreamSafeCUDAAllocation(
AllocationPtr underlying_allocation, gpuStream_t owning_stream)
: Allocation(underlying_allocation->ptr(), underlying_allocation->size(),
underlying_allocation->place()),
underlying_allocation_(std::move(underlying_allocation)),
owning_stream_(owning_stream),
recorded_streams_(std::make_shared<std::set<gpuStream_t>>()) {}
void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) {
VLOG(8) << "Record stream " << stream << " to " << ptr();
if (stream == owning_stream_) {
return;
}
std::lock_guard<SpinLock> lock_guard(spin_lock_);
recorded_streams_->insert(stream);
}
std::shared_ptr<std::set<gpuStream_t>>
StreamSafeCUDAAllocation::GetRecordedStreams() {
return recorded_streams_;
}
StreamSafeCUDAAllocator::StreamSafeCUDAAllocator(
const std::shared_ptr<Allocator>& underlying_allocator,
const platform::CUDAPlace& place, const gpuStream_t default_stream)
: underlying_allocator_(underlying_allocator),
place_(place),
default_stream_(default_stream) {
std::lock_guard<SpinLock> lock_guard(allocators_map_lock_);
allocators_map_[place].emplace_back(this);
}
StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
std::lock_guard<SpinLock> lock_guard(allocators_map_lock_);
std::vector<StreamSafeCUDAAllocator*>& allocators = allocators_map_[place_];
allocators.erase(std::remove(allocators.begin(), allocators.end(), this),
allocators.end());
}
bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }
Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
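// First reclaim any allocations whose recorded-stream events have completed,
// then try the underlying allocator. On BadAlloc, release idle memory from
// every allocator on this place and retry once before rethrowing.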
ProcessEventsAndFree();
AllocationPtr underlying_allocation;
try {
underlying_allocation = underlying_allocator_->Allocate(size);
} catch (BadAlloc&) {
VLOG(9) << "Allocation failed when allocating " << size << " bytes";
uint64_t release_size = ReleaseImpl(place_);
VLOG(9) << "Release " << release_size << " bytes memory from all streams";
try {
underlying_allocation = underlying_allocator_->Allocate(size);
} catch (...) {
VLOG(9) << "Still allocation failed after release memory";
throw;
}
} catch (...) {
throw;
}
StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation(
std::move(underlying_allocation), default_stream_);
return allocation;
}
void StreamSafeCUDAAllocator::FreeImpl(Allocation* allocation) {
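// If no stream other than the owning stream was recorded for this
// allocation, it can be deleted immediately. Otherwise record an event on
// every recorded stream and defer the actual free until those events
// complete.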
if (dynamic_cast<StreamSafeCUDAAllocation*>(allocation)
->GetRecordedStreams()
->empty()) {
delete allocation;
} else {
std::lock_guard<SpinLock> lock_guard(outstanding_events_map_lock_);
FreeStreamSafeCUDAAllocation(allocation);
}
}
uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
std::lock_guard<SpinLock> lock_guard(allocators_map_lock_);
std::vector<StreamSafeCUDAAllocator*>& allocators =
allocators_map_[BOOST_GET_CONST(platform::CUDAPlace, place)];
uint64_t release_size = 0;
for (StreamSafeCUDAAllocator* allocator : allocators) {
release_size += allocator->ProcessEventsAndFreeWithRelease();
}
return release_size;
}
void StreamSafeCUDAAllocator::CreateEventForAllRecordedStream(
std::set<gpuStream_t>* recorded_streams,
std::deque<gpuEvent_t>* outstanding_events) {
for (gpuStream_t stream : *recorded_streams) {
gpuEvent_t event;
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
hipEventCreateWithFlags(&event, hipEventDisableTiming));
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream));
#endif
outstanding_events->emplace_back(event);
VLOG(9) << "Record event " << event << " in stream " << stream;
}
recorded_streams->clear();
}
void StreamSafeCUDAAllocator::FreeStreamSafeCUDAAllocation(
Allocation* allocation) {
std::deque<gpuEvent_t>& outstanding_events =
outstanding_events_map_[allocation];
CreateEventForAllRecordedStream(
dynamic_cast<StreamSafeCUDAAllocation*>(allocation)
->GetRecordedStreams()
.get(),
&outstanding_events);
if (!outstanding_events.empty()) {
VLOG(8) << allocation->ptr() << " is not ready to free";
return;
}
VLOG(8) << "Free " << allocation->ptr();
outstanding_events_map_.erase(allocation);
delete allocation;
}
void StreamSafeCUDAAllocator::ProcessEventsAndFree() {
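// Walk all outstanding events: destroy those that have completed, stop at
// the first still-pending event in each queue, and free any allocation whose
// event queue becomes empty.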
std::lock_guard<SpinLock> lock_guard(outstanding_events_map_lock_);
for (auto map_it = outstanding_events_map_.begin();
map_it != outstanding_events_map_.end();) {
std::deque<gpuEvent_t>& outstanding_events = map_it->second;
VLOG(10) << "Check " << outstanding_events.size()
<< " outstanding events for " << map_it->first->ptr();
auto deque_it = outstanding_events.begin();
while (deque_it != outstanding_events.end()) {
#ifdef PADDLE_WITH_CUDA
gpuError_t err = cudaEventQuery(*deque_it);
if (err == cudaErrorNotReady) {
VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr()
<< " is not completed";
outstanding_events.erase(outstanding_events.begin(), deque_it);
break;
}
PADDLE_ENFORCE_CUDA_SUCCESS(err);
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(*deque_it));
#else
gpuError_t err = hipEventQuery(*deque_it);
if (err == hipErrorNotReady) {
VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr()
<< " is not completed";
// Erase the completed events before "deque_it"
outstanding_events.erase(outstanding_events.begin(), deque_it);
break;
}
PADDLE_ENFORCE_CUDA_SUCCESS(err);
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(*deque_it));
#endif
++deque_it;
}
if (deque_it == outstanding_events.end()) {
outstanding_events.clear();
Allocation* allocation = map_it->first;
// "map_it" may be invalid after calling FreeStreamSafeCUDAAllocation
auto next_it = ++map_it;
FreeStreamSafeCUDAAllocation(allocation);
map_it = next_it;
} else {
++map_it;
}
}
}
uint64_t StreamSafeCUDAAllocator::ProcessEventsAndFreeWithRelease() {
ProcessEventsAndFree();
return underlying_allocator_->Release(place_);
}
std::map<platform::CUDAPlace, std::vector<StreamSafeCUDAAllocator*>>
StreamSafeCUDAAllocator::allocators_map_;
SpinLock StreamSafeCUDAAllocator::allocators_map_lock_;
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#else
#include <hip/hip_runtime.h>
#endif
#include <deque>
#include <map>
#include <memory>
#include <mutex>
#include <set>
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/allocation/spin_lock.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class StreamSafeCUDAAllocation : public Allocation {
public:
StreamSafeCUDAAllocation(AllocationPtr underlying_allocation,
gpuStream_t owning_stream);
void RecordStream(gpuStream_t stream);
std::shared_ptr<std::set<gpuStream_t>> GetRecordedStreams();
private:
AllocationPtr underlying_allocation_;
gpuStream_t owning_stream_;
std::shared_ptr<std::set<gpuStream_t>> recorded_streams_;
SpinLock spin_lock_;
};
class StreamSafeCUDAAllocator : public Allocator {
public:
StreamSafeCUDAAllocator(
const std::shared_ptr<Allocator> &underlying_allocator,
const platform::CUDAPlace &place, const gpuStream_t default_stream);
~StreamSafeCUDAAllocator();
bool IsAllocThreadSafe() const override;
protected:
Allocation *AllocateImpl(size_t size) override;
void FreeImpl(Allocation *allocation) override;
uint64_t ReleaseImpl(const platform::Place &place) override;
private:
void CreateEventForAllRecordedStream(
std::set<gpuStream_t> *recorded_streams,
std::deque<gpuEvent_t> *outstanding_events);
void FreeStreamSafeCUDAAllocation(Allocation *allocation);
void ProcessEventsAndFree();
uint64_t ProcessEventsAndFreeWithRelease();
static std::map<platform::CUDAPlace, std::vector<StreamSafeCUDAAllocator *>>
allocators_map_;
static SpinLock allocators_map_lock_;
std::shared_ptr<Allocator> underlying_allocator_;
platform::CUDAPlace place_;
gpuStream_t default_stream_;
std::map<Allocation *, std::deque<gpuEvent_t>> outstanding_events_map_;
SpinLock outstanding_events_map_lock_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
@@ -20,18 +20,40 @@ limitations under the License. */
namespace paddle {
namespace memory {
std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
size_t size) {
return allocation::AllocatorFacade::Instance().AllocShared(place, size);
}
AllocationPtr Alloc(const platform::Place& place, size_t size) {
return allocation::AllocatorFacade::Instance().Alloc(place, size);
}
uint64_t Release(const platform::Place& place) {
return allocation::AllocatorFacade::Instance().Release(place);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::shared_ptr<Allocation> AllocShared(const platform::CUDAPlace& place,
size_t size,
const gpuStream_t& stream) {
return allocation::AllocatorFacade::Instance().AllocShared(place, size,
stream);
}
AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
const gpuStream_t& stream) {
return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
}
uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) {
return allocation::AllocatorFacade::Instance().Release(place, stream);
}
void RecordStream(Allocation* allocation, const gpuStream_t& stream) {
return allocation::AllocatorFacade::Instance().RecordStream(allocation,
stream);
}
#endif
} // namespace memory
} // namespace paddle
@@ -40,5 +40,18 @@ extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size);
extern uint64_t Release(const platform::Place& place);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
extern std::shared_ptr<Allocation> AllocShared(const platform::CUDAPlace& place,
size_t size,
const gpuStream_t& stream);
extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
const gpuStream_t& stream);
extern uint64_t Release(const platform::CUDAPlace& place,
const gpuStream_t& stream);
void RecordStream(Allocation* allocation, const gpuStream_t& stream);
#endif
} // namespace memory
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace memory {
__global__ void add_kernel(int *x, int n) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
atomicAdd(x + i, tid);
}
}
class StreamSafeCUDAAllocTest : public ::testing::Test {
protected:
void SetUp() override {
place_ = platform::CUDAPlace();
stream_num_ = 64;
grid_num_ = 1;
block_num_ = 64;
data_num_ = 64;
default_stream = nullptr;
streams_.reserve(stream_num_);
streams_.emplace_back(default_stream);
for (size_t i = 1; i < stream_num_; ++i) {
gpuStream_t stream;
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream));
#endif
streams_.emplace_back(stream);
}
for (size_t i = 0; i < stream_num_; ++i) {
size_t allocation_size = data_num_ * sizeof(int);
std::shared_ptr<Allocation> allocation =
AllocShared(place_, allocation_size, streams_[i]);
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemset(allocation->ptr(), 0, allocation->size()));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
hipMemset(allocation->ptr(), 0, allocation->size()));
#endif
allocations_.emplace_back(allocation);
}
}
void SingleStreamRun(size_t idx) {
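// Launch add_kernel on streams_[idx] over every allocation; for allocations
// owned by other streams, RecordStream marks the cross-stream use so their
// memory is not reclaimed before this stream's work finishes.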
for (size_t i = 0; i < stream_num_; ++i) {
int *x = reinterpret_cast<int *>(allocations_[i]->ptr());
add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(x, data_num_);
if (i != idx) {
RecordStream(allocations_[i].get(), streams_[idx]);
}
}
}
void MultiStreamRun() {
for (int i = 0; i < stream_num_; ++i) {
SingleStreamRun(i);
}
allocations_.clear(); // fast_gc
}
void MultiThreadMultiStreamRun() {
std::vector<std::thread> threads;
for (size_t i = 0; i < stream_num_; ++i) {
threads.push_back(
std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i));
}
for (size_t i = 0; i < stream_num_; ++i) {
threads[i].join();
}
allocations_.clear(); // fast_gc
}
void CheckResult() {
auto host_x = std::unique_ptr<int[]>(new int[data_num_]);
size_t thread_num = grid_num_ * block_num_;
for (int i = 0; i < stream_num_; ++i) {
// tricky code, the allocations are still accessible even though
// allocations_.clear() has been called
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpy(host_x.get(), allocations_[i]->ptr(),
data_num_ * sizeof(int), cudaMemcpyDeviceToHost));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
hipMemcpy(host_x.get(), allocations_[i]->ptr(),
data_num_ * sizeof(int), hipMemcpyDeviceToHost));
#endif
for (int j = 0; j < data_num_; ++j) {
EXPECT_TRUE(host_x[j] == (j % thread_num) * stream_num_);
}
}
}
void TearDown() override {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize());
#else
PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize());
#endif
for (gpuStream_t stream : streams_) {
Release(place_, stream);
}
for (size_t i = 1; i < stream_num_; ++i) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams_[i]));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(streams_[i]));
#endif
}
uint64_t cuda_malloc_size =
platform::RecordedCudaMallocSize(place_.GetDeviceId());
ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size
<< " bytes of memory not released yet,"
<< " there may be a memory leak problem";
}
size_t stream_num_;
size_t grid_num_;
size_t block_num_;
size_t data_num_;
platform::CUDAPlace place_;
gpuStream_t default_stream;
std::vector<gpuStream_t> streams_;
std::vector<std::shared_ptr<Allocation>> allocations_;
};
TEST_F(StreamSafeCUDAAllocTest, CUDAMultiStreamTest) {
MultiStreamRun();
CheckResult();
}
TEST_F(StreamSafeCUDAAllocTest, CUDAMultiThreadMultiStreamTest) {
MultiThreadMultiStreamRun();
CheckResult();
}
TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
platform::CUDAPlace place = platform::CUDAPlace();
size_t alloc_size = 256;
std::shared_ptr<Allocation> allocation_implicit_stream =
AllocShared(place, alloc_size);
EXPECT_GE(allocation_implicit_stream->size(), alloc_size);
void *address = allocation_implicit_stream->ptr();
allocation_implicit_stream.reset();
gpuStream_t default_stream = nullptr;
allocation::AllocationPtr allocation_unique =
Alloc(place, alloc_size, default_stream);
EXPECT_GE(allocation_unique->size(), alloc_size);
EXPECT_EQ(allocation_unique->ptr(), address);
}
TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
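// Allocate about 3/4 of the available memory on stream1, then request the
// same amount on stream2 from another thread; the second request can only
// succeed after allocation1 is freed and its memory is released back to the
// device, which exercises the cross-stream release-and-retry path.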
platform::CUDAPlace place = platform::CUDAPlace();
gpuStream_t stream1, stream2;
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream1));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream2));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream1));
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream2));
#endif
size_t available_size = platform::GpuAvailableMemToAlloc();
// alloc_size < available_size < 2 * alloc_size
size_t alloc_size = available_size / 4 * 3;
std::shared_ptr<Allocation> allocation1 =
AllocShared(place, alloc_size, stream1);
std::shared_ptr<Allocation> allocation2;
std::thread th([&allocation2, &place, &stream2, alloc_size]() {
std::this_thread::sleep_for(std::chrono::seconds(1));
allocation2 = AllocShared(place, alloc_size, stream2);
});
allocation1.reset(); // free but not release
th.join();
EXPECT_GE(allocation2->size(), alloc_size);
allocation2.reset();
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize());
#else
PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize());
#endif
Release(place, stream1);
Release(place, stream2);
}
} // namespace memory
} // namespace paddle