diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index cbbc020989d1e97c779ffca0b2048b973e554289..e59733b4fbe9a0aeeacaccf23f02809db391dbbd 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -84,14 +84,9 @@ void* Tensor::mutable_data(const platform::Place& place,
   return mutable_data(place, type_, requested_size);
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-void* Tensor::mutable_data(const platform::CUDAPlace& place,
+void* Tensor::mutable_data(const platform::Place& place,
                            proto::VarType::Type type,
-                           const gpuStream_t& stream) {
-  if (!FLAGS_use_stream_safe_cuda_allocator) {
-    return mutable_data(place, type);
-  }
-
+                           const platform::Stream& stream) {
   type_ = type;
   PADDLE_ENFORCE_GE(
       numel(), 0,
@@ -111,7 +106,6 @@ void* Tensor::mutable_data(const platform::CUDAPlace& place,
   return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                  offset_);
 }
-#endif
 
 Tensor& Tensor::ShareDataWith(const Tensor& src) {
   src.check_memory_size();
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 4b1ae041fc4cada165e792a37250a0c1a3de27b0..a146f5717441205ae1f1cc0ec43530675b274407 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 namespace memory {
@@ -150,10 +151,8 @@ class Tensor {
   void* mutable_data(const platform::Place& place,
                      size_t requested_size = 0);
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  void* mutable_data(const platform::CUDAPlace& place,
-                     proto::VarType::Type type, const gpuStream_t& stream);
-#endif
+  void* mutable_data(const platform::Place& place, proto::VarType::Type type,
+                     const platform::Stream& stream);
 
   /**
    * @brief Return a pointer to mutable memory block.
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index a53c6a8dbeb12a1de64c3207632d0bc5d0158a00..3a53954436c89e2810ff77231e5ead89877336b2 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -879,9 +879,9 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
       ->Release(place);
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
-    const platform::CUDAPlace& place, size_t size, const gpuStream_t& stream) {
+    const platform::Place& place, size_t size, const platform::Stream& stream) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
@@ -896,12 +896,16 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
         "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
   }
 #endif
-
-  return std::shared_ptr<Allocation>(Alloc(place, size, stream));
+  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
+  return std::shared_ptr<Allocation>(Alloc(place, size, s));
+#else
+  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
+#endif
 }
 
-AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place,
-                                     size_t size, const gpuStream_t& stream) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
+                                     const gpuStream_t& stream) {
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
@@ -917,11 +921,12 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place,
   }
 #endif
 
+  platform::CUDAPlace p = BOOST_GET_CONST(platform::CUDAPlace, place);
   if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
-    return m_->GetAllocator(place, stream, /* create_if_not_found = */ true)
+    return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
         ->Allocate(size);
   } else {
-    return m_->GetAllocator(place, size)->Allocate(size);
+    return m_->GetAllocator(p, size)->Allocate(size);
   }
 }
 
diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h
index 4c4f805a0c61968d197720c46411f1021d323041..b10ea9948d4497af38f9cd172180771553c898b9 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
@@ -22,6 +22,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #endif
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 namespace memory {
@@ -57,21 +58,23 @@ class AllocatorFacade {
   // Release unused memory pool.
   uint64_t Release(const platform::Place& place);
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  std::shared_ptr<Allocation> AllocShared(const platform::CUDAPlace& place,
+  std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                           size_t size,
-                                          const gpuStream_t& stream);
-  AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
+                                          const platform::Stream& stream);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  // TODO(zhiqiu): change gpuStream_t to platform::Stream if needed.
+  AllocationPtr Alloc(const platform::Place& place, size_t size,
                       const gpuStream_t& stream);
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
   void RecordStream(std::shared_ptr<Allocation> allocation,
                     const gpuStream_t& stream);
   const gpuStream_t& GetStream(
       const std::shared_ptr<Allocation>& allocation) const;
+#endif
+
 #ifdef PADDLE_WITH_CUDA
   void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id);
   void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id);
-#endif
 #endif
 
   // TODO(yy): Allocate a Copy-On-Write allocation?
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
index 5ec96c39bb6045255d7aff294fd02b3ecc2faac7..6a8bb59260b34f8dd035cb5d5e60dcecf1579352 100644
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 namespace memory {
@@ -33,14 +34,14 @@ uint64_t Release(const platform::Place& place) {
   return allocation::AllocatorFacade::Instance().Release(place);
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-std::shared_ptr<Allocation> AllocShared(const platform::CUDAPlace& place,
+std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                         size_t size,
-                                        const gpuStream_t& stream) {
+                                        const platform::Stream& stream) {
   return allocation::AllocatorFacade::Instance().AllocShared(place, size,
                                                              stream);
 }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
                     const gpuStream_t& stream) {
   return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h
index 7ca15c5dfc1279f21221aa35c8cb445d44e2ca7f..b2ad3a98106a36acc5e920d8394f15752d3e839f 100644
--- a/paddle/fluid/memory/malloc.h
+++ b/paddle/fluid/memory/malloc.h
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 
@@ -40,11 +41,11 @@ extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size);
 
 extern uint64_t Release(const platform::Place& place);
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-extern std::shared_ptr<Allocation> AllocShared(const platform::CUDAPlace& place,
+extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                                size_t size,
-                                               const gpuStream_t& stream);
+                                               const platform::Stream& stream);
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
                            const gpuStream_t& stream);
 
diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
index 52c3825053ca226de6a6058197dffaa9c23944f8..083b8a14d29f2b1f9e927bc8ceeb4135cb9abf77 100644
--- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
+++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
@@ -30,6 +30,7 @@
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 namespace memory {
@@ -69,8 +70,9 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
     PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream));
 #endif
 
-    std::shared_ptr<Allocation> allocation =
-        AllocShared(place_, workspace_size_, stream);
+    std::shared_ptr<Allocation> allocation = AllocShared(
+        place_, workspace_size_,
+        platform::Stream(reinterpret_cast<platform::StreamId>(stream)));
 #ifdef PADDLE_WITH_CUDA
     PADDLE_ENFORCE_GPU_SUCCESS(
         cudaMemset(allocation->ptr(), 0, allocation->size()));
@@ -283,8 +285,9 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
   PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&new_stream));
 #endif
 
-  std::shared_ptr<Allocation> allocation_new_stream =
-      AllocShared(place, alloc_size, new_stream);
+  std::shared_ptr<Allocation> allocation_new_stream = AllocShared(
+      place, alloc_size,
+      platform::Stream(reinterpret_cast<platform::StreamId>(new_stream)));
   EXPECT_EQ(GetStream(allocation_new_stream), new_stream);
 
 #ifdef PADDLE_WITH_CUDA
@@ -311,7 +314,9 @@ TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) {
   EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet);
   EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place),
                paddle::platform::EnforceNotMet);
-  EXPECT_THROW(AllocShared(place, alloc_size, nullptr),
+  EXPECT_THROW(AllocShared(place, alloc_size,
+                           platform::Stream(
+                               reinterpret_cast<platform::StreamId>(nullptr))),
                paddle::platform::EnforceNotMet);
   EXPECT_THROW(Alloc(place, alloc_size, nullptr),
                paddle::platform::EnforceNotMet);
@@ -342,13 +347,16 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   // so the second alloc will fail and retry
   size_t alloc_size = available_size / 4 * 3;
 
-  std::shared_ptr<Allocation> allocation1 =
-      AllocShared(place, alloc_size, stream1);
+  std::shared_ptr<Allocation> allocation1 = AllocShared(
+      place, alloc_size,
+      platform::Stream(reinterpret_cast<platform::StreamId>(stream1)));
   std::shared_ptr<Allocation> allocation2;
 
   std::thread th([&allocation2, &place, &stream2, alloc_size]() {
     std::this_thread::sleep_for(std::chrono::seconds(1));
-    allocation2 = AllocShared(place, alloc_size, stream2);
+    allocation2 = AllocShared(
+        place, alloc_size,
+        platform::Stream(reinterpret_cast<platform::StreamId>(stream2)));
   });
   allocation1.reset();  // free but not release
   th.join();
diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h
index 43ac5984bc8c844f146282a8975e468764ed5129..a19dc3367a14b4dae87c912e6790cce2a891af7a 100644
--- a/paddle/fluid/operators/memcpy_h2d_op.h
+++ b/paddle/fluid/operators/memcpy_h2d_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 namespace platform {
@@ -42,11 +43,15 @@ class MemcpyH2DFunctor {
   void operator()(const framework::LoDTensor &lod_tensor) const {
     auto &out_tensor = *out_->GetMutable<framework::LoDTensor>();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    out_tensor.mutable_data(
-        BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_.GetPlace()),
-        lod_tensor.type(),
-        static_cast<const platform::CUDADeviceContext *>(&dev_ctx_)->stream());
+    auto stream =
+        static_cast<const platform::CUDADeviceContext *>(&dev_ctx_)->stream();
+#else
+    auto stream = nullptr;
 #endif
+    out_tensor.mutable_data(
+        dev_ctx_.GetPlace(), lod_tensor.type(),
+        platform::Stream(reinterpret_cast<platform::StreamId>(stream)));
+
     if (dst_place_type_ == 0 || dst_place_type_ == 1) {
       framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
                             &out_tensor);
diff --git a/paddle/fluid/platform/stream/stream.h b/paddle/fluid/platform/stream/stream.h
new file mode 100644
index 0000000000000000000000000000000000000000..79ca51220bd6c196ce259656c9bfc79a2b4e8383
--- /dev/null
+++ b/paddle/fluid/platform/stream/stream.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+namespace paddle {
+namespace platform {
+
+using StreamId = uint64_t;
+class Stream final {
+ public:
+  explicit Stream(StreamId id) : id_(id) {}
+  StreamId id() const { return id_; }
+
+ private:
+  StreamId id_;
+};
+
+}  // namespace platform
+}  // namespace paddle
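
Usage note (not part of the diff): a minimal sketch of how a caller might exercise the unified interface after this change, assuming a CUDA build with FLAGS_use_stream_safe_cuda_allocator enabled; the function name, device id, stream setup, and size below are illustrative only.

// Illustrative only: wrap a raw gpuStream_t handle in platform::Stream and
// allocate through the place-agnostic memory::AllocShared, mirroring the
// call sites updated above.
#include <cuda_runtime.h>

#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/stream.h"

void AllocOnStreamExample() {
  // CUDAPlace converts implicitly to the generic platform::Place parameter.
  paddle::platform::CUDAPlace place(/*device_id=*/0);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // The stream handle is reinterpreted as a StreamId, exactly as the updated
  // tests and memcpy_h2d_op.h do.
  auto allocation = paddle::memory::AllocShared(
      place, /*size=*/1024,
      paddle::platform::Stream(
          reinterpret_cast<paddle::platform::StreamId>(stream)));

  // ... enqueue kernels on `stream` that read/write allocation->ptr() ...

  allocation.reset();  // returned to the stream-safe allocator in stream order
  cudaStreamDestroy(stream);
}

Tensor::mutable_data(place, type, stream) now takes the same platform::Stream argument, which is why operator code such as MemcpyH2DFunctor no longer needs a CUDA-only overload.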