diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index c7162f58de284ccf20df269b26f6810ffedaf7e7..587ad5f37e55e554226dc98feeec45678dd26bc3 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -411,10 +411,11 @@ void CUDAContext::InitEigenContext() { } CUDAContext::CUDAContext(const CUDAPlace& place, - const stream::Priority& priority) { + const stream::Priority& priority, + const stream::StreamFlag& flag) { place_ = place; CUDADeviceGuard guard(place_.device); - stream_.reset(new stream::CUDAStream(place, priority)); + stream_.reset(new stream::CUDAStream(place, priority, flag)); InitEigenContext(); InitCuBlasContext(); InitCuDNNContext(); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 87ce7c438b65be967c5f752da3a7bbf54bc80a7a..3c5c15a42199ab98b1d0a6dcd6d88bebb7d93bb6 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -272,7 +272,8 @@ class CUDAContext { CUDAContext() = default; explicit CUDAContext( const CUDAPlace& place, - const stream::Priority& priority = stream::Priority::kNormal); + const stream::Priority& priority = stream::Priority::kNormal, + const stream::StreamFlag& flag = stream::StreamFlag::kDefaultFlag); ~CUDAContext(); @@ -288,6 +289,12 @@ class CUDAContext { const std::unique_ptr<stream::CUDAStream>& Stream() const { return stream_; } + stream::CUDAStream* SetStream(stream::CUDAStream* new_stream_ptr) { + auto* old_stream_ptr = stream_.release(); + stream_.reset(new_stream_ptr); + return old_stream_ptr; + } + const gpuStream_t& RawStream() { return stream_->raw_stream(); } #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index 2a3d05ed4e34749a35b2914b3c34d07d04b6ecfd..212d99f6a78ed51b8ea6767ebf706259b7784938 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ 
b/paddle/fluid/platform/stream/cuda_stream.cc @@ -21,13 +21,8 @@ namespace paddle { namespace platform { namespace stream { -#ifdef PADDLE_WITH_HIP -constexpr unsigned int kDefaultFlag = hipStreamDefault; -#else -constexpr unsigned int kDefaultFlag = cudaStreamDefault; -#endif - -bool CUDAStream::Init(const Place& place, const Priority& priority) { +bool CUDAStream::Init(const Place& place, const Priority& priority, + const StreamFlag& flag) { PADDLE_ENFORCE_EQ(is_gpu_place(place), true, platform::errors::InvalidArgument( "Cuda stream must be created using cuda place.")); @@ -35,24 +30,25 @@ bool CUDAStream::Init(const Place& place, const Priority& priority) { CUDADeviceGuard guard(BOOST_GET_CONST(CUDAPlace, place_).device); if (priority == Priority::kHigh) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - hipStreamCreateWithPriority(&stream_, kDefaultFlag, -1)); + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreateWithPriority( + &stream_, static_cast<unsigned int>(flag), -1)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamCreateWithPriority(&stream_, kDefaultFlag, -1)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreateWithPriority( + &stream_, static_cast<unsigned int>(flag), -1)); #endif } else if (priority == Priority::kNormal) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - hipStreamCreateWithPriority(&stream_, kDefaultFlag, 0)); + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreateWithPriority( + &stream_, static_cast<unsigned int>(flag), 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamCreateWithPriority(&stream_, kDefaultFlag, 0)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreateWithPriority( + &stream_, static_cast<unsigned int>(flag), 0)); #endif } callback_manager_.reset(new StreamCallbackManager(stream_)); VLOG(3) << "GPUStream Init stream: " << stream_ - << ", priority: " << static_cast<int>(priority); + << ", priority: " << static_cast<int>(priority) + << ", flag:" << static_cast<int>(flag); return true; } @@ -118,6 +114,19 @@ CUDAStream* get_current_stream(int deviceId) { #endif } +CUDAStream* 
set_current_stream(CUDAStream* stream) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + auto& device = stream->GetPlace(); + auto& pool = platform::DeviceContextPool::Instance(); + return static_cast<platform::CUDADeviceContext*>(pool.Get(device)) + ->context() + ->SetStream(stream); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUDA. Cannot set cuda current stream.")); + return nullptr; +#endif +} } // namespace stream } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h index ad54d842d40ee5f4524c38c318c1c2b065c61472..472d6bbab0c6cb02fb8d22ba144efca925786197 100644 --- a/paddle/fluid/platform/stream/cuda_stream.h +++ b/paddle/fluid/platform/stream/cuda_stream.h @@ -33,18 +33,27 @@ enum class Priority : uint8_t { kHigh = 0x1, kNormal = 0x2, }; + +enum class StreamFlag : uint8_t { + kDefaultFlag = 0x0, + kStreamNonBlocking = 0x1, +}; + #endif class CUDAStream final { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + public: CUDAStream() = default; explicit CUDAStream(const Place& place, - const Priority& priority = Priority::kNormal) { - Init(place, priority); + const Priority& priority = Priority::kNormal, + const StreamFlag& flag = StreamFlag::kDefaultFlag) { + Init(place, priority, flag); } virtual ~CUDAStream() { Destroy(); } - bool Init(const Place& place, const Priority& priority = Priority::kNormal); + bool Init(const Place& place, const Priority& priority = Priority::kNormal, + const StreamFlag& flag = StreamFlag::kDefaultFlag); template <typename Callback> void AddCallback(Callback&& callback) const { @@ -125,6 +134,8 @@ class CUDAStream final { #endif } + const Place& GetPlace() const { return place_; } + private: Place place_; #ifdef PADDLE_WITH_HIP @@ -139,6 +150,7 @@ class CUDAStream final { }; CUDAStream* get_current_stream(int deviceId); +CUDAStream* set_current_stream(CUDAStream* stream); } // namespace stream } // namespace platform diff --git 
a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 50c6d0e983939f4c708cbd1f8072c47214ca8022..706012f4a44793d76fa2dd343a209345c395693b 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -40,6 +40,18 @@ void BindCudaStream(py::module *m_ptr) { }, py::return_value_policy::reference); + m.def("_set_current_stream", + [](paddle::platform::stream::CUDAStream &stream) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + return paddle::platform::stream::set_current_stream(&stream); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUDA. Cannot set cuda current " + "stream.")); +#endif + }, + py::return_value_policy::reference); + m.def("_device_synchronize", [](int device_id) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (device_id == -1) { @@ -69,7 +81,7 @@ void BindCudaStream(py::module *m_ptr) { If device is positive integer, it must less than the device count. Default: None. priority(int|None, optional): The priority of stream. The priority can be 1(high) or 2(normal). - If prioriyt is None, the priority is 2(normal). Default: None. + If priority is None, the priority is 2(normal). Default: None. Examples: .. 
code-block:: python @@ -200,6 +212,8 @@ void BindCudaStream(py::module *m_ptr) { "Priority should be 1(high) or 2(normal) ")); } auto prio = paddle::platform::stream::Priority(priority); + auto stream_flag = + paddle::platform::stream::StreamFlag::kStreamNonBlocking; if (device == nullptr) { int curr_device_id = platform::GetCurrentDeviceId(); @@ -207,7 +221,8 @@ void BindCudaStream(py::module *m_ptr) { device = &device_tmp; } - new (&self) paddle::platform::stream::CUDAStream(*device, prio); + new (&self) paddle::platform::stream::CUDAStream(*device, prio, + stream_flag); #else PADDLE_THROW(platform::errors::Unavailable( "Class CUDAStream can only be initialized on the GPU platform.")); @@ -224,6 +239,8 @@ void BindCudaStream(py::module *m_ptr) { "Priority should be 1(high) or 2(normal) ")); } auto prio = paddle::platform::stream::Priority(priority); + auto stream_flag = + paddle::platform::stream::StreamFlag::kStreamNonBlocking; int device_count = platform::GetCUDADeviceCount(); if (device < 0) { @@ -236,7 +253,7 @@ void BindCudaStream(py::module *m_ptr) { } new (&self) paddle::platform::stream::CUDAStream( - platform::CUDAPlace(device), prio); + platform::CUDAPlace(device), prio, stream_flag); #else PADDLE_THROW(platform::errors::Unavailable( "Class CUDAStream can only be initialized on the GPU platform.")); @@ -246,11 +263,13 @@ void BindCudaStream(py::module *m_ptr) { .def("__init__", [](paddle::platform::stream::CUDAStream &self) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto prio = paddle::platform::stream::Priority::kNormal; + auto stream_flag = + paddle::platform::stream::StreamFlag::kStreamNonBlocking; int device_id = platform::GetCurrentDeviceId(); new (&self) paddle::platform::stream::CUDAStream( - platform::CUDAPlace(device_id), prio); + platform::CUDAPlace(device_id), prio, stream_flag); #else PADDLE_THROW(platform::errors::Unavailable( "Class CUDAStream can only be initialized on the GPU platform.")); diff --git 
a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index be2e2488a304917e91dae47dc76c91e56ab2e010..e42b6031f6f08a6d3dbc73fe86fa7e5b17c1fbf5 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import paddle from paddle.fluid import core +from paddle.fluid.wrapped_decorator import signature_safe_contextmanager from .streams import Stream # noqa: F401 from .streams import Event # noqa: F401 @@ -24,6 +26,7 @@ __all__ = [ 'synchronize', 'device_count', 'empty_cache', + 'stream_guard', ] @@ -121,7 +124,7 @@ def device_count(): def empty_cache(): - """ + ''' Releases idle cached memory held by the allocator so that those can be used in other GPU application and visible in `nvidia-smi`. In most cases you don't need to use this function, Paddle does not release the memory back to the OS when you remove Tensors on the GPU, @@ -137,7 +140,67 @@ def empty_cache(): tensor = paddle.randn([512, 512, 512], "float") del tensor paddle.device.cuda.empty_cache() - """ + ''' if core.is_compiled_with_cuda(): core.cuda_empty_cache() + + +def _set_current_stream(stream): + ''' + Set the current stream. + + Parameters: + stream(paddle.device.cuda.Stream): The selected stream. + + Returns: + CUDAStream: The previous stream. + + ''' + + if not isinstance(stream, paddle.device.cuda.Stream): + raise TypeError("stream type should be paddle.device.cuda.Stream") + + cur_stream = current_stream() + if id(stream) == id(cur_stream): + return stream + return core._set_current_stream(stream) + + +@signature_safe_contextmanager +def stream_guard(stream): + ''' + **Notes**: + **This API only supports dygraph mode currently.** + + A context manager that specifies the current stream context by the given stream. + + Parameters: + stream(paddle.device.cuda.Stream): the selected stream. If stream is None, just yield. 
The default value is None. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + + s = paddle.device.cuda.Stream() + data1 = paddle.ones(shape=[20]) + data2 = paddle.ones(shape=[20]) + with paddle.device.cuda.stream_guard(s): + data3 = data1 + data2 + + ''' + + if stream is not None and not isinstance(stream, paddle.device.cuda.Stream): + raise TypeError("stream type should be paddle.device.cuda.Stream") + + cur_stream = current_stream() + if stream is None or id(stream) == id(cur_stream): + yield + else: + pre_stream = _set_current_stream(stream) + try: + yield + finally: + stream = _set_current_stream(pre_stream) diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index befcd0901dc25ed9d1ef6cb6211bbe8c24627f78..1e153cf0f9747a446e4d3c46f0f69c4f6396b693 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -276,6 +276,7 @@ if avx_supported(): from .core_avx import _set_cached_executor_build_strategy from .core_avx import _device_synchronize from .core_avx import _get_current_stream + from .core_avx import _set_current_stream if sys.platform != 'win32': from .core_avx import _set_process_pids from .core_avx import _erase_process_pids @@ -328,6 +329,7 @@ if load_noavx: from .core_noavx import _set_cached_executor_build_strategy from .core_noavx import _device_synchronize from .core_noavx import _get_current_stream + from .core_noavx import _set_current_stream if sys.platform != 'win32': from .core_noavx import _set_process_pids from .core_noavx import _erase_process_pids diff --git a/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py b/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py index 04e42c4e5f8f4c15ebfa51da351e76038b8c4816..ec024105f899d9179f5e74a70664872f802a6dcd 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py @@ -16,6 +16,7 @@ from paddle.device import cuda import paddle 
import unittest +import numpy as np class TestCurrentStream(unittest.TestCase): @@ -104,5 +105,56 @@ class TestCUDAEvent(unittest.TestCase): self.assertTrue(event_query_2) +class TestStreamGuard(unittest.TestCase): + ''' + Note: + The asynchronous execution property of CUDA Stream can only be tested offline. + ''' + + def test_stream_guard_normal(self): + if paddle.is_compiled_with_cuda(): + s = paddle.device.cuda.Stream() + a = paddle.to_tensor(np.array([0, 2, 4], dtype="int32")) + b = paddle.to_tensor(np.array([1, 3, 5], dtype="int32")) + c = a + b + with paddle.device.cuda.stream_guard(s): + d = a + b + + self.assertTrue(np.array_equal(np.array(c), np.array(d))) + + def test_stream_guard_default_stream(self): + if paddle.is_compiled_with_cuda(): + s1 = paddle.device.cuda.current_stream() + with paddle.device.cuda.stream_guard(s1): + pass + s2 = paddle.device.cuda.current_stream() + + self.assertTrue(id(s1) == id(s2)) + + def test_set_current_stream_default_stream(self): + if paddle.is_compiled_with_cuda(): + cur_stream = paddle.device.cuda.current_stream() + new_stream = paddle.device.cuda._set_current_stream(cur_stream) + + self.assertTrue(id(cur_stream) == id(new_stream)) + + def test_stream_guard_raise_error(self): + if paddle.is_compiled_with_cuda(): + + def test_not_correct_stream_guard_input(): + tmp = np.zeros(5) + with paddle.device.cuda.stream_guard(tmp): + pass + + self.assertRaises(TypeError, test_not_correct_stream_guard_input) + + def test_set_current_stream_raise_error(self): + if paddle.is_compiled_with_cuda(): + self.assertRaises(TypeError, paddle.device.cuda._set_current_stream, + np.zeros(5)) + self.assertRaises(TypeError, paddle.device.cuda._set_current_stream, + None) + + if __name__ == "__main__": unittest.main()