diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index 4871de3682437faa937670b666b2e3757d074dac..b19f02db1c0ddf17c84536bf5d512bbd823909b2 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -32,6 +32,18 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
 #ifdef PADDLE_WITH_CUDA
 static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
 
+// Waits on CUDA stream 0; the status returned by the CUDA call is discarded.
+inline void SyncCUDAStream() {
+#if defined(_WIN32)
+  cudaError_t status;
+  do {
+    status = cudaStreamQuery(0);
+  } while (status == cudaErrorNotReady);  // stop on success or any other error
+#else
+  cudaStreamSynchronize(0);
+#endif
+}
+
 // NOTE(zcd): Do not use GpuMemcpySync as much as possible.
 // because GpuMemcpySync issues the copying command to the default stream,
 // which will make two commands from different streams cannot run concurrently.
@@ -55,7 +67,7 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
     // FIXME(zjl): do we really need it?
     if (num <= kMaxGpuAsyncCopyBytes) {
-      cudaStreamSynchronize(0);
+      SyncCUDAStream();
     }
   }
 }
@@ -77,7 +89,7 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
     // FIXME(zjl): do we really need it?
     if (num <= kMaxGpuAsyncCopyBytes) {
-      cudaStreamSynchronize(0);
+      SyncCUDAStream();
     }
   }
 }