Fix random hang issue in PaddleDetection training tasks on Windows (#24977)

8603b5fb · liuwei1031 · GitHub · 640196c4 · 8603b5fb
显示空白变更内容
内联并排

Showing 1 changed file with 14 additions and 2 deletions

paddle/fluid/memory/memcpy.cc paddle/fluid/memory/memcpy.cc +14 -2

未找到文件。
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -32,6 +32,18 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
 #ifdef PADDLE_WITH_CUDA
 static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
+// Waits for the work queued on CUDA stream 0 to finish.
+//
+// On Windows the stream is polled with cudaStreamQuery() instead of blocking
+// in cudaStreamSynchronize(); this commit introduces the polling path to fix
+// a random hang of training tasks on Windows. Polling stops as soon as the
+// query returns anything other than cudaErrorNotReady, i.e. on success or on
+// any other error. The resulting status is not reported to the caller on
+// either platform.
+inline void SyncCUDAStream() {
+#if defined(_WIN32)
+  cudaError_t query_status;
+  do {
+    // Busy-poll: keep asking until the stream is no longer "not ready".
+    query_status = cudaStreamQuery(0);
+  } while (query_status == cudaErrorNotReady);
+#else
+  cudaStreamSynchronize(0);
+#endif
+}
 // NOTE(zcd): Avoid GpuMemcpySync whenever possible,
 // because GpuMemcpySync issues the copy command to the default stream,
 // which prevents commands from different streams from running concurrently.
@@ -55,7 +67,7 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
    platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
    // FIXME(zjl): do we really need it?
    if (num <= kMaxGpuAsyncCopyBytes) {
-      cudaStreamSynchronize(0);
+      SyncCUDAStream();
    }
  }
 }
@@ -77,7 +89,7 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
    platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
    // FIXME(zjl): do we really need it?
    if (num <= kMaxGpuAsyncCopyBytes) {
-      cudaStreamSynchronize(0);
+      SyncCUDAStream();
    }
  }
 }