diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index 4871de3682437faa937670b666b2e3757d074dac..b19f02db1c0ddf17c84536bf5d512bbd823909b2 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -32,6 +32,22 @@ void Copy(platform::CPUPlace, void* dst,
 #ifdef PADDLE_WITH_CUDA
 static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
 
+// Waits on the host for the default CUDA stream (stream 0).
+// Non-Windows builds call cudaStreamSynchronize; Windows builds poll
+// cudaStreamQuery until it returns anything other than cudaErrorNotReady.
+// The status codes returned by these calls are not reported to the caller.
+// Takes no arguments and returns nothing.
+inline void SyncCUDAStream() {
+#if defined(_WIN32)
+  // Spin while the stream still reports outstanding work; cudaSuccess or any
+  // other error code ends the wait.
+  while (cudaStreamQuery(0) == cudaErrorNotReady) {
+  }
+#else
+  cudaStreamSynchronize(0);
+#endif
+}
+
 // NOTE(zcd): Do not use GpuMemcpySync as much as possible.
 // because GpuMemcpySync issues the copying command to the default stream,
 // which will make two commands from different streams cannot run concurrently.
@@ -55,7 +71,7 @@ void Copy(
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
     // FIXME(zjl): do we really need it?
     if (num <= kMaxGpuAsyncCopyBytes) {
-      cudaStreamSynchronize(0);
+      SyncCUDAStream();
     }
   }
 }
@@ -77,7 +93,7 @@ void Copy(
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
     // FIXME(zjl): do we really need it?
     if (num <= kMaxGpuAsyncCopyBytes) {
-      cudaStreamSynchronize(0);
+      SyncCUDAStream();
     }
   }
 }