diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 751b10eeeed10828c08ada4173300c07f81c093e..b53a6f43fbd1f23e69d23ad0fcc54d5c25d352a3 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -253,9 +253,6 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( t->set_lod(lod_tensors[j].lod()); } } - for (auto &p : member_->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } } ParallelExecutor::~ParallelExecutor() { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index e5bc74755f46449296a153e8b330968e6d9f1e1d..31516a884baefc2e50b7774a7993740f6930212f 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -69,19 +69,47 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); + if (platform::is_same_place(src_place, dst_place)) { + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + } else { + // NOTE(zcd): Because TensorCopy is an async operation, when the src_place + // and dst_place are two different GPU, to ensure that the operation can + // be carried out correctly, we should make ctx wait. + // If ctx_place and src_place are the same, we should add ctx.Wait() + // after memory::Copy; if ctx_place and dst_place are the same, we should + // add ctx.Wait() before memory::Copy. + if (platform::is_same_place(ctx_place, src_place)) { + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + ctx.Wait(); + } else if (platform::is_same_place(ctx_place, dst_place)) { + ctx.Wait(); + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + } else { + PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place."); + } + } } #endif } void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst) { + // NOTE(zcd): If the src.place() and dst_place are two different GPU, + // the copy operation is carried out on the dst_place's stream. This is + // very important, because TensorCopy is an async operator, and in most + // case, once this copy operator returns, dst is to be used in dst_place's + // stream, if this copy operation is carried out on the src_place's stream, + // when dst is used in dst_place's stream the copy operation may be + // not completed. platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; - if (platform::is_gpu_place(src.place())) { - dev_ctx = pool.Get(src.place()); - } else { + if (platform::is_gpu_place(dst_place)) { dev_ctx = pool.Get(dst_place); + } else { + dev_ctx = pool.Get(src.place()); } TensorCopy(src, dst_place, *dev_ctx, dst); }