diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 31516a884baefc2e50b7774a7993740f6930212f..f98011e896f4033ef210e0eb69f93ce7800a3cd6 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -73,18 +73,12 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else { - // NOTE(zcd): Because TensorCopy is an async operation, when the src_place - // and dst_place are two different GPU, to ensure that the operation can - // be carried out correctly, we should make ctx wait. - // If ctx_place and src_place are the same, we should add ctx.Wait() - // after memory::Copy; if ctx_place and dst_place are the same, we should - // add ctx.Wait() before memory::Copy. if (platform::is_same_place(ctx_place, src_place)) { memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); - ctx.Wait(); + platform::DeviceContextPool::Instance().Get(src.place())->Wait(); } else if (platform::is_same_place(ctx_place, dst_place)) { - ctx.Wait(); + platform::DeviceContextPool::Instance().Get(src.place())->Wait(); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else { @@ -97,13 +91,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst) { - // NOTE(zcd): If the src.place() and dst_place are two different GPU, - // the copy operation is carried out on the dst_place's stream. This is - // very important, because TensorCopy is an async operator, and in most - // case, once this copy operator returns, dst is to be used in dst_place's - // stream, if this copy operation is carried out on the src_place's stream, - // when dst is used in dst_place's stream the copy operation may be - // not completed. 
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; if (platform::is_gpu_place(dst_place)) { diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index dca279b69382b80e055f661cefe84b81326704b5..4457382ade37a12f5f3613fc4113fbf1f6f91124 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -23,10 +23,25 @@ limitations under the License. */ namespace paddle { namespace framework { +// NOTE(zcd): TensorCopy is an async operation. When src_place and +// dst_place are two different GPUs, TensorCopy waits on src_ctx so that +// the copy is carried out correctly. +// If ctx_place and src_place are the same, src_ctx.Wait() is called +// after memory::Copy; if ctx_place and dst_place are the same, +// src_ctx.Wait() is called before memory::Copy. void TensorCopy(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx, Tensor* dst); + +// NOTE(zcd): If src.place() and dst_place are two different GPUs, +// the copy operation is carried out on dst_place's stream. This is +// very important because TensorCopy is an async operation, and in most +// cases, once this copy returns, dst is used in dst_place's stream. +// If the copy were carried out on src_place's stream instead, it might +// not have completed by the time dst is used in dst_place's +// stream. void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst); + void TensorCopySync(const Tensor& src, const platform::Place& dst_place, Tensor* dst);