diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index d1652a3030bac95fe51d3a40bf49e69ff69e3353..24a9dcacf248326824f81f091704fb314a79111d 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -365,6 +365,7 @@ struct NCCLAllReduceOpHandle : public OpHandle { auto &p = static_cast(in)->place_; in->generated_op_->Wait(dev_ctx_[p]); } + VLOG(3) << "Before NCCL"; PADDLE_ENFORCE(cudaDeviceSynchronize()); auto &var_name = static_cast(this->inputs_[0])->name_; @@ -394,8 +395,9 @@ struct NCCLAllReduceOpHandle : public OpHandle { nccl_ctx.comm, nccl_ctx.stream()); } platform::dynload::ncclGroupEnd(); - PADDLE_ENFORCE(cudaDeviceSynchronize()); + + VLOG(3) << "After NCCL"; } } };