diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f9fc35d8ce3ecc042739ed54a5595c590fb94de7..21a19cb5b274f9e746a8c9f10cb7b5f86f8ffa7e 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -402,10 +402,13 @@ struct NCCLAllReduceOpHandle : public OpHandle { platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, nccl_ctx.comm, nccl_ctx.stream()); - PADDLE_ENFORCE(cudaEventRecord(events_[dev_id], nccl_ctx.stream())); } - platform::dynload::ncclGroupEnd(); + + for (auto &ev : events_) { + PADDLE_ENFORCE(cudaEventRecord( + ev.second, member_->communication_streams_.at(ev.first).stream())); + } } }