diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b78dc3b8ae2b33fa872bf971153d844428e797c1..3a92494e7e918e49355cc60583ada1bf2b24be29 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -132,12 +132,12 @@ struct ScaleLossGradOpHandle : public OpHandle { scope_(scope), place_(place) { cudaSetDevice(boost::get(place_).device); + // Must set the device before creating the event PADDLE_ENFORCE(cudaEventCreateWithFlags(&ev_, cudaEventDisableTiming)); - VLOG(3) << "Create " << ev_; } ~ScaleLossGradOpHandle() { - VLOG(3) << "Destroy " << ev_; + cudaSetDevice(boost::get(place_).device); PADDLE_ENFORCE(cudaEventDestroy(ev_)); } @@ -339,13 +339,15 @@ struct NCCLAllReduceOpHandle : public OpHandle { explicit NCCLAllReduceOpHandle(ParallelExecutorPrivate *member) : member_(member) { for (auto &nccl : member_->communication_streams_) { - PADDLE_ENFORCE(cudaEventCreate(&events_[nccl.second.device_id()], - cudaEventDisableTiming)); + int dev_id = nccl.second.device_id(); + cudaSetDevice(dev_id); + PADDLE_ENFORCE(cudaEventCreate(&events_[dev_id], cudaEventDisableTiming)); } } ~NCCLAllReduceOpHandle() { for (auto &ev : events_) { + cudaSetDevice(ev.first); PADDLE_ENFORCE(cudaEventDestroy(ev.second)); } }