diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 82a7bd218590d6009b2f6e5f6bee3085d2aefce5..b0cd1e8e908d6522dc819241e187e7a2e7d3e17f 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -110,23 +110,30 @@ ParallelExecutor::ParallelExecutor(
   // Bcast Parameters to all GPUs
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
-  std::unique_ptr<ncclUniqueId> nccl_id = nullptr;
+  ncclUniqueId *nccl_id = nullptr;
   bool need_group_call = true;
-  if (nccl_id_var != nullptr) {
-    nccl_id.reset(nccl_id_var->GetMutable<ncclUniqueId>());
-  } else if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) {
-    nccl_id.reset(new ncclUniqueId());
-    PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id.get()));
-    *member_->global_scope_->Var(NCCL_ID_VARNAME)
-        ->GetMutable<ncclUniqueId>() = *nccl_id.get();
+  if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) {
+    // Parallel graph mode initializes NCCL with ncclCommInitRank, since it
+    // runs NCCL operators per device per thread.
+    if (nccl_id_var == nullptr) {
+      nccl_id = new ncclUniqueId();
+      PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id));
+      *member_->global_scope_->Var(NCCL_ID_VARNAME)
+          ->GetMutable<ncclUniqueId>() = *nccl_id;
+    } else {
+      nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
+    }
     need_group_call = false;
+  } else if (nccl_id_var != nullptr) {  // other executor types
+    // Distributed training with NCCL initializes the NCCL id in the
+    // startup_program.
+    nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
   } else {
-    // init nccl_id in NCCLContextMap
+    // Initialize NCCL with ncclCommInitAll; no nccl_id is needed.
   }
 
   member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
-      member_->places_, nccl_id.get(), num_trainers, trainer_id,
-      need_group_call));
+      member_->places_, nccl_id, num_trainers, trainer_id, need_group_call));
 #else
   PADDLE_THROW("Not compiled with CUDA");
 #endif
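
For context on the two initialization paths the new comments distinguish, below is a minimal standalone sketch, not PaddlePaddle code and independent of `NCCLContextMap`: `ncclCommInitAll` builds communicators for all local devices without any `ncclUniqueId`, while `ncclCommInitRank` has each rank join a communicator identified by a shared id obtained from `ncclGetUniqueId`. The `CHECK_NCCL` macro and the single-process loop over devices are assumptions made purely for illustration.

```cpp
#include <cuda_runtime.h>
#include <nccl.h>

#include <cstdio>
#include <cstdlib>
#include <vector>

// Hypothetical error-checking helper for this sketch (not part of Paddle or NCCL).
#define CHECK_NCCL(cmd)                                                  \
  do {                                                                   \
    ncclResult_t r = (cmd);                                              \
    if (r != ncclSuccess) {                                              \
      std::fprintf(stderr, "NCCL error: %s\n", ncclGetErrorString(r));   \
      std::exit(1);                                                      \
    }                                                                    \
  } while (0)

int main() {
  int ndev = 0;
  cudaGetDeviceCount(&ndev);

  // Path 1: ncclCommInitAll. A single call builds one communicator per local
  // device; no ncclUniqueId is required (the final "else" branch in the diff).
  std::vector<ncclComm_t> comms(ndev);
  CHECK_NCCL(ncclCommInitAll(comms.data(), ndev, /*devlist=*/nullptr));
  for (ncclComm_t c : comms) ncclCommDestroy(c);

  // Path 2: ncclCommInitRank. Every rank joins a communicator identified by a
  // shared ncclUniqueId. Here one process drives all devices for brevity, so
  // the per-rank init calls must be wrapped in an NCCL group call.
  ncclUniqueId id;
  CHECK_NCCL(ncclGetUniqueId(&id));
  std::vector<ncclComm_t> rank_comms(ndev);
  CHECK_NCCL(ncclGroupStart());
  for (int rank = 0; rank < ndev; ++rank) {
    cudaSetDevice(rank);
    CHECK_NCCL(ncclCommInitRank(&rank_comms[rank], ndev, id, rank));
  }
  CHECK_NCCL(ncclGroupEnd());
  for (ncclComm_t c : rank_comms) ncclCommDestroy(c);
  return 0;
}
```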