diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
index a3ea0a4f895bb736959c854d0f97d26586dc3c1e..08b61765c2f0fb90056c97618c0ce345155a274c 100644
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
@@ -23,13 +23,18 @@ std::unique_ptr<std::vector<ncclComm_t>> global_comms;
 std::unique_ptr<std::unordered_map<int, int>> comm_id_map;
 bool inited = false;
 size_t last_num_gpus = -1;
+// TODO(panyx0718): Need to decide whether Paddle supports parallel
+// runs with different number GPUs. If true, current solution is not enough.
+std::mutex comm_mu;
 }
 
 int Communicator::GetCommId(int device_id) const {
+  std::lock_guard<std::mutex> guard(comm_mu);
   return comm_id_map->at(device_id);
 }
 
 void Communicator::InitAll(const std::vector<int>& gpus) {
+  std::lock_guard<std::mutex> guard(comm_mu);
   if (inited && last_num_gpus == gpus.size()) {
     return;
   }
@@ -52,6 +57,7 @@ void Communicator::InitAll(const std::vector<int>& gpus) {
 }
 
 const std::vector<ncclComm_t>& Communicator::comms() const {
+  std::lock_guard<std::mutex> guard(comm_mu);
   return *global_comms;
 }
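The pattern this diff applies is to guard every reader and writer of the file-local communicator state (`global_comms`, `comm_id_map`, `inited`, `last_num_gpus`) with a single `std::mutex`, making repeated `InitAll` calls with the same GPU count a cheap, race-free no-op. Below is a minimal, self-contained sketch of that pattern, not Paddle's actual implementation: the free functions, the use of `int` in place of `ncclComm_t`, and the omission of all NCCL calls are simplifications for illustration.

```cpp
// Sketch of the mutex-guarded lazy-init pattern from the diff above.
// Hypothetical standalone analogue; the real code is in paddle::platform
// and creates NCCL communicators instead of placeholder ints.
#include <iostream>
#include <mutex>
#include <thread>
#include <unordered_map>
#include <vector>

namespace {
std::vector<int> global_comms;             // stands in for std::vector<ncclComm_t>
std::unordered_map<int, int> comm_id_map;  // device id -> index into global_comms
bool inited = false;
size_t last_num_gpus = static_cast<size_t>(-1);
std::mutex comm_mu;  // guards all of the state above
}  // namespace

// Idempotent, thread-safe initialization: every access to the globals takes
// comm_mu, so a concurrent GetCommId can never observe a half-built map.
void InitAll(const std::vector<int>& gpus) {
  std::lock_guard<std::mutex> guard(comm_mu);
  if (inited && last_num_gpus == gpus.size()) {
    return;  // already initialized for this GPU count; nothing to do
  }
  global_comms.assign(gpus.size(), 0);
  comm_id_map.clear();
  for (size_t i = 0; i < gpus.size(); ++i) {
    comm_id_map[gpus[i]] = static_cast<int>(i);
  }
  inited = true;
  last_num_gpus = gpus.size();
}

int GetCommId(int device_id) {
  std::lock_guard<std::mutex> guard(comm_mu);
  return comm_id_map.at(device_id);
}

int main() {
  // Two threads race to initialize with the same GPU list; the mutex turns
  // the losing call into a no-op instead of a data race.
  std::vector<int> gpus = {0, 1};
  std::thread t1([&] { InitAll(gpus); });
  std::thread t2([&] { InitAll(gpus); });
  t1.join();
  t2.join();
  std::cout << "comm id for device 1: " << GetCommId(1) << "\n";
  return 0;
}
```

One caveat, consistent with the TODO added in the diff: `comms()` returns a reference into the guarded state, so the lock only protects the dereference itself, not the caller's later use of the vector. If a concurrent `InitAll` with a different GPU count were allowed to reset `global_comms`, that returned reference could dangle, which is presumably why the comment flags the current solution as insufficient for that case.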