Merge pull request #8758 from panyx0718/nccl

[Speed] Avoid init_nccl for every step.

Merge pull request #8758 from panyx0718/nccl
[Speed] Avoid init_nccl for every step.
c7b7291b · Xin Pan · GitHub · 767acc6c · a4d68ed3 · c7b7291b
3 changed files
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
@@ -16,5 +16,50 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"

 namespace paddle {
-namespace platform {}  // namespace platform
+namespace platform {
+namespace {
+// TODO(panyx0718): Decide where to destroy these communicators.
+std::unique_ptr<std::vector<ncclComm_t>> global_comms;  // one communicator per GPU passed to InitAll
+std::unique_ptr<std::unordered_map<int, int>> comm_id_map;  // GPU device id -> index into global_comms
+bool inited = false;  // set to true after ncclCommInitAll has succeeded in InitAll
+size_t last_num_gpus = -1;  // GPU count of the last InitAll; -1 converts to the largest size_t value
+// TODO(panyx0718): Need to decide whether Paddle supports parallel runs with
+// a different number of GPUs. If so, the current solution is not enough.
+std::mutex comm_mu;  // locked by GetCommId, InitAll and comms() while they touch the state above
+}  // namespace
+
+// Returns the index into the communicator list for `device_id`.
+//
+// Requires a prior InitAll: the lookup table is only allocated there, and
+// dereferencing it before then would be a null-pointer access, so that case is
+// reported through PADDLE_ENFORCE instead. An id that was not in the list
+// passed to InitAll makes unordered_map::at throw std::out_of_range.
+int Communicator::GetCommId(int device_id) const {
+  std::lock_guard<std::mutex> guard(comm_mu);
+  PADDLE_ENFORCE(comm_id_map != nullptr,
+                 "Communicator::InitAll must be called before GetCommId.");
+  return comm_id_map->at(device_id);
+}
+
+// (Re)creates the process-wide NCCL communicators for `gpus`.
+//
+// The call is a no-op when communicators already exist for exactly this
+// device list (same ids in the same order). Otherwise any previously created
+// communicators are destroyed and a fresh set is built with ncclCommInitAll,
+// one per entry of `gpus`; comm_id_map records device id -> index into that
+// set. All state is accessed under comm_mu.
+//
+// If the PADDLE_ENFORCE around ncclCommInitAll fails (presumably by throwing),
+// `inited` stays false so that the next call retries the initialization.
+void Communicator::InitAll(const std::vector<int>& gpus) {
+  std::lock_guard<std::mutex> guard(comm_mu);
+  if (inited && last_num_gpus == gpus.size() && comm_id_map &&
+      comm_id_map->size() == gpus.size()) {
+    // Comparing only the GPU count would wrongly reuse communicators built
+    // for a different device set, so check that every id maps to the same
+    // slot it would get now.
+    bool same_devices = true;
+    for (size_t i = 0; i < gpus.size(); ++i) {
+      auto it = comm_id_map->find(gpus[i]);
+      if (it == comm_id_map->end() || static_cast<size_t>(it->second) != i) {
+        same_devices = false;
+        break;
+      }
+    }
+    if (same_devices) {
+      return;
+    }
+  }
+  // Only handles produced by a successful ncclCommInitAll may be destroyed.
+  if (inited && global_comms) {
+    for (size_t i = 0; i < global_comms->size(); ++i) {
+      // FIXME(dzh) : PADDLE_ENFORCE return void
+      dynload::ncclCommDestroy((*global_comms)[i]);
+    }
+  }
+  // Mark the state invalid until the new communicators are fully built, so a
+  // failed re-initialization is not mistaken for a completed one.
+  inited = false;
+  last_num_gpus = gpus.size();
+  global_comms.reset(new std::vector<ncclComm_t>());
+  comm_id_map.reset(new std::unordered_map<int, int>());
+  global_comms->resize(gpus.size());
+  for (size_t i = 0; i < gpus.size(); ++i) {
+    (*comm_id_map)[gpus[i]] = i;
+  }
+  PADDLE_ENFORCE(
+      dynload::ncclCommInitAll(global_comms->data(), gpus.size(), gpus.data()));
+  inited = true;
+}
+
+// Returns the process-wide communicator list built by the last InitAll.
+//
+// Requires a prior InitAll; calling it earlier would dereference a null
+// pointer, so that case is reported through PADDLE_ENFORCE instead.
+// NOTE(review): comm_mu is released when this function returns, while the
+// caller keeps using the reference. A concurrent InitAll that rebuilds the
+// list would invalidate it, so callers must not overlap with InitAll.
+const std::vector<ncclComm_t>& Communicator::comms() const {
+  std::lock_guard<std::mutex> guard(comm_mu);
+  PADDLE_ENFORCE(global_comms != nullptr,
+                 "Communicator::InitAll must be called before comms().");
+  return *global_comms;
+}
+
+}  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.h
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h
@@ -29,39 +29,16 @@ limitations under the License. */

 namespace paddle {
 namespace platform {
-
 constexpr int kInvalidGPUId = -1;

 struct Communicator {  // after this change: no data members; state lives in nccl_gpu_common.cc
-  std::vector<ncclComm_t> comms_;
-  std::unordered_map<int, int> comm_id_map_;
-  bool inited_;
-
  Communicator() {}

-  int GetCommId(int device_id) const { return comm_id_map_.at(device_id); }
-
-  void InitAll(const std::vector<int>& gpus) {
-    comms_.resize(gpus.size());
-    inited_ = false;
-    for (size_t i = 0; i < gpus.size(); ++i) {
-      comm_id_map_[gpus[i]] = i;
-    }
-    PADDLE_ENFORCE(
-        dynload::ncclCommInitAll(comms_.data(), gpus.size(), gpus.data()));
-    inited_ = true;
-  }
+  int GetCommId(int device_id) const;  // index of device_id's communicator within comms()

-  ~Communicator() {
-    if (inited_) {
-      for (size_t i = 0; i < comms_.size(); ++i) {
-        // FIXME(dzh) : PADDLE_ENFORCE return void
-        dynload::ncclCommDestroy(comms_[i]);
-      }
-    }
-  }
+  void InitAll(const std::vector<int>& gpus);  // builds one NCCL communicator per entry of gpus

-  DISABLE_COPY_AND_ASSIGN(Communicator);
+  const std::vector<ncclComm_t>& comms() const;  // accessor replacing direct use of the removed comms_ member
 };

 }  // namespace platform

--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
@@ -78,7 +78,7 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
          ins[i]->data<T>(), outs[i]->mutable_data<T>(ctx.GetPlace()),
          outs[i]->numel(), NCCLTypeWrapper<T>::type, reduction_op_,
-          comm->comms_[idx], stream));
+          comm->comms().at(idx), stream));
      PADDLE_ENFORCE(cudaStreamSynchronize(stream));

      VLOG(1) << "gpu : "
@@ -127,7 +127,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
    std::hash<std::string> hasher;
    for (size_t i = 0; i < ins.size(); ++i) {
      if (root == platform::kInvalidGPUId) {
-        root = hasher(ins_names[i]) % comm->comms_.size();
+        root = hasher(ins_names[i]) % comm->comms().size();
      }
      T* recvbuffer = nullptr;
      if (root == gpu_id) {
@@ -139,7 +139,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {

      PADDLE_ENFORCE(platform::dynload::ncclReduce(
          ins[i]->data<T>(), recvbuffer, ins[i]->numel(),
-          NCCLTypeWrapper<T>::type, reduction_op_, root, comm->comms_[idx],
+          NCCLTypeWrapper<T>::type, reduction_op_, root, comm->comms().at(idx),
          stream));
      PADDLE_ENFORCE(cudaStreamSynchronize(stream));

@@ -176,7 +176,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
        VLOG(1) << " before ncclBcast";
        PADDLE_ENFORCE(platform::dynload::ncclBcast(
            (void*)ins[i]->data<T>(), ins[i]->numel(), NCCLTypeWrapper<T>::type,
-            root, comm->comms_[idx], stream));
+            root, comm->comms().at(idx), stream));
        VLOG(1) << " after ncclBcast";
        PADDLE_ENFORCE(cudaStreamSynchronize(stream));

@@ -190,7 +190,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {

        PADDLE_ENFORCE(platform::dynload::ncclBcast(
            outs[i]->mutable_data<T>(ctx.GetPlace()), outs[i]->numel(),
-            NCCLTypeWrapper<T>::type, root, comm->comms_[idx], stream));
+            NCCLTypeWrapper<T>::type, root, comm->comms().at(idx), stream));
        PADDLE_ENFORCE(cudaStreamSynchronize(stream));

        VLOG(1) << "gpu : " << gpu_id << " finished Bcast. recv "