Merge pull request #9928 from reyoung/feature/stablize_code

Use mutex to stabilize ncclCtxMap

Merge pull request #9928 from reyoung/feature/stablize_code
Use mutex to stabilize ncclCtxMap
c3c7b7bd · Yu Yang · GitHub · 35483a20 · 093d227a · c3c7b7bd
隐藏空白更改
内联并排

Showing 1 changed file with 16 additions and 34 deletions

paddle/fluid/platform/nccl_helper.h paddle/fluid/platform/nccl_helper.h +16 -34

未找到文件。
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -39,20 +39,19 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
 class NCCLGroupGuard {
 public:
+  static std::mutex &NCCLMutex() {
+    static std::mutex mtx;
+    return mtx;
+  }
  inline NCCLGroupGuard() {
-    mutex().lock();
+    NCCLMutex().lock();
    PADDLE_ENFORCE(dynload::ncclGroupStart());
  }
  inline ~NCCLGroupGuard() {
    PADDLE_ENFORCE(dynload::ncclGroupEnd());
-    mutex().unlock();
+    NCCLMutex().unlock();
-  }
- private:
-  static std::mutex &mutex() {
-    static std::mutex mtx;
-    return mtx;
  }
 };
@@ -68,26 +67,6 @@ struct NCCLContext {
  int device_id() const {
    return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
  }
-  static void InitNCCLContext(std::unordered_map<int, NCCLContext> *contexts,
-                              const std::vector<platform::Place> &places) {
-    std::vector<ncclComm_t> comms;
-    std::vector<int> devs;
-    comms.resize(contexts->size());
-    devs.reserve(contexts->size());
-    for (auto &p : places) {
-      devs.push_back(boost::get<platform::CUDAPlace>(p).device);
-    }
-    PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
-        &comms[0], static_cast<int>(contexts->size()), &devs[0]));
-    int i = 0;
-    for (auto &dev_id : devs) {
-      contexts->at(dev_id).comm_ = comms[i++];
-    }
-  }
 };
 struct NCCLContextMap {
@@ -107,12 +86,12 @@ struct NCCLContextMap {
        "NCCL Context Map does not support contain two or more same device");
    if (places.size() > 1) {
-      std::vector<ncclComm_t> comms;
+      std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
-      comms.resize(order_.size());
+      {
+        std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
-      PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
+        PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
-          &comms[0], static_cast<int>(order_.size()), &order_[0]));
+            comms.get(), static_cast<int>(order_.size()), order_.data()));
+      }
      int i = 0;
      for (auto &dev_id : order_) {
        contexts_.at(dev_id).comm_ = comms[i++];
@@ -120,6 +99,9 @@ struct NCCLContextMap {
    }
  }
+  NCCLContextMap(const NCCLContextMap &other) = delete;
+  NCCLContextMap &operator=(const NCCLContextMap &other) = delete;
  CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }
  CUDADeviceContext *DevCtx(platform::Place p) const {