nccl_helper.h 5.1 KB
Newer Older
Y
Yu Yang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

P
peizhilin 已提交
15
#ifndef _WIN32
Y
Yu Yang 已提交
16 17
#pragma once

T
typhoonzero 已提交
18
#include <stdio.h>
Q
qingqing01 已提交
19
#include <memory>
20
#include <string>
21
#include <thread>  // NOLINT
Y
Yu Yang 已提交
22
#include <typeindex>
Q
qingqing01 已提交
23
#include <unordered_map>
24
#include <vector>
Y
Yu Yang 已提交
25
#include "paddle/fluid/framework/data_type.h"
Y
Yu Yang 已提交
26 27
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/enforce.h"
W
Wu Yi 已提交
28
#include "paddle/fluid/platform/float16.h"
Y
Yu Yang 已提交
29

T
typhoonzero 已提交
30 31
#define NCCL_ID_VARNAME "NCCLID"

Y
Yu Yang 已提交
32 33 34
namespace paddle {
namespace platform {

Y
Yu Yang 已提交
35 36
// Translates a framework VarType into the matching ncclDataType_t.
// FP32, FP64, INT32, INT64 and FP16 have a mapping; any other value is
// rejected through PADDLE_THROW with the message "Not supported".
inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) {
  switch (type) {
    case framework::proto::VarType::FP32:
      return ncclFloat;
    case framework::proto::VarType::FP64:
      return ncclDouble;
    case framework::proto::VarType::INT32:
      return ncclInt;
    case framework::proto::VarType::INT64:
      return ncclInt64;
    case framework::proto::VarType::FP16:
      return ncclFloat16;
    default:
      PADDLE_THROW("Not supported");
  }
}

51 52 53 54 55
// NOTE(minqiyang): according to the ncclGroupEnd documentation:
// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
// ncclGroupEnd will wait for all communicators to be initialized, which
// causes a blocking problem when a runtime_error is thrown, so only guard
// the NCCL calls themselves when using it.
Y
Yu Yang 已提交
56 57
class NCCLGroupGuard {
 public:
Y
Yu Yang 已提交
58 59 60 61 62
  static std::mutex &NCCLMutex() {
    static std::mutex mtx;
    return mtx;
  }

Y
Yu Yang 已提交
63
  inline NCCLGroupGuard() {
Y
Yu Yang 已提交
64
    NCCLMutex().lock();
Y
Yu Yang 已提交
65 66
    PADDLE_ENFORCE(dynload::ncclGroupStart());
  }
Y
Yu Yang 已提交
67 68

  inline ~NCCLGroupGuard() {
S
sneaxiy 已提交
69
    PADDLE_ENFORCE(dynload::ncclGroupEnd());
Y
Yu Yang 已提交
70
    NCCLMutex().unlock();
Y
Yu Yang 已提交
71 72 73
  }
};

Y
Yu Yang 已提交
74 75 76 77 78
struct NCCLContext {
  std::unique_ptr<CUDADeviceContext> ctx_;
  ncclComm_t comm_;

  explicit NCCLContext(int dev_id)
Y
Yu Yang 已提交
79
      : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
Y
Yu Yang 已提交
80 81 82

  cudaStream_t stream() const { return ctx_->stream(); }

Q
qingqing01 已提交
83 84
  ncclComm_t comm() const { return comm_; }

Y
Yu Yang 已提交
85 86 87 88 89
  int device_id() const {
    return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
  }
};

Y
Yu Yang 已提交
90 91 92 93
struct NCCLContextMap {
  std::unordered_map<int, NCCLContext> contexts_;
  std::vector<int> order_;

T
typhoonzero 已提交
94 95
  explicit NCCLContextMap(const std::vector<platform::Place> &places,
                          ncclUniqueId *nccl_id = nullptr,
Y
Yancey1989 已提交
96
                          size_t num_trainers = 1, size_t trainer_id = 0) {
Y
Yu Yang 已提交
97
    PADDLE_ENFORCE(!places.empty());
Y
Yu Yang 已提交
98 99 100 101 102 103 104 105 106 107
    order_.reserve(places.size());
    for (auto &p : places) {
      int dev_id = boost::get<CUDAPlace>(p).device;
      order_.emplace_back(dev_id);
      contexts_.emplace(dev_id, NCCLContext(dev_id));
    }
    PADDLE_ENFORCE_EQ(
        order_.size(), contexts_.size(),
        "NCCL Context Map does not support contain two or more same device");

108
    if (places.size() <= 1 && num_trainers == 1) {
T
typhoonzero 已提交
109 110 111
      return;
    }
    std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
W
Wu Yi 已提交
112
    // if num_trainers == 1, should create a new nccl id for local comms.
Y
Yancey1989 已提交
113
    if (num_trainers == 1 && nccl_id == nullptr) {
T
typhoonzero 已提交
114 115 116
      std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
      PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
          comms.get(), static_cast<int>(order_.size()), order_.data()));
T
typhoonzero 已提交
117
    } else {
W
Wu Yi 已提交
118
      PADDLE_ENFORCE_NOT_NULL(nccl_id);
Y
Yu Yang 已提交
119
      {
T
typhoonzero 已提交
120
        int nranks = num_trainers * order_.size();
T
typhoonzero 已提交
121
        NCCLGroupGuard gurad;
122 123 124 125 126 127 128 129 130 131
        for (size_t i = 0; i < order_.size(); ++i) {
          int gpu_id = order_[i];
          int rank;
          if (order_.size() > 1) {
            rank = trainer_id * order_.size() + i;
          } else {
            rank = trainer_id;
          }
          VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks
                   << "gpu id: " << gpu_id;
T
typhoonzero 已提交
132
          PADDLE_ENFORCE(cudaSetDevice(gpu_id));
T
testing  
typhoonzero 已提交
133
          PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
134
              comms.get() + i, nranks, *nccl_id, rank));
T
typhoonzero 已提交
135
        }
Y
Yu Yang 已提交
136
      }
Y
Yu Yang 已提交
137
    }
T
typhoonzero 已提交
138 139 140 141
    int i = 0;
    for (auto &dev_id : order_) {
      contexts_.at(dev_id).comm_ = comms[i++];
    }
Y
Yu Yang 已提交
142 143
  }

Y
Yu Yang 已提交
144 145 146
  NCCLContextMap(const NCCLContextMap &other) = delete;
  NCCLContextMap &operator=(const NCCLContextMap &other) = delete;

Y
Yu Yang 已提交
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
  CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }

  CUDADeviceContext *DevCtx(platform::Place p) const {
    return DevCtx(boost::get<CUDAPlace>(p).device);
  }

  const NCCLContext &at(platform::Place p) const {
    return this->at(boost::get<CUDAPlace>(p).device);
  }

  const NCCLContext &at(int dev_id) const { return contexts_.at(dev_id); }

  void WaitAll() {
    for (auto &p : contexts_) {
      p.second.ctx_->Wait();
    }
  }
};

Y
Yu Yang 已提交
166 167
}  // namespace platform
}  // namespace paddle
P
peizhilin 已提交
168
#endif