/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#define EIGEN_USE_GPU
#include <functional>

#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/nccl/nccl_gpu_common.h"

namespace paddle {
namespace operators {

using framework::Tensor;
using platform::Communicator;
using framework::LoDTensor;

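// NCCLTypeWrapper maps an element type to the matching ncclDataType_t so the
// kernels below can stay templated on T.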
template <typename Type>
class NCCLTypeWrapper;

template <>
class NCCLTypeWrapper<float> {
 public:
  static const ncclDataType_t type = ncclFloat;
};

template <>
class NCCLTypeWrapper<double> {
 public:
  static const ncclDataType_t type = ncclDouble;
};

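// AllReduce: every GPU in the communicator contributes its "X" tensors and
// receives the element-wise result (sum/min/max/prod, chosen by the
// "reduction" attribute) in the corresponding "Out" tensors.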
template <typename T>
class NCCLAllReduceKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "This kernel only runs on GPU device.");

    auto ins = ctx.MultiInput<LoDTensor>("X");
    auto outs = ctx.MultiOutput<LoDTensor>("Out");

    std::string reduction = ctx.Attr<std::string>("reduction");

    ncclRedOp_t reduction_op_ = ncclSum;

    if (reduction == "ncclMin") {
      reduction_op_ = ncclMin;
    } else if (reduction == "ncclMax") {
      reduction_op_ = ncclMax;
    } else if (reduction == "ncclSum") {
      reduction_op_ = ncclSum;
    } else if (reduction == "ncclProd") {
      reduction_op_ = ncclProd;
    } else {
      PADDLE_ENFORCE(false,
                     "Invalid reduction; expected ncclSum, ncclMin, ncclMax "
                     "or ncclProd.");
    }

    auto* comm = ctx.Input<Communicator>("Communicator");

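    // Issue the NCCL call on the CUDA stream owned by this device context and
    // synchronize that stream after every collective.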
    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
                      ctx.device_context())
                      .stream();

    // device id
    int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
    int idx = comm->GetCommId(gpu_id);

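    // AllReduce each input tensor; inputs and outputs are paired by index.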
    for (size_t i = 0; i < ins.size(); ++i) {
      VLOG(1) << "gpu : " << gpu_id << " invoke allreduce. send "
              << ins[i]->numel() << " recv " << outs[i]->numel();

      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
          ins[i]->data<T>(), outs[i]->mutable_data<T>(ctx.GetPlace()),
          outs[i]->numel(), NCCLTypeWrapper<T>::type, reduction_op_,
          comm->comms_[idx], stream));
      PADDLE_ENFORCE(cudaStreamSynchronize(stream));

      VLOG(1) << "gpu : " << gpu_id << " finished allreduce. send "
              << ins[i]->numel() << " recv " << outs[i]->numel();
    }
  }
};

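// Reduce: every GPU contributes its "X" tensors, but only the root GPU
// receives the reduced result in "Out".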
template <typename T>
class NCCLReduceKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "This kernel only runs on GPU device.");

    auto ins = ctx.MultiInput<LoDTensor>("X");  // x0, x1, x2
    auto outs = ctx.MultiOutput<LoDTensor>("Out");
    int root = ctx.Attr<int>("root");

    auto* comm = ctx.Input<Communicator>("Communicator");

    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
                      ctx.device_context())
                      .stream();
    // device id
    int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
    int idx = comm->GetCommId(gpu_id);

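    // When "root" is kInvalidGPUId, hash the input variable name so that
    // every rank independently chooses the same root for that tensor.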
    auto ins_names = ctx.Inputs("X");
    std::hash<std::string> hasher;
    for (size_t i = 0; i < ins.size(); ++i) {
      if (root == platform::kInvalidGPUId) {
        root = hasher(ins_names[i]) % comm->comms_.size();
      }
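      // Only the root rank needs a receive buffer; non-root ranks pass
      // nullptr to ncclReduce.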
      T* recvbuffer = nullptr;
      if (root == gpu_id) {
        recvbuffer = outs[i]->mutable_data<T>(ctx.GetPlace());
      }

      VLOG(1) << "gpu : " << gpu_id << " invoke reduce. send "
              << ins[i]->numel() << " recv " << outs[i]->numel();

      PADDLE_ENFORCE(platform::dynload::ncclReduce(
          ins[i]->data<T>(), recvbuffer, ins[i]->numel(),
          NCCLTypeWrapper<T>::type, ncclSum, root, comm->comms_[idx], stream));
      PADDLE_ENFORCE(cudaStreamSynchronize(stream));

      VLOG(1) << "gpu : " << gpu_id << " finished reduce. send "
              << ins[i]->numel() << " recv " << outs[i]->numel();
    }
  }
};

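// Bcast: the root GPU broadcasts its "X" tensors; every other GPU receives
// the data into its "Out" tensors.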
template <typename T>
class NCCLBcastKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "This kernel only runs on GPU device.");

    int root = ctx.Attr<int>("root");

    auto* comm = ctx.Input<Communicator>("Communicator");

    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
                      ctx.device_context())
                      .stream();
    // device id
    int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
    int idx = comm->GetCommId(gpu_id);

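    // The root rank sends from its inputs; all other ranks receive into
    // their outputs.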
    if (idx == root) {
      auto ins = ctx.MultiInput<LoDTensor>("X");
      for (size_t i = 0; i < ins.size(); ++i) {
        VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. send "
                << ins[i]->numel();

        VLOG(1) << " before ncclBcast";
        PADDLE_ENFORCE(platform::dynload::ncclBcast(
            (void*)ins[i]->data<T>(), ins[i]->numel(), NCCLTypeWrapper<T>::type,
            root, comm->comms_[idx], stream));
        VLOG(1) << " after ncclBcast";
        PADDLE_ENFORCE(cudaStreamSynchronize(stream));

        VLOG(1) << "gpu : " << gpu_id << " finished Bcast.";
      }
    } else {
      auto outs = ctx.MultiOutput<LoDTensor>("Out");
      for (size_t i = 0; i < outs.size(); ++i) {
        VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
                << framework::product(outs[i]->dims());

        PADDLE_ENFORCE(platform::dynload::ncclBcast(
            outs[i]->mutable_data<T>(ctx.GetPlace()), outs[i]->numel(),
            NCCLTypeWrapper<T>::type, root, comm->comms_[idx], stream));
        PADDLE_ENFORCE(cudaStreamSynchronize(stream));

        VLOG(1) << "gpu : " << gpu_id << " finished Bcast. recv "
                << outs[i]->numel();
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
REGISTER_OP_GPU_KERNEL(ncclBcast, ops::NCCLBcastKernel<float>);
REGISTER_OP_GPU_KERNEL(ncclReduce, ops::NCCLReduceKernel<float>);