/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <glog/logging.h>
#include <gtest/gtest.h>
#include <memory>
#include <mutex>   // NOLINT
#include <thread>  // NOLINT
#include <vector>

#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"

USE_NO_KERNEL_OP(ncclInit);
USE_CUDA_ONLY_OP(ncclAllReduce);
USE_CUDA_ONLY_OP(ncclReduce);
USE_CUDA_ONLY_OP(ncclBcast);

namespace f = paddle::framework;
namespace p = paddle::platform;

// Shape of the test data: each tensor holds 20 x 20 elements.
const f::DDim kDims = {20, 20};

// Common test fixture for the NCCL ops: enumerates the available GPUs and
// initializes a shared NCCL communicator.
class NCCLTester : public ::testing::Test {
 public:
  void SetUp() override {
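    // These tests need at least two GPUs; otherwise warn and exit early.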
    int count = p::GetCUDADeviceCount();
    if (count <= 1) {
      LOG(WARNING)
          << "Cannot test multi-gpu nccl, because the CUDA device count is "
          << count;
      exit(0);
    }
    for (int i = 0; i < count; ++i) {
      gpu_list_.emplace_back(i);
    }

    paddle::platform::CPUPlace cpu_place;
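    // Create one CUDA device context per detected GPU.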
    for (size_t i = 0; i < gpu_list_.size(); ++i) {
      p::CUDAPlace place(i);
      dev_ctxs_.emplace_back(new p::CUDADeviceContext(place));
    }

    NCCLInitOp();
  }

  void TearDown() override {
    for (auto &device_context : dev_ctxs_) {
      delete device_context;
    }
  }

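  // Sets up the ncclInit op: creates the "comm" Communicator variable and the
  // vector of per-GPU scopes ("p_scopes") in the global scope, then runs the
  // op on the CPU to initialize the communicator.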
  void NCCLInitOp() {
    paddle::platform::CPUPlace cpu_place;
    std::unique_ptr<f::OpDesc> op1(new f::OpDesc);

    op1->SetType("ncclInit");
    op1->SetInput("parallel_scopes", {"p_scopes"});
    op1->SetOutput("Communicator", {"comm"});

    auto *var = g_scope_.Var("comm");
    var->GetMutable<p::Communicator>();

    auto *scope_var = g_scope_.Var("p_scopes");
    auto *p_scopes = scope_var->GetMutable<std::vector<f::Scope *>>();
    (*p_scopes).resize(gpu_list_.size());

    auto op = f::OpRegistry::CreateOp(*op1);
    VLOG(1) << "invoke NCCLInitOp.";
    op->Run(g_scope_, cpu_place);
    VLOG(1) << "NCCLInitOp finished.";
  }

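  // Gives each GPU a distinct, deterministic payload so reduction results can
  // be verified exactly.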
  int GetGPUData(int gpu_id) { return gpu_id + 42; }

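  // Executed by one thread per GPU: fills the send tensor "st" with
  // GetGPUData(gpu_id) and runs the given op to produce the recv tensor "rt".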
  template <class T>
  void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
    std::unique_lock<std::mutex> lk(mu_);
    const f::OpDesc *op1 = &op_desc;

    p::CUDAPlace place(gpu_id);
    auto &ctx = dev_ctxs_.at(gpu_id);

    auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
    auto *recv_tensor = scope->Var("rt")->GetMutable<f::LoDTensor>();

    if (!send_tensor->numel()) {
      send_tensor->mutable_data<T>(kDims, place);

      std::vector<T> send_vector(f::product(kDims), GetGPUData(gpu_id));
      paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
      VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
    }

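    // Setup is done; release the lock so every thread can launch the
    // collective op concurrently.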
    lk.unlock();

    PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims),
                   "Tensor numel does not match!");

    auto op = f::OpRegistry::CreateOp(*op1);

    VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
    VLOG(1) << " send_tensor : " << send_tensor->numel()
            << " recv_tensor : " << recv_tensor->numel();
    op->Run(*scope, place);
    VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
  }

 public:
  std::vector<p::DeviceContext *> dev_ctxs_;
  f::Scope g_scope_;
  std::mutex mu_;
  std::vector<int> gpu_list_;
};

// ncclInitOp with desc
TEST_F(NCCLTester, ncclInitOp) {}

// ncclAllReduceOp with desc
// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367
/*
TEST_F(NCCLTester, ncclAllReduceOp) {
  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
  op2->SetType("ncclAllReduce");
  op2->SetInput("X", {"st"});
  op2->SetInput("Communicator", {"comm"});
  op2->SetOutput("Out", {"rt"});

  std::vector<f::Scope *> dev_scopes;

  std::vector<std::thread> ths;

  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    dev_scopes.emplace_back(&g_scope_.NewScope());
    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
                   *op2.get(), dev_scopes[i]);
    ths.emplace_back(std::move(th));
  }

  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    ths[i].join();
  }

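  // Every device is expected to end up with the element-wise sum of all
  // GPUs' inputs.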
  float expected_result = 0.0;
  for (int gpu_id : gpu_list_) {
    expected_result = expected_result + GetGPUData(gpu_id);
  }

  for (size_t i = 0; i < dev_scopes.size(); ++i) {
    p::CPUPlace cpu_place;
    p::CUDAPlace gpu_place(gpu_list_[i]);

    auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
    auto *rt = recv_tensor.data<float>();
    auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
    result_tensor->Resize(kDims);
    auto *ct = result_tensor->mutable_data<float>(cpu_place);

    paddle::memory::Copy(
        cpu_place, ct, p::CUDAPlace(gpu_list_[i]), rt,
        recv_tensor.numel() * sizeof(float),
        static_cast<p::CUDADeviceContext *>(dev_ctxs_[i])->stream());

    for (int64_t j = 0; j < f::product(kDims); ++j) {
      ASSERT_NEAR(ct[j], expected_result, 1e-5);
    }
  }
}
*/

// ncclReduceOp with desc
TEST_F(NCCLTester, ncclReduceOp) {
  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
  const int kRoot = 0;
  op2->SetType("ncclReduce");
  op2->SetInput("X", {"st"});
  op2->SetInput("Communicator", {"comm"});
  op2->SetOutput("Out", {"rt"});
  op2->SetAttr("root", kRoot);

  std::vector<f::Scope *> dev_scopes;

  std::vector<std::thread> ths;

  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    dev_scopes.emplace_back(&g_scope_.NewScope());
    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
                   *op2.get(), dev_scopes[i]);
    ths.emplace_back(std::move(th));
  }

  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    ths[i].join();
  }

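  // The root device is expected to receive the element-wise sum of every
  // GPU's input.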
  float expected_result = 0.0;
  for (int gpu_id : gpu_list_) {
    expected_result = expected_result + GetGPUData(gpu_id);
  }

  p::CPUPlace cpu_place;
  p::CUDAPlace gpu_place(gpu_list_[kRoot]);

  auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
  auto *rt = recv_tensor.data<float>();
  auto *result_tensor =
      dev_scopes[kRoot]->Var("ct")->GetMutable<f::LoDTensor>();
  result_tensor->Resize(kDims);
  auto *ct = result_tensor->mutable_data<float>(cpu_place);

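  // Copy the reduced tensor from the root GPU back to the host for checking.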
  paddle::memory::Copy(
      cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt,
      recv_tensor.numel() * sizeof(float),
      static_cast<p::CUDADeviceContext *>(dev_ctxs_[kRoot])->stream());

  for (int64_t j = 0; j < f::product(kDims); ++j) {
    ASSERT_NEAR(ct[j], expected_result, 1e-5);
  }
}

// ncclBcastOp with desc
// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540
/*
TEST_F(NCCLTester, ncclBcastOp) {
  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
  const int kRoot = 0;
  op2->SetType("ncclBcast");
  op2->SetInput("X", {"st"});
  op2->SetInput("Communicator", {"comm"});
  op2->SetOutput("Out", {"rt"});
  op2->SetAttr("root", kRoot);

  std::vector<f::Scope *> dev_scopes;

  std::vector<std::thread> ths;

  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    dev_scopes.emplace_back(&g_scope_.NewScope());
    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
                   *op2.get(), dev_scopes[i]);
    ths.emplace_back(std::move(th));
  }

  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    ths[i].join();
  }

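  // After the broadcast every device should hold the root's data; check GPU 1.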
  const int idx = 1;
  float result = GetGPUData(kRoot);

  p::CPUPlace cpu_place;
  p::CUDAPlace gpu_place(gpu_list_[idx]);

  auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
  auto *rt = recv_tensor.data<float>();
  auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
  result_tensor->Resize(kDims);
  auto *ct = result_tensor->mutable_data<float>(cpu_place);

  paddle::memory::Copy(
      cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
      recv_tensor.numel() * sizeof(float),
      static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());

  for (int64_t j = 0; j < f::product(kDims); ++j) {
    ASSERT_NEAR(ct[j], result, 1e-5);
  }
}
*/