/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <glog/logging.h>
#include <gtest/gtest.h>
#include <memory>
#include <mutex>   // NOLINT
#include <thread>  // NOLINT
#include <vector>

#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h"

USE_NO_KERNEL_OP(ncclInit);
USE_CUDA_ONLY_OP(ncclAllReduce);
USE_CUDA_ONLY_OP(ncclReduce);
USE_CUDA_ONLY_OP(ncclBcast);

namespace f = paddle::framework;
namespace p = paddle::platform;

// Shape of the test data: 20 x 20 elements per tensor.
const f::DDim kDims = {20, 20};

// Common test fixture for the NCCL ops; it initializes the communicator.
class NCCLTester : public ::testing::Test {
 public:
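  // SetUp enumerates the visible GPUs, creates one CUDADeviceContext per
  // device, and then runs the ncclInit op once so that every test in this
  // fixture shares the communicator stored in g_scope_.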
  void SetUp() override {
    int count = p::GetCUDADeviceCount();
    if (count <= 0) {
      LOG(WARNING) << "Cannot test gpu nccl, because the CUDA device count is "
                   << count;
      exit(0);
    }
    for (int i = 0; i < count; ++i) {
      gpu_list_.emplace_back(i);
    }

    paddle::platform::CPUPlace cpu_place;
    for (size_t i = 0; i < gpu_list_.size(); ++i) {
      p::CUDAPlace place(i);
      dev_ctxs_.emplace_back(new p::CUDADeviceContext(place));
    }

    NCCLInitOp();
  }

  void TearDown() override {
    for (auto &device_context : dev_ctxs_) {
      delete device_context;
    }
  }

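  // Builds an ncclInit OpDesc by hand: the op reads the "p_scopes" variable
  // (one scope per GPU) and writes a p::Communicator into "comm" in the
  // global scope, which the collective ops below look up by name.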
  void NCCLInitOp() {
    paddle::platform::CPUPlace cpu_place;
    std::unique_ptr<f::OpDesc> op1(new f::OpDesc);

    op1->SetType("ncclInit");
    op1->SetInput("parallel_scopes", {"p_scopes"});
    op1->SetOutput("Communicator", {"comm"});

    auto *var = g_scope_.Var("comm");
    var->GetMutable<p::Communicator>();

    auto *scope_var = g_scope_.Var("p_scopes");
    auto *p_scopes = scope_var->GetMutable<std::vector<f::Scope *>>();
    (*p_scopes).resize(gpu_list_.size());

    auto op = f::OpRegistry::CreateOp(*op1);
    VLOG(1) << "invoke NCCLInitOp.";
    op->Run(g_scope_, cpu_place);
    VLOG(1) << "NCCLInitOp finished.";
  }

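  // Deterministic per-device payload: GPU i fills its send tensor with the
  // value i + 42, so the expected reduction results can be computed on the
  // host.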
  int GetGPUData(int gpu_id) { return gpu_id + 42; }

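  // Body of one worker thread: prepares the per-GPU scope and send/recv
  // tensors under the mutex, then runs the given collective op on that GPU.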
  template <class T>
  void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
    std::unique_lock<std::mutex> lk(mu_);
    const f::OpDesc *op1 = &op_desc;

    p::CUDAPlace place(gpu_id);
    auto &ctx = dev_ctxs_.at(gpu_id);

    auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
    auto *recv_tensor = scope->Var("rt")->GetMutable<f::LoDTensor>();

    if (!send_tensor->numel()) {
      send_tensor->mutable_data<T>(kDims, place);

      std::vector<T> send_vector(f::product(kDims), GetGPUData(gpu_id));
      paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
      VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
    }

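    // Release the lock before launching the collective: NCCL requires every
    // rank to enter the call, so serializing op->Run here could deadlock.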
    lk.unlock();

    PADDLE_ENFORCE_EQ(
        send_tensor->numel(), f::product(kDims),
        paddle::platform::errors::InvalidArgument("Tensor numel not match!"));

    auto op = f::OpRegistry::CreateOp(*op1);

    VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
    VLOG(1) << " send_tensor : " << send_tensor->numel()
            << " recv_tensor : " << recv_tensor->numel();
    op->Run(*scope, place);
    VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
  }

  void testNcclReduceOp();
  void testNcclAllReduceOp();
  void testNcclBcastOp();

 public:
  std::vector<p::DeviceContext *> dev_ctxs_;
  f::Scope g_scope_;
  std::mutex mu_;
  std::vector<int> gpu_list_;
};

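// AllReduce: every device contributes its GetGPUData value, and every
// device's "rt" tensor is expected to hold the sum over all GPUs.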
void NCCLTester::testNcclAllReduceOp() {
  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
  op2->SetType("ncclAllReduce");
  op2->SetInput("X", {"st"});
  op2->SetInput("Communicator", {"comm"});
  op2->SetOutput("Out", {"rt"});

  std::vector<f::Scope *> dev_scopes;

  std::vector<std::thread> ths;

  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    dev_scopes.emplace_back(&g_scope_.NewScope());
    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
                   *op2.get(), dev_scopes[i]);
    ths.emplace_back(std::move(th));
  }

  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    ths[i].join();
  }

  float expected_result = 0.0;
  for (int gpu_id : gpu_list_) {
    expected_result = expected_result + GetGPUData(gpu_id);
  }

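  // Copy each device's result back to the host through that device's stream
  // and compare it element-wise against the expected sum.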
  for (size_t i = 0; i < dev_scopes.size(); ++i) {
    p::CPUPlace cpu_place;
    p::CUDAPlace gpu_place(gpu_list_[i]);

    auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
    auto *rt = recv_tensor.data<float>();
    auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
    result_tensor->Resize(kDims);
    auto *ct = result_tensor->mutable_data<float>(cpu_place);

    paddle::memory::Copy(
        cpu_place, ct, p::CUDAPlace(gpu_list_[i]), rt,
        recv_tensor.numel() * sizeof(float),
        static_cast<p::CUDADeviceContext *>(dev_ctxs_[i])->stream());

    for (int64_t j = 0; j < f::product(kDims); ++j) {
      ASSERT_NEAR(ct[j], expected_result, 1e-5);
    }
  }
}

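// Reduce: all devices contribute, but only the root device (kRoot) is
// expected to end up with the summed result in its "rt" tensor.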
void NCCLTester::testNcclReduceOp() {
  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
  const int kRoot = 0;
  op2->SetType("ncclReduce");
  op2->SetInput("X", {"st"});
  op2->SetInput("Communicator", {"comm"});
  op2->SetOutput("Out", {"rt"});
  op2->SetAttr("root", kRoot);

  std::vector<f::Scope *> dev_scopes;

  std::vector<std::thread> ths;

  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    dev_scopes.emplace_back(&g_scope_.NewScope());
    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
                   *op2.get(), dev_scopes[i]);
    ths.emplace_back(std::move(th));
  }

  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    ths[i].join();
  }

  float expected_result = 0.0;
  for (int gpu_id : gpu_list_) {
    expected_result = expected_result + GetGPUData(gpu_id);
  }

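  // Only the root scope is checked: pull its "rt" tensor back to the host
  // (the null stream here requests a blocking copy) and compare each element
  // with the expected sum.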
  p::CPUPlace cpu_place;
  p::CUDAPlace gpu_place(gpu_list_[kRoot]);

  auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
  auto *rt = recv_tensor.data<float>();
  auto *result_tensor =
      dev_scopes[kRoot]->Var("ct")->GetMutable<f::LoDTensor>();
  result_tensor->Resize(kDims);
  auto *ct = result_tensor->mutable_data<float>(cpu_place);

  paddle::memory::Copy(cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt,
                       recv_tensor.numel() * sizeof(float), nullptr);

  for (int64_t j = 0; j < f::product(kDims); ++j) {
    ASSERT_NEAR(ct[j], expected_result, 1e-5);
  }
}

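// Bcast: the root device broadcasts its payload, so the last device in
// gpu_list_ is expected to observe the root's GetGPUData value.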
void NCCLTester::testNcclBcastOp() {
  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
  const int kRoot = 0;
  op2->SetType("ncclBcast");
  op2->SetInput("X", {"st"});
  op2->SetInput("Communicator", {"comm"});
  op2->SetOutput("Out", {"rt"});
  op2->SetAttr("root", kRoot);

  std::vector<f::Scope *> dev_scopes;

  std::vector<std::thread> ths;

  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    dev_scopes.emplace_back(&g_scope_.NewScope());
    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
                   *op2.get(), dev_scopes[i]);
    ths.emplace_back(std::move(th));
  }

  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    ths[i].join();
  }

  const int idx = gpu_list_.size() - 1;
  float result = GetGPUData(kRoot);

  p::CPUPlace cpu_place;
  p::CUDAPlace gpu_place(gpu_list_[idx]);

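  // On the root device the broadcast source is the send tensor "st", so that
  // is the variable to check when the last device happens to be the root.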
  std::string rt_str = "rt";
  if (idx == kRoot) {
    rt_str = "st";
  }
  auto &recv_tensor = dev_scopes[idx]->FindVar(rt_str)->Get<f::LoDTensor>();
  auto *rt = recv_tensor.data<float>();
  auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
  result_tensor->Resize(kDims);
  auto *ct = result_tensor->mutable_data<float>(cpu_place);

  paddle::memory::Copy(
      cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
      recv_tensor.numel() * sizeof(float),
      static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());

  for (int64_t j = 0; j < f::product(kDims); ++j) {
    ASSERT_NEAR(ct[j], result, 1e-5);
  }
}

// ncclInitOp with desc
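// The body is intentionally empty: SetUp already runs ncclInit, so this test
// only verifies that communicator initialization succeeds.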
TEST_F(NCCLTester, ncclInitOp) {}

TEST_F(NCCLTester, ncclOp) {
  // Serial execution is required for the same nccl comm.

  // ncclAllReduceOp with desc
  // TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367
  testNcclReduceOp();

  testNcclAllReduceOp();

  // ncclBcastOp with desc
  // TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540
  testNcclBcastOp();
}