nccl_op_test.cu.cc 8.5 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
D
Dong Zhihong 已提交
14

D
Dong Zhihong 已提交
15 16
#include <glog/logging.h>
#include <gtest/gtest.h>
D
Dong Zhihong 已提交
17
#include <memory>
D
Dong Zhihong 已提交
18 19
#include <mutex>
#include <thread>
D
Dong Zhihong 已提交
20
#include <vector>
D
Dong Zhihong 已提交
21

Y
Yi Wang 已提交
22 23 24 25 26 27 28 29 30
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
D
Dong Zhihong 已提交
31

D
Dong Zhihong 已提交
32
// The "ncclInit" operator is created by name in NCCLTester::NCCLInitOp().
// NOTE(review): presumably this macro forces the operator's registration to be
// linked into the test binary -- confirm against op_registry.h.
USE_NO_KERNEL_OP(ncclInit);
Q
QI JUN 已提交
33 34 35
// Collective operators exercised by the test cases in this file; each one is
// instantiated by name through f::OpRegistry::CreateOp.
// NOTE(review): presumably these macros pull in the CUDA-only kernel
// registrations -- confirm against op_registry.h.
USE_CUDA_ONLY_OP(ncclAllReduce);
USE_CUDA_ONLY_OP(ncclReduce);
USE_CUDA_ONLY_OP(ncclBcast);
D
Dong Zhihong 已提交
36

D
Dong Zhihong 已提交
37 38 39
// Short aliases used throughout this file: `f` for the framework layer
// (scopes, tensors, op descriptions) and `p` for the platform layer (places,
// device contexts).
namespace f = paddle::framework;
namespace p = paddle::platform;

D
Dong Zhihong 已提交
40
// test data amount
Q
QI JUN 已提交
41
// Shape given to every send tensor and to the host-side result tensor in
// these tests (20 x 20 elements).
const f::DDim kDims = {20, 20};
D
Dong Zhihong 已提交
42

D
Dong Zhihong 已提交
43 44 45 46
// nccl op common tester, init communicator.
class NCCLTester : public ::testing::Test {
 public:
  virtual void SetUp() override {
Q
QI JUN 已提交
47 48 49 50 51 52 53 54 55 56 57
    int count = p::GetCUDADeviceCount();
    if (count <= 1) {
      LOG(WARNING)
          << "Cannot test multi-gpu nccl, because the CUDA device count is "
          << count;
      exit(0);
    }
    for (int i = 0; i < count; ++i) {
      gpu_list_.emplace_back(i);
    }

D
dzhwinter 已提交
58
    paddle::platform::CPUPlace cpu_place;
Q
QI JUN 已提交
59
    for (size_t i = 0; i < gpu_list_.size(); ++i) {
D
dzhwinter 已提交
60
      p::CUDAPlace place(i);
Q
QI JUN 已提交
61
      dev_ctxs_.emplace_back(new p::CUDADeviceContext(place));
D
Dong Zhihong 已提交
62 63 64 65
    }

    NCCLInitOp();
  }
D
Dong Zhihong 已提交
66

D
Dong Zhihong 已提交
67
  virtual void TearDown() override {
Q
QI JUN 已提交
68
    for (auto &device_context : dev_ctxs_) {
D
Dong Zhihong 已提交
69 70 71
      delete device_context;
    }
  }
D
Dong Zhihong 已提交
72

D
Dong Zhihong 已提交
73
  void NCCLInitOp() {
D
dzhwinter 已提交
74
    paddle::platform::CPUPlace cpu_place;
Y
Yu Yang 已提交
75
    std::unique_ptr<f::OpDesc> op1(new f::OpDesc);
D
Dong Zhihong 已提交
76

D
Dong Zhihong 已提交
77
    op1->SetType("ncclInit");
Q
QI JUN 已提交
78
    op1->SetInput("parallel_scopes", {"p_scopes"});
D
Dong Zhihong 已提交
79
    op1->SetOutput("Communicator", {"comm"});
D
Dong Zhihong 已提交
80

Q
QI JUN 已提交
81
    auto *var = g_scope_.Var("comm");
D
Dong Zhihong 已提交
82
    var->GetMutable<p::Communicator>();
83

Q
QI JUN 已提交
84 85 86 87
    auto *scope_var = g_scope_.Var("p_scopes");
    auto *p_scopes = scope_var->GetMutable<std::vector<f::Scope *>>();
    (*p_scopes).resize(gpu_list_.size());

D
Dong Zhihong 已提交
88 89
    auto op = f::OpRegistry::CreateOp(*op1);
    VLOG(1) << "invoke NCCLInitOp.";
Q
QI JUN 已提交
90
    op->Run(g_scope_, cpu_place);
D
Dong Zhihong 已提交
91 92 93
    VLOG(1) << "NCCLInitOp finished.";
  }

Q
QI JUN 已提交
94 95
  int GetGPUData(int gpu_id) { return gpu_id + 42; }

D
Dong Zhihong 已提交
96
  template <class T>
Y
Yu Yang 已提交
97
  void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
Q
QI JUN 已提交
98
    std::unique_lock<std::mutex> lk(mu_);
Y
Yu Yang 已提交
99
    const f::OpDesc *op1 = &op_desc;
D
Dong Zhihong 已提交
100

D
dzhwinter 已提交
101
    p::CUDAPlace place(gpu_id);
Q
QI JUN 已提交
102
    auto &ctx = dev_ctxs_.at(gpu_id);
D
Dong Zhihong 已提交
103 104 105 106

    auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
    auto *recv_tensor = scope->Var("rt")->GetMutable<f::LoDTensor>();

D
Dong Zhihong 已提交
107 108 109
    if (!send_tensor->numel()) {
      send_tensor->mutable_data<T>(kDims, place);

Q
QI JUN 已提交
110
      std::vector<T> send_vector(f::product(kDims), GetGPUData(gpu_id));
Y
Yi Wang 已提交
111
      paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
D
Dong Zhihong 已提交
112 113 114
      VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
    }

D
Dong Zhihong 已提交
115
    lk.unlock();
D
Dong Zhihong 已提交
116

D
Dong Zhihong 已提交
117 118 119 120
    PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims),
                   "Tensor numel not match!");

    auto op = f::OpRegistry::CreateOp(*op1);
D
Dong Zhihong 已提交
121

D
Dong Zhihong 已提交
122
    VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
D
Dong Zhihong 已提交
123 124
    VLOG(1) << " send_tensor : " << send_tensor->numel()
            << " recv_tensor : " << recv_tensor->numel();
D
dzhwinter 已提交
125
    op->Run(*scope, place);
D
Dong Zhihong 已提交
126 127
    VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
  }
128

D
Dong Zhihong 已提交
129
 public:
Q
QI JUN 已提交
130 131 132 133
  std::vector<p::DeviceContext *> dev_ctxs_;
  f::Scope g_scope_;
  std::mutex mu_;
  std::vector<int> gpu_list_;
D
Dong Zhihong 已提交
134 135
};

136
// ncclInitOp with desc
Q
QI JUN 已提交
137
// Intentionally empty: the fixture's SetUp() already runs NCCLInitOp(), so
// this case only checks that the fixture can be set up and torn down.
TEST_F(NCCLTester, ncclInitOp) {}
138 139 140

// ncclAllReduceOp with desc
// Runs "ncclAllReduce" concurrently on every GPU (one thread per device) and
// checks that each device's output "rt" equals the sum of all devices' inputs.
TEST_F(NCCLTester, ncclAllReduceOp) {
  f::OpDesc op_desc;
  op_desc.SetType("ncclAllReduce");
  op_desc.SetInput("X", {"st"});
  op_desc.SetInput("Communicator", {"comm"});
  op_desc.SetOutput("Out", {"rt"});

  std::vector<f::Scope *> dev_scopes;
  std::vector<std::thread> ths;

  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    dev_scopes.emplace_back(&g_scope_.NewScope());
    // std::thread stores its own copy of op_desc, as the original code did.
    ths.emplace_back(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
                     op_desc, dev_scopes[i]);
  }

  for (auto &th : ths) {
    th.join();
  }

  // Every element on device g was filled with GetGPUData(g), so after the
  // all-reduce every element must hold the sum over all devices.
  float expected_result = 0.0;
  for (int gpu_id : gpu_list_) {
    expected_result = expected_result + GetGPUData(gpu_id);
  }

  p::CPUPlace cpu_place;
  const int64_t numel = f::product(kDims);

  for (size_t i = 0; i < dev_scopes.size(); ++i) {
    p::CUDAPlace gpu_place(gpu_list_[i]);

    auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
    auto *rt = recv_tensor.data<float>();
    auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
    result_tensor->Resize(kDims);
    auto *ct = result_tensor->mutable_data<float>(cpu_place);

    // NOTE(review): the operator presumably ran on a different context than
    // dev_ctxs_[i]; if it does not synchronise internally, that context may
    // need a Wait() before this copy -- confirm.
    paddle::memory::Copy(
        cpu_place, ct, gpu_place, rt, recv_tensor.numel() * sizeof(float),
        static_cast<p::CUDADeviceContext *>(dev_ctxs_[i])->stream());
    // The copy is only enqueued on the stream; make sure it has finished
    // before the host buffer is inspected.
    dev_ctxs_[i]->Wait();

    for (int64_t j = 0; j < numel; ++j) {
      ASSERT_NEAR(ct[j], expected_result, 1e-5);
    }
  }
}
D
Dong Zhihong 已提交
187

D
dzhwinter 已提交
188
// ncclReduceOp with desc
189
// Runs "ncclReduce" concurrently on every GPU (one thread per device) with
// root 0 and checks that the root's output "rt" equals the sum of all
// devices' inputs.
TEST_F(NCCLTester, ncclReduceOp) {
  const int kRoot = 0;

  f::OpDesc op_desc;
  op_desc.SetType("ncclReduce");
  op_desc.SetInput("X", {"st"});
  op_desc.SetInput("Communicator", {"comm"});
  op_desc.SetOutput("Out", {"rt"});
  op_desc.SetAttr("root", kRoot);

  std::vector<f::Scope *> dev_scopes;
  std::vector<std::thread> ths;

  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    dev_scopes.emplace_back(&g_scope_.NewScope());
    // std::thread stores its own copy of op_desc, as the original code did.
    ths.emplace_back(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
                     op_desc, dev_scopes[i]);
  }

  for (auto &th : ths) {
    th.join();
  }

  // Every element on device g was filled with GetGPUData(g); the root must
  // end up with the sum over all devices.
  float expected_result = 0.0;
  for (int gpu_id : gpu_list_) {
    expected_result = expected_result + GetGPUData(gpu_id);
  }

  p::CPUPlace cpu_place;
  p::CUDAPlace gpu_place(gpu_list_[kRoot]);

  auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
  auto *rt = recv_tensor.data<float>();
  auto *result_tensor =
      dev_scopes[kRoot]->Var("ct")->GetMutable<f::LoDTensor>();
  result_tensor->Resize(kDims);
  auto *ct = result_tensor->mutable_data<float>(cpu_place);

  // NOTE(review): the operator presumably ran on a different context than
  // dev_ctxs_[kRoot]; if it does not synchronise internally, that context may
  // need a Wait() before this copy -- confirm.
  paddle::memory::Copy(
      cpu_place, ct, gpu_place, rt, recv_tensor.numel() * sizeof(float),
      static_cast<p::CUDADeviceContext *>(dev_ctxs_[kRoot])->stream());
  // The copy is only enqueued on the stream; make sure it has finished before
  // the host buffer is inspected.
  dev_ctxs_[kRoot]->Wait();

  const int64_t numel = f::product(kDims);
  for (int64_t j = 0; j < numel; ++j) {
    ASSERT_NEAR(ct[j], expected_result, 1e-5);
  }
}

D
dzhwinter 已提交
238
// ncclBcastOp with desc
H
Helin Wang 已提交
239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286
// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540
/*
TEST_F(NCCLTester, ncclBcastOp) {
  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
  const int kRoot = 0;
  op2->SetType("ncclBcast");
  op2->SetInput("X", {"st"});
  op2->SetInput("Communicator", {"comm"});
  op2->SetOutput("Out", {"rt"});
  op2->SetAttr("root", kRoot);

  std::vector<f::Scope *> dev_scopes;

  std::vector<std::thread> ths;

  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    dev_scopes.emplace_back(&g_scope_.NewScope());
    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
                   *op2.get(), dev_scopes[i]);
    ths.emplace_back(std::move(th));
  }

  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    ths[i].join();
  }

  const int idx = 1;
  float result = GetGPUData(kRoot);

  p::CPUPlace cpu_place;
  p::CUDAPlace gpu_place(gpu_list_[idx]);

  auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
  auto *rt = recv_tensor.data<float>();
  auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
  result_tensor->Resize(kDims);
  auto *ct = result_tensor->mutable_data<float>(cpu_place);

  paddle::memory::Copy(
      cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
      recv_tensor.numel() * sizeof(float),
      static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());

  for (int64_t j = 0; j < f::product(kDims); ++j) {
    ASSERT_NEAR(ct[j], result, 1e-5);
  }
}
*/