Commit 61c1b046 authored by Dong Zhihong

"fix multigpu testcase"

Parent 38d3adfe
...@@ -142,18 +142,26 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
     if (idx == root) {
       auto ins = ctx.MultiInput<LoDTensor>("X");
       for (size_t i = 0; i < ins.size(); ++i) {
+        VLOG(1) << " invoke Bcast. send " << ins[i]->numel();
         PADDLE_ENFORCE(platform::dynload::ncclBcast(
             (void*)ins[i]->data<T>(), ins[i]->numel(), NCCLTypeWrapper<T>::type,
             root, comm->comms_[idx], stream));
         PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+        VLOG(1) << " finished Bcast.";
       }
     } else {
       auto outs = ctx.MultiOutput<LoDTensor>("Out");
       for (size_t i = 0; i < outs.size(); ++i) {
+        VLOG(1) << " invoke Bcast. recv. ";
         PADDLE_ENFORCE(platform::dynload::ncclBcast(
             outs[i]->mutable_data<T>(ctx.GetPlace()), outs[i]->numel(),
             NCCLTypeWrapper<T>::type, root, comm->comms_[idx], stream));
         PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+        VLOG(1) << " finished Bcast. recv " << outs[i]->numel();
       }
     }
   }
...
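The VLOG(1) lines added to the broadcast kernel above only produce output when glog's verbose level is at least 1. Below is a minimal sketch of turning that on programmatically, assuming a glog-linked binary (as Paddle's operator tests are); everything outside the two FLAGS_ assignments is illustrative scaffolding:

#include <glog/logging.h>

int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = true;  // print to stderr rather than log files
  FLAGS_v = 1;               // enable VLOG(1) and below
  VLOG(1) << "verbose logging is now visible";
  return 0;
}

Setting the environment variable GLOG_v=1 before running achieves the same effect without code changes.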
...@@ -123,73 +123,71 @@ class NCCLTester : public ::testing::Test {
 };

 // ncclInitOp with desc
-// TEST(NCCL, ncclInitOp) {
-//   std::unique_ptr<f::OpDescBind> op_desc(new f::OpDescBind);
-//   op_desc->SetType("ncclInit");
-//   op_desc->SetOutput("Communicator", {"x1"});
-//   op_desc->SetAttr("gpus", {gpu_list});
-//   f::Scope g_scope;
-//   std::unique_ptr<p::DeviceContext> ctx(new
-//   p::CPUDeviceContext(p::CPUPlace()));
-//   auto *var = g_scope.Var("x1");
-//   var->GetMutable<p::Communicator>();
-//   auto op = f::OpRegistry::CreateOp(*op_desc);
-//   VLOG(1) << "invoke NCCLInitOp.";
-//   op->Run(g_scope, *ctx.get());
-//   VLOG(1) << "NCCLInitOp finished.";
-// }
+TEST(NCCL, ncclInitOp) {
+  std::unique_ptr<f::OpDescBind> op_desc(new f::OpDescBind);
+  op_desc->SetType("ncclInit");
+  op_desc->SetOutput("Communicator", {"x1"});
+  op_desc->SetAttr("gpus", {gpu_list});
+  f::Scope g_scope;
+  std::unique_ptr<p::DeviceContext> ctx(new p::CPUDeviceContext(p::CPUPlace()));
+  auto *var = g_scope.Var("x1");
+  var->GetMutable<p::Communicator>();
+  auto op = f::OpRegistry::CreateOp(*op_desc);
+  VLOG(1) << "invoke NCCLInitOp.";
+  op->Run(g_scope, *ctx.get());
+  VLOG(1) << "NCCLInitOp finished.";
+}

 // ncclAllReduceOp with desc
-// TEST_F(NCCLTester, ncclAllReduceOp) {
-//   std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
-//   op2->SetType("ncclAllReduce");
-//   op2->SetInput("X", {"st"});
-//   op2->SetInput("Communicator", {"comm"});
-//   op2->SetOutput("Out", {"rt"});
-//   std::vector<f::Scope *> dev_scopes;
-//   std::vector<std::thread> ths;
-//   for (size_t i = 0; i < gpu_list.size(); ++i) {
-//     dev_scopes.emplace_back(&g_scope.NewScope());
-//     std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
-//                    *op2.get(), dev_scopes[i]);
-//     ths.emplace_back(std::move(th));
-//   }
-//   for (size_t i = 0; i < gpu_list.size(); ++i) {
-//     ths[i].join();
-//   }
-//   // check results
-//   float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
-//   for (size_t i = 0; i < dev_scopes.size(); ++i) {
-//     p::CPUPlace cpu_place;
-//     p::GPUPlace gpu_place(gpu_list[i]);
-//     auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
-//     auto *rt = recv_tensor.data<float>();
-//     auto *result_tensor =
-//     dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
-//     result_tensor->Resize(kDims);
-//     auto *ct = result_tensor->mutable_data<float>(cpu_place);
-//     paddle::memory::Copy(
-//         cpu_place, ct, p::GPUPlace(gpu_list[i]), rt,
-//         recv_tensor.numel() * sizeof(float),
-//         static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
-//     for (size_t j = 0; j < f::product(kDims); ++j) {
-//       ASSERT_NEAR(ct[j], result, 1e-5);
-//     }
-//   }
-// }
+TEST_F(NCCLTester, ncclAllReduceOp) {
+  std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
+  op2->SetType("ncclAllReduce");
+  op2->SetInput("X", {"st"});
+  op2->SetInput("Communicator", {"comm"});
+  op2->SetOutput("Out", {"rt"});
+  std::vector<f::Scope *> dev_scopes;
+  std::vector<std::thread> ths;
+  for (size_t i = 0; i < gpu_list.size(); ++i) {
+    dev_scopes.emplace_back(&g_scope.NewScope());
+    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+                   *op2.get(), dev_scopes[i]);
+    ths.emplace_back(std::move(th));
+  }
+  for (size_t i = 0; i < gpu_list.size(); ++i) {
+    ths[i].join();
+  }
+  // check results
+  float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
+  for (size_t i = 0; i < dev_scopes.size(); ++i) {
+    p::CPUPlace cpu_place;
+    p::GPUPlace gpu_place(gpu_list[i]);
+    auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
+    auto *rt = recv_tensor.data<float>();
+    auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
+    result_tensor->Resize(kDims);
+    auto *ct = result_tensor->mutable_data<float>(cpu_place);
+    paddle::memory::Copy(
+        cpu_place, ct, p::GPUPlace(gpu_list[i]), rt,
+        recv_tensor.numel() * sizeof(float),
+        static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
+    for (size_t j = 0; j < f::product(kDims); ++j) {
+      ASSERT_NEAR(ct[j], result, 1e-5);
+    }
+  }
+}

 // ncclAReduceOp with desc
 TEST_F(NCCLTester, ncclReduceOp) {
...@@ -242,7 +240,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
 // // ncclBcastOp with desc
 TEST_F(NCCLTester, ncclBcastOp) {
   std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
-  const int kRoot = 0;
+  const int kRoot = 5;
   op2->SetType("ncclBcast");
   op2->SetInput("X", {"st"});
   op2->SetInput("Communicator", {"comm"});
...@@ -266,7 +264,7 @@ TEST_F(NCCLTester, ncclBcastOp) {
   const int idx = 1;
   // check results on
-  float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
+  float result = kRoot;
   p::CPUPlace cpu_place;
   p::GPUPlace gpu_place(gpu_list[idx]);
...@@ -292,14 +290,14 @@ TEST_F(NCCLTester, MultipleOp) {
   const int kRoot = 0;
   std::unique_ptr<f::OpDescBind> op1(new f::OpDescBind);
   op1->SetType("ncclReduce");
-  op1->SetInput("X", {"rt"});
+  op1->SetInput("X", {"st"});
   op1->SetInput("Communicator", {"comm"});
   op1->SetOutput("Out", {"rt"});
-  op2->SetAttr("root", {kRoot});
+  op1->SetAttr("root", {kRoot});

   std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
   op2->SetType("ncclBcast");
-  op2->SetInput("X", {"st"});
+  op2->SetInput("X", {"rt"});
   op2->SetInput("Communicator", {"comm"});
   op2->SetOutput("Out", {"rt"});
   op2->SetAttr("root", {kRoot});
...
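For reference, a small standalone sketch of the expected values the touched checks encode, assuming (as the checks imply) that each device's input tensor "st" is filled with that device's id; gpu_list below is a hypothetical four-GPU example, not the list the test builds at runtime:

#include <cassert>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> gpu_list = {0, 1, 2, 3};  // hypothetical device ids

  // ncclAllReduceOp: every device ends up with the element-wise sum of all
  // inputs, so the test compares against accumulate(gpu_list) = 0+1+2+3 = 6.
  float allreduce_expected =
      std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
  assert(allreduce_expected == 6);

  // ncclBcastOp: every device ends up with the root's value, so the test
  // compares against kRoot (the commit sets it to 5; 0 fits this 4-GPU sketch).
  const int kRoot = 0;
  float bcast_expected = kRoot;
  assert(bcast_expected == 0);
  return 0;
}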