Commit 4e165f4e authored by Dong Zhihong

"fix create output variable bug"

Parent 61c1b046
@@ -114,6 +114,9 @@ class NCCLBcastOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    " Output(Out) of Bcast op output should not be NULL");
+    int root = ctx->Attrs().Get<int>("root");
+    PADDLE_ENFORCE(root != -1, "Bcast root must be set.");
+
     auto x_dims = ctx->GetInputsDim("X");
     ctx->SetOutputsDim("Out", x_dims);
     ctx->ShareLoD("X", /*->*/ "Out");
...
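A note on the new check: InferShape runs when the program is built, so an unset "root" (which defaults to -1 here) is rejected before any GPU work starts, instead of failing inside ncclBcast. A minimal standalone sketch of the same fail-fast idea, independent of Paddle (CheckBcastRoot is a hypothetical helper, not part of the codebase):

    #include <stdexcept>

    // Hypothetical stand-in for PADDLE_ENFORCE(root != -1, ...) above.
    static void CheckBcastRoot(int root) {
      if (root == -1) throw std::invalid_argument("Bcast root must be set.");
    }

    int main() {
      CheckBcastRoot(0);      // passes: root was explicitly configured
      // CheckBcastRoot(-1);  // would throw, mirroring the enforce above
      return 0;
    }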
@@ -54,12 +54,12 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
                       ctx.device_context())
                       .stream();
     // device id
-    int device_id =
-        boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
-    int idx = comm->GetCommId(device_id);
+    int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+    int idx = comm->GetCommId(gpu_id);
 
     for (size_t i = 0; i < ins.size(); ++i) {
-      VLOG(1) << " invoke allreduce. send " << ins[i]->numel() << " recv "
+      VLOG(1) << "gpu : "
+              << " invoke allreduce. send " << ins[i]->numel() << " recv "
               << outs[i]->numel();
 
       PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
@@ -68,7 +68,8 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
           comm->comms_[idx], stream));
       PADDLE_ENFORCE(cudaStreamSynchronize(stream));
 
-      VLOG(1) << " finished allreduce. send " << ins[i]->numel() << " recv "
+      VLOG(1) << "gpu : "
+              << " finished allreduce. send " << ins[i]->numel() << " recv "
               << outs[i]->numel();
     }
   }
@@ -91,9 +92,8 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
                       ctx.device_context())
                       .stream();
     // device id
-    int device_id =
-        boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
-    int idx = comm->GetCommId(device_id);
+    int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+    int idx = comm->GetCommId(gpu_id);
 
     auto ins_names = ctx.Inputs("X");
     std::hash<std::string> hasher;
@@ -102,20 +102,20 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
         root = hasher(ins_names[i]) % comm->comms_.size();
       }
       T* recvbuffer = nullptr;
-      if (root == device_id) {
+      if (root == gpu_id) {
         recvbuffer = outs[i]->mutable_data<T>(ctx.GetPlace());
       }
-      VLOG(1) << " invoke reduce. send " << ins[i]->numel() << " recv "
-              << outs[i]->numel();
+      VLOG(1) << "gpu : " << gpu_id << " invoke reduce. send "
+              << ins[i]->numel() << " recv " << outs[i]->numel();
       PADDLE_ENFORCE(platform::dynload::ncclReduce(
           ins[i]->data<T>(), recvbuffer, ins[i]->numel(),
           NCCLTypeWrapper<T>::type, ncclSum, root, comm->comms_[idx], stream));
       PADDLE_ENFORCE(cudaStreamSynchronize(stream));
-      VLOG(1) << " finished reduce. send " << ins[i]->numel() << " recv "
-              << outs[i]->numel();
+      VLOG(1) << "gpu : " << gpu_id << " finished reduce. send "
+              << ins[i]->numel() << " recv " << outs[i]->numel();
     }
   }
 };
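The reduce kernel keeps its hash-derived root: when no root is given, every rank hashes the same input variable name and takes it modulo the communicator size, so all ranks agree on one root with no extra communication. A small standalone sketch of that selection (num_gpus and the "st" name are illustrative assumptions):

    #include <functional>
    #include <iostream>
    #include <string>

    int main() {
      const size_t num_gpus = 4;  // assumed comm->comms_.size()
      std::hash<std::string> hasher;
      // Same computation as root = hasher(ins_names[i]) % comm->comms_.size();
      // deterministic within a process, so all ranks pick the same root.
      const int root = static_cast<int>(hasher(std::string("st")) % num_gpus);
      std::cout << "every rank picks root " << root << "\n";
      return 0;
    }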
@@ -135,33 +135,37 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
                       ctx.device_context())
                       .stream();
     // device id
-    int device_id =
-        boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
-    int idx = comm->GetCommId(device_id);
+    int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+    int idx = comm->GetCommId(gpu_id);
 
     if (idx == root) {
       auto ins = ctx.MultiInput<LoDTensor>("X");
       for (size_t i = 0; i < ins.size(); ++i) {
-        VLOG(1) << " invoke Bcast. send " << ins[i]->numel();
+        VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. send "
+                << ins[i]->numel();
+
+        VLOG(1) << " before ncclBcast";
         PADDLE_ENFORCE(platform::dynload::ncclBcast(
             (void*)ins[i]->data<T>(), ins[i]->numel(), NCCLTypeWrapper<T>::type,
             root, comm->comms_[idx], stream));
+        VLOG(1) << " after ncclBcast";
         PADDLE_ENFORCE(cudaStreamSynchronize(stream));
 
-        VLOG(1) << " finished Bcast.";
+        VLOG(1) << "gpu : " << gpu_id << " finished Bcast.";
       }
     } else {
       auto outs = ctx.MultiOutput<LoDTensor>("Out");
       for (size_t i = 0; i < outs.size(); ++i) {
-        VLOG(1) << " invoke Bcast. recv. ";
+        VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
+                << framework::product(outs[i]->dims());
 
         PADDLE_ENFORCE(platform::dynload::ncclBcast(
             outs[i]->mutable_data<T>(ctx.GetPlace()), outs[i]->numel(),
             NCCLTypeWrapper<T>::type, root, comm->comms_[idx], stream));
         PADDLE_ENFORCE(cudaStreamSynchronize(stream));
 
-        VLOG(1) << " finished Bcast. recv " << outs[i]->numel();
+        VLOG(1) << "gpu : " << gpu_id << " finished Bcast. recv "
+                << outs[i]->numel();
       }
     }
   }
...
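For reference, ncclBcast is symmetric in its buffer argument: the root rank passes its input data, every other rank passes a writable buffer of the same element count, exactly as the two branches above do. A toy sketch with the NCCL call stubbed out (FakeBcast is hypothetical; the real kernel calls platform::dynload::ncclBcast on a CUDA stream):

    #include <cstdio>
    #include <vector>

    // Hypothetical stub standing in for ncclBcast in this sketch.
    static void FakeBcast(float* buf, std::size_t n, int root, int rank) {
      std::printf("rank %d %s %zu floats (root=%d)\n", rank,
                  rank == root ? "sends" : "receives", n, root);
      (void)buf;
    }

    int main() {
      const int root = 0, nranks = 2;
      std::vector<float> in(4, 1.0f);  // the root's input tensor
      for (int rank = 0; rank < nranks; ++rank) {
        if (rank == root) {
          FakeBcast(in.data(), in.size(), root, rank);    // send branch
        } else {
          std::vector<float> out(in.size());              // recv buffer
          FakeBcast(out.data(), out.size(), root, rank);  // recv branch
        }
      }
      return 0;
    }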
@@ -87,30 +87,34 @@ class NCCLTester : public ::testing::Test {
   void PerThreadProgram(int gpu_id, const f::OpDescBind &op_desc,
                         f::Scope *scope) {
     std::unique_lock<std::mutex> lk(mu);
-    f::ProgramDescBind program;
-    f::BlockDescBind *block = program.Block(0);
-    f::OpDescBind *op1 = block->AppendOp();
-    *op1 = op_desc;
+    const f::OpDescBind *op1 = &op_desc;
 
     p::GPUPlace place(gpu_id);
     auto &ctx = dev_ctxs.at(gpu_id);
 
     auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
     auto *recv_tensor = scope->Var("rt")->GetMutable<f::LoDTensor>();
-    send_tensor->Resize(kDims);
-    send_tensor->mutable_data<T>(kDims, place);
 
-    std::vector<T> send_vector(f::product(kDims), gpu_id);
-    send_tensor->CopyFromVector<T>(send_vector, *ctx);
+    if (!send_tensor->numel()) {
+      send_tensor->Resize(kDims);
+      send_tensor->mutable_data<T>(kDims, place);
+
+      std::vector<T> send_vector(f::product(kDims), gpu_id);
+      send_tensor->CopyFromVector<T>(send_vector, *ctx);
+      ctx->Wait();
+      VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
+    }
+
     lk.unlock();
+
     PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims),
                    "Tensor numel not match!");
-    ctx->Wait();
-
-    VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
 
     auto op = f::OpRegistry::CreateOp(*op1);
     VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
+    VLOG(1) << " send_tensor : " << send_tensor->numel()
+            << " recv_tensor : " << recv_tensor->numel();
     op->Run(*scope, *ctx);
     VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
   }
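The if (!send_tensor->numel()) guard is the behavioral change here: the send tensor is filled only on first use, so when the same per-device scope runs a second op (as the MultipleOp test below does) the already-initialized buffer is reused instead of being re-created. A standalone sketch of the pattern, with a std::vector standing in for the scope-owned LoDTensor:

    #include <iostream>
    #include <vector>

    int main() {
      std::vector<float> send_tensor;  // stands in for scope->Var("st")
      const std::size_t kNumel = 6;    // stands in for f::product(kDims)
      for (int run = 0; run < 2; ++run) {
        if (send_tensor.empty()) {     // mirrors if (!send_tensor->numel())
          send_tensor.assign(kNumel, 1.0f);
          std::cout << "run " << run << ": filled send tensor\n";
        } else {
          std::cout << "run " << run << ": reusing send tensor\n";
        }
      }
      return 0;
    }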
@@ -122,168 +126,171 @@ class NCCLTester : public ::testing::Test {
   std::mutex mu;
 };
 
-// ncclInitOp with desc
-TEST(NCCL, ncclInitOp) {
-  std::unique_ptr<f::OpDescBind> op_desc(new f::OpDescBind);
-  op_desc->SetType("ncclInit");
-  op_desc->SetOutput("Communicator", {"x1"});
-  op_desc->SetAttr("gpus", {gpu_list});
-  f::Scope g_scope;
-  std::unique_ptr<p::DeviceContext> ctx(new p::CPUDeviceContext(p::CPUPlace()));
-
-  auto *var = g_scope.Var("x1");
-  var->GetMutable<p::Communicator>();
-
-  auto op = f::OpRegistry::CreateOp(*op_desc);
-  VLOG(1) << "invoke NCCLInitOp.";
-  op->Run(g_scope, *ctx.get());
-  VLOG(1) << "NCCLInitOp finished.";
-}
-
-// ncclAllReduceOp with desc
-TEST_F(NCCLTester, ncclAllReduceOp) {
-  std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
-  op2->SetType("ncclAllReduce");
-  op2->SetInput("X", {"st"});
-  op2->SetInput("Communicator", {"comm"});
-  op2->SetOutput("Out", {"rt"});
-
-  std::vector<f::Scope *> dev_scopes;
-
-  std::vector<std::thread> ths;
-
-  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    dev_scopes.emplace_back(&g_scope.NewScope());
-    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
-                   *op2.get(), dev_scopes[i]);
-    ths.emplace_back(std::move(th));
-  }
-
-  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    ths[i].join();
-  }
-
-  // check results
-  float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
-
-  for (size_t i = 0; i < dev_scopes.size(); ++i) {
-    p::CPUPlace cpu_place;
-    p::GPUPlace gpu_place(gpu_list[i]);
-
-    auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
-    auto *rt = recv_tensor.data<float>();
-    auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
-    result_tensor->Resize(kDims);
-    auto *ct = result_tensor->mutable_data<float>(cpu_place);
-
-    paddle::memory::Copy(
-        cpu_place, ct, p::GPUPlace(gpu_list[i]), rt,
-        recv_tensor.numel() * sizeof(float),
-        static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
-
-    for (size_t j = 0; j < f::product(kDims); ++j) {
-      ASSERT_NEAR(ct[j], result, 1e-5);
-    }
-  }
-}
-
-// ncclAReduceOp with desc
-TEST_F(NCCLTester, ncclReduceOp) {
-  std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
-  const int kRoot = 0;
-  op2->SetType("ncclReduce");
-  op2->SetInput("X", {"st"});
-  op2->SetInput("Communicator", {"comm"});
-  op2->SetOutput("Out", {"rt"});
-  op2->SetAttr("root", {kRoot});
-
-  std::vector<f::Scope *> dev_scopes;
-  std::vector<std::thread> ths;
-
-  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    dev_scopes.emplace_back(&g_scope.NewScope());
-    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
-                   *op2.get(), dev_scopes[i]);
-    ths.emplace_back(std::move(th));
-  }
-
-  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    ths[i].join();
-  }
-
-  // check results on
-  float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
-
-  p::CPUPlace cpu_place;
-  p::GPUPlace gpu_place(gpu_list[kRoot]);
-
-  auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
-  auto *rt = recv_tensor.data<float>();
-  auto *result_tensor =
-      dev_scopes[kRoot]->Var("ct")->GetMutable<f::LoDTensor>();
-  result_tensor->Resize(kDims);
-  auto *ct = result_tensor->mutable_data<float>(cpu_place);
-
-  paddle::memory::Copy(
-      cpu_place, ct, p::GPUPlace(gpu_list[kRoot]), rt,
-      recv_tensor.numel() * sizeof(float),
-      static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());
-
-  for (int j = 0; j < f::product(kDims); ++j) {
-    ASSERT_NEAR(ct[j], result, 1e-5);
-  }
-}
-
-// // ncclBcastOp with desc
-TEST_F(NCCLTester, ncclBcastOp) {
-  std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
-  const int kRoot = 5;
-  op2->SetType("ncclBcast");
-  op2->SetInput("X", {"st"});
-  op2->SetInput("Communicator", {"comm"});
-  op2->SetOutput("Out", {"rt"});
-  op2->SetAttr("root", {kRoot});
-
-  std::vector<f::Scope *> dev_scopes;
-  std::vector<std::thread> ths;
-
-  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    dev_scopes.emplace_back(&g_scope.NewScope());
-    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
-                   *op2.get(), dev_scopes[i]);
-    ths.emplace_back(std::move(th));
-  }
-
-  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    ths[i].join();
-  }
-
-  const int idx = 1;
-  // check results on
-  float result = kRoot;
-
-  p::CPUPlace cpu_place;
-  p::GPUPlace gpu_place(gpu_list[idx]);
-
-  auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
-  auto *rt = recv_tensor.data<float>();
-  auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
-  result_tensor->Resize(kDims);
-  auto *ct = result_tensor->mutable_data<float>(cpu_place);
-
-  paddle::memory::Copy(
-      cpu_place, ct, p::GPUPlace(gpu_list[idx]), rt,
-      recv_tensor.numel() * sizeof(float),
-      static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());
-
-  for (size_t j = 0; j < f::product(kDims); ++j) {
-    ASSERT_NEAR(ct[j], result, 1e-5);
-  }
-}
+// // ncclInitOp with desc
+// TEST(NCCL, ncclInitOp) {
+//   std::unique_ptr<f::OpDescBind> op_desc(new f::OpDescBind);
+//   op_desc->SetType("ncclInit");
+//   op_desc->SetOutput("Communicator", {"x1"});
+//   op_desc->SetAttr("gpus", {gpu_list});
+//   f::Scope g_scope;
+//   std::unique_ptr<p::DeviceContext> ctx(new
+//   p::CPUDeviceContext(p::CPUPlace()));
+
+//   auto *var = g_scope.Var("x1");
+//   var->GetMutable<p::Communicator>();
+
+//   auto op = f::OpRegistry::CreateOp(*op_desc);
+//   VLOG(1) << "invoke NCCLInitOp.";
+//   op->Run(g_scope, *ctx.get());
+//   VLOG(1) << "NCCLInitOp finished.";
+// }
+
+// // ncclAllReduceOp with desc
+// TEST_F(NCCLTester, ncclAllReduceOp) {
+//   std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
+//   op2->SetType("ncclAllReduce");
+//   op2->SetInput("X", {"st"});
+//   op2->SetInput("Communicator", {"comm"});
+//   op2->SetOutput("Out", {"rt"});

+//   std::vector<f::Scope *> dev_scopes;

+//   std::vector<std::thread> ths;

+//   for (size_t i = 0; i < gpu_list.size(); ++i) {
+//     dev_scopes.emplace_back(&g_scope.NewScope());
+//     std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+//                    *op2.get(), dev_scopes[i]);
+//     ths.emplace_back(std::move(th));
+//   }

+//   for (size_t i = 0; i < gpu_list.size(); ++i) {
+//     ths[i].join();
+//   }

+//   // check results
+//   float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);

+//   for (size_t i = 0; i < dev_scopes.size(); ++i) {
+//     p::CPUPlace cpu_place;
+//     p::GPUPlace gpu_place(gpu_list[i]);

+//     auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
+//     auto *rt = recv_tensor.data<float>();
+//     auto *result_tensor =
+//     dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
+//     result_tensor->Resize(kDims);
+//     auto *ct = result_tensor->mutable_data<float>(cpu_place);

+//     paddle::memory::Copy(
+//         cpu_place, ct, p::GPUPlace(gpu_list[i]), rt,
+//         recv_tensor.numel() * sizeof(float),
+//         static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());

+//     for (size_t j = 0; j < f::product(kDims); ++j) {
+//       ASSERT_NEAR(ct[j], result, 1e-5);
+//     }
+//   }
+// }

+// // ncclAReduceOp with desc
+// TEST_F(NCCLTester, ncclReduceOp) {
+//   std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
+//   const int kRoot = 0;
+//   op2->SetType("ncclReduce");
+//   op2->SetInput("X", {"st"});
+//   op2->SetInput("Communicator", {"comm"});
+//   op2->SetOutput("Out", {"rt"});
+//   op2->SetAttr("root", {kRoot});

+//   std::vector<f::Scope *> dev_scopes;
+//   std::vector<std::thread> ths;

+//   for (size_t i = 0; i < gpu_list.size(); ++i) {
+//     dev_scopes.emplace_back(&g_scope.NewScope());
+//     std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+//                    *op2.get(), dev_scopes[i]);
+//     ths.emplace_back(std::move(th));
+//   }

+//   for (size_t i = 0; i < gpu_list.size(); ++i) {
+//     ths[i].join();
+//   }

+//   // check results on
+//   float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);

+//   p::CPUPlace cpu_place;
+//   p::GPUPlace gpu_place(gpu_list[kRoot]);

+//   auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
+//   auto *rt = recv_tensor.data<float>();
+//   auto *result_tensor =
+//       dev_scopes[kRoot]->Var("ct")->GetMutable<f::LoDTensor>();
+//   result_tensor->Resize(kDims);
+//   auto *ct = result_tensor->mutable_data<float>(cpu_place);

+//   paddle::memory::Copy(
+//       cpu_place, ct, p::GPUPlace(gpu_list[kRoot]), rt,
+//       recv_tensor.numel() * sizeof(float),
+//       static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());

+//   for (int j = 0; j < f::product(kDims); ++j) {
+//     ASSERT_NEAR(ct[j], result, 1e-5);
+//   }
+// }

+// // // ncclBcastOp with desc
+// TEST_F(NCCLTester, ncclBcastOp) {
+//   std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
+//   const int kRoot = 5;
+//   op2->SetType("ncclBcast");
+//   op2->SetInput("X", {"st"});
+//   op2->SetInput("Communicator", {"comm"});
+//   op2->SetOutput("Out", {"rt"});
+//   op2->SetAttr("root", {kRoot});

+//   std::vector<f::Scope *> dev_scopes;
+//   std::vector<std::thread> ths;

+//   for (size_t i = 0; i < gpu_list.size(); ++i) {
+//     dev_scopes.emplace_back(&g_scope.NewScope());
+//     std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+//                    *op2.get(), dev_scopes[i]);
+//     ths.emplace_back(std::move(th));
+//   }

+//   for (size_t i = 0; i < gpu_list.size(); ++i) {
+//     ths[i].join();
+//   }

+//   const int idx = 1;
+//   // check results on
+//   float result = kRoot;

+//   p::CPUPlace cpu_place;
+//   p::GPUPlace gpu_place(gpu_list[idx]);

+//   auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
+//   auto *rt = recv_tensor.data<float>();
+//   auto *result_tensor =
+//       dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
+//   result_tensor->Resize(kDims);
+//   auto *ct = result_tensor->mutable_data<float>(cpu_place);

+//   paddle::memory::Copy(
+//       cpu_place, ct, p::GPUPlace(gpu_list[idx]), rt,
+//       recv_tensor.numel() * sizeof(float),
+//       static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());

+//   for (size_t j = 0; j < f::product(kDims); ++j) {
+//     ASSERT_NEAR(ct[j], result, 1e-5);
+//   }
+// }
 
 // joint ncclBcastOp and ncclReduceOp
 TEST_F(NCCLTester, MultipleOp) {
@@ -299,14 +306,17 @@ TEST_F(NCCLTester, MultipleOp) {
   op2->SetType("ncclBcast");
   op2->SetInput("X", {"rt"});
   op2->SetInput("Communicator", {"comm"});
-  op2->SetOutput("Out", {"rt"});
+  op2->SetOutput("Out", {"out"});
   op2->SetAttr("root", {kRoot});
 
   std::vector<f::Scope *> dev_scopes;
+  // for (size_t i = 0; i < dev_scopes.size(); ++i) {
+  //   dev_scopes[i]->Var("out")->GetMutable<f::LoDTensor>();
+  // }
   std::vector<std::thread> ths;
-  // run Bcast
+  // run Reduce
   for (size_t i = 0; i < gpu_list.size(); ++i) {
     dev_scopes.emplace_back(&g_scope.NewScope());
     std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
@@ -320,9 +330,9 @@ TEST_F(NCCLTester, MultipleOp) {
   ths.clear();
 
-  // run Reduce
+  // run Bcast
   for (size_t i = 0; i < gpu_list.size(); ++i) {
-    dev_scopes.emplace_back(&g_scope.NewScope());
+    dev_scopes[i]->Var("out")->GetMutable<f::LoDTensor>();
     std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
                    *op2.get(), dev_scopes[i]);
     ths.emplace_back(std::move(th));
...
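This last hunk is the bug named in the commit message: the Bcast pass now writes to a distinct "out" variable, and that variable is created in each already-existing per-device scope before the Bcast threads launch (previously the loop appended brand-new scopes, so the op ran in scopes that lacked its output). A toy sketch of the invariant, using a std::map as a stand-in for f::Scope:

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
      // Two fake per-device scopes left over from the reduce pass.
      std::vector<std::map<std::string, std::vector<float>>> dev_scopes(2);
      for (auto& scope : dev_scopes) {
        // Mirrors dev_scopes[i]->Var("out")->GetMutable<f::LoDTensor>():
        // creating the output variable before the op runs.
        scope["out"];
      }
      for (std::size_t i = 0; i < dev_scopes.size(); ++i) {
        std::cout << "scope " << i << " has out: "
                  << dev_scopes[i].count("out") << "\n";
      }
      return 0;
    }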