Commit 11cf3e3a authored by Dong Zhihong

"refactorization of nccl test case"

Parent 6d1493a4
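The diff below folds the per-test NCCL initialization into a shared googletest fixture (NCCLTester), so each remaining test body only describes its own op. As a rough, self-contained illustration of that fixture pattern, with hypothetical names (DeviceListTest, devices_) that are not part of this commit:

#include <gtest/gtest.h>

#include <vector>

// Shared state lives in the fixture: SetUp() runs before every TEST_F body
// and TearDown() runs after it, so per-test boilerplate disappears.
class DeviceListTest : public ::testing::Test {
 protected:
  void SetUp() override { devices_ = {0, 1}; }  // hypothetical device ids
  void TearDown() override { devices_.clear(); }

  std::vector<int> devices_;
};

// Each TEST_F gets a freshly constructed, freshly set-up fixture instance.
TEST_F(DeviceListTest, HasAtLeastOneDevice) { EXPECT_FALSE(devices_.empty()); }

TEST_F constructs a fresh fixture per test, which is why the commit can drop the repeated ncclInit boilerplate from each TEST body.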
@@ -43,33 +43,48 @@ namespace f = paddle::framework;
namespace p = paddle::platform;
static std::vector<int> gpu_list;
-static std::vector<std::unique_ptr<p::DeviceContext>> dev_ctxs;
-std::mutex mu;
// test data amount
const f::DDim kDims = {100, 100};
-// ncclInitOp with desc
-TEST(NCCL, ncclInitOp) {
-  std::unique_ptr<f::OpDescBind> op_desc(new f::OpDescBind);
-  op_desc->SetType("ncclInit");
-  op_desc->SetOutput("Communicator", {"x1"});
-  op_desc->SetAttr("gpus", {gpu_list});
-  f::Scope g_scope;
-  p::DeviceContext *ctx = new p::CPUDeviceContext(p::CPUPlace());
-  auto *var = g_scope.Var("x1");
+// nccl op common tester, init communicator.
+class NCCLTester : public ::testing::Test {
+ public:
+  virtual void SetUp() override {
+    cpu_ctx = new p::CPUDeviceContext(p::CPUPlace());
+    for (size_t i = 0; i < gpu_list.size(); ++i) {
+      p::GPUPlace place(i);
+      dev_ctxs.emplace_back(new p::CUDADeviceContext(place));
+    }
+    NCCLInitOp();
+  }
+
+  virtual void TearDown() override {
+    for (auto &device_context : dev_ctxs) {
+      delete device_context;
+    }
+  }
+
+  void NCCLInitOp() {
+    std::unique_ptr<f::OpDescBind> op1(new f::OpDescBind);
+    op1->SetType("ncclInit");
+    op1->SetOutput("Communicator", {"comm"});
+    op1->SetAttr("gpus", {gpu_list});
+    auto *var = g_scope.Var("comm");
    var->GetMutable<p::Communicator>();
-  auto op = f::OpRegistry::CreateOp(*op_desc);
+    auto op = f::OpRegistry::CreateOp(*op1);
    VLOG(1) << "invoke NCCLInitOp.";
-  op->Run(g_scope, *ctx);
+    op->Run(g_scope, *cpu_ctx);
    VLOG(1) << "NCCLInitOp finished.";
  }
  template <class T>
-void DeviceProgram(int gpu_id, const f::OpDescBind &op_desc, f::Scope *scope) {
+  void PerThreadProgram(int gpu_id, const f::OpDescBind &op_desc,
+                        f::Scope *scope) {
    std::unique_lock<std::mutex> lk(mu);
    f::ProgramDescBind program;
    f::BlockDescBind *block = program.Block(0);
@@ -77,7 +92,7 @@ void DeviceProgram(int gpu_id, const f::OpDescBind &op_desc, f::Scope *scope) {
    *op1 = op_desc;
    p::GPUPlace place(gpu_id);
-  auto ctx = dev_ctxs.at(gpu_id);
+    auto &ctx = dev_ctxs.at(gpu_id);
    auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
    auto *recv_tensor = scope->Var("rt")->GetMutable<f::LoDTensor>();
@@ -97,27 +112,38 @@ void DeviceProgram(int gpu_id, const f::OpDescBind &op_desc, f::Scope *scope) {
    VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
    op->Run(*scope, *ctx);
    VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
  }
-// ncclAllReduceOp with desc
-TEST(NCCL, ncclAllReduceOp) {
-  std::unique_ptr<p::DeviceContext> ctx(new p::CPUDeviceContext(p::CPUPlace()));
-  std::unique_ptr<f::Scope> g_scope(new Scope);
-  std::unique_ptr<f::OpDescBind> op1(new f::OpDescBind);
-  op1->SetType("ncclInit");
-  op1->SetOutput("Communicator", {"comm"});
-  op1->SetAttr("gpus", {gpu_list});
-  auto *var = g_scope.Var("comm");
-  var->GetMutable<p::Communicator>();
-  auto op = f::OpRegistry::CreateOp(*op1);
-  VLOG(1) << "invoke NCCLInitOp.";
-  op->Run(g_scope, *ctx);
-  VLOG(1) << "NCCLInitOp finished.";
-  delete ctx;
+ public:
+  std::vector<p::DeviceContext *> dev_ctxs;
+  p::DeviceContext *cpu_ctx;
+  f::Scope g_scope;
+  std::mutex mu;
+};
+// ncclInitOp with desc
+// TEST(NCCL, ncclInitOp) {
+//   std::unique_ptr<f::OpDescBind> op_desc(new f::OpDescBind);
+//   op_desc->SetType("ncclInit");
+//   op_desc->SetOutput("Communicator", {"x1"});
+//   op_desc->SetAttr("gpus", {gpu_list});
+//   f::Scope g_scope;
+//   std::unique_ptr<p::DeviceContext> ctx(new
+//   p::CPUDeviceContext(p::CPUPlace()));
+//   auto *var = g_scope.Var("x1");
+//   var->GetMutable<p::Communicator>();
+//   auto op = f::OpRegistry::CreateOp(*op_desc);
+//   VLOG(1) << "invoke NCCLInitOp.";
+//   op->Run(g_scope, *ctx.get());
+//   VLOG(1) << "NCCLInitOp finished.";
+// }
+// ncclAllReduceOp with desc
+TEST_F(NCCLTester, ncclAllReduceOp) {
  std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
  op2->SetType("ncclAllReduce");
  op2->SetInput("X", {"st"});
@@ -126,36 +152,18 @@ TEST(NCCL, ncclAllReduceOp) {
  std::vector<std::thread> ths;
  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    std::thread th(DeviceProgram<float>, gpu_list[i], *op2,
-                   &g_scope.NewScope());
+    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+                   *op2.get(), &g_scope.NewScope());
    ths.emplace_back(std::move(th));
  }
  for (size_t i = 0; i < gpu_list.size(); ++i) {
    ths[i].join();
  }
-  g_scope->reset(nullptr);
}

// ncclReduceOp with desc
TEST(NCCL, ncclReduceOp) {
-  std::unique_ptr<p::DeviceContext> ctx(new p::CPUDeviceContext(p::CPUPlace()));
-  std::unique_ptr<f::Scope> g_scope(new Scope);
-  std::unique_ptr<f::OpDescBind> op1(new f::OpDescBind);
-  op1->SetType("ncclInit");
-  op1->SetOutput("Communicator", {"comm"});
-  op1->SetAttr("gpus", {gpu_list});
-  auto *var = g_scope.Var("comm");
-  var->GetMutable<p::Communicator>();
-  auto op = f::OpRegistry::CreateOp(*op1);
-  VLOG(1) << "invoke NCCLInitOp.";
-  op->Run(g_scope, *ctx);
-  VLOG(1) << "NCCLInitOp finished.";
-  delete ctx;
  std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
  op2->SetType("ncclReduce");
  op2->SetInput("X", {"st"});
@@ -164,53 +172,36 @@ TEST(NCCL, ncclReduceOp) {
  std::vector<std::thread> ths;
  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    std::thread th(DeviceProgram<float>, gpu_list[i], *op2,
-                   &g_scope.NewScope());
+    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+                   *op2.get(), &g_scope.NewScope());
    ths.emplace_back(std::move(th));
  }
  for (size_t i = 0; i < gpu_list.size(); ++i) {
    ths[i].join();
  }
-  g_scope->reset(nullptr);
}

// ncclBcastOp with desc
-TEST(NCCL, ncclBcastOp) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.Block(0);
-  f::OpDescBind *op1 = block->AppendOp();
-  p::DeviceContext *ctx = new p::CPUDeviceContext(p::CPUPlace());
-  op1->SetType("ncclInit");
-  op1->SetOutput("Communicator", {"comm"});
-  op1->SetAttr("gpus", {gpu_list});
-  auto *var = g_scope.Var("comm");
-  var->GetMutable<p::Communicator>();
-  auto op = f::OpRegistry::CreateOp(*op1);
-  VLOG(1) << "invoke NCCLInitOp.";
-  op->Run(g_scope, *ctx);
-  VLOG(1) << "NCCLInitOp finished.";
-  f::OpDescBind *op2 = new f::OpDescBind;
-  op2->SetType("ncclBcastSend");
-  op2->SetInput("X", {"st"});
-  op2->SetInput("Communicator", {"comm"});
-  op2->SetOutput("Out", {"rt"});
-  std::vector<std::thread> ths;
-  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    std::thread th(DeviceProgram<float>, gpu_list[i], *op2);
-    ths.emplace_back(std::move(th));
-  }
-  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    ths[i].join();
-  }
-}
+// TEST(NCCL, ncclBcastOp) {
+//   std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
+//   op2->SetType("ncclBcastSend");
+//   op2->SetInput("X", {"st"});
+//   op2->SetInput("Communicator", {"comm"});
+//   op2->SetOutput("Out", {"rt"});
+//   std::vector<std::thread> ths;
+//   for (size_t i = 0; i < gpu_list.size(); ++i) {
+//     std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+//                    *op2.get(),
+//                    &g_scope.NewScope());
+//     ths.emplace_back(std::move(th));
+//   }
+//   for (size_t i = 0; i < gpu_list.size(); ++i) {
+//     ths[i].join();
+//   }
+// }

int main(int argc, char **argv) {
  const int dev_count = p::GetCUDADeviceCount();
@@ -228,9 +219,5 @@ int main(int argc, char **argv) {
  // device context should be released before scope,
  // otherwise the driver will go down.
-  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    p::GPUPlace place(i);
-    dev_ctxs.emplace_back(new p::CUDADeviceContext(place));
-  }
  return RUN_ALL_TESTS();
}
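Both remaining tests launch one worker thread per GPU and hand a fixture member function to std::thread, which takes the member-function pointer plus the object (this) before the arguments. A small stand-alone sketch of that pattern, using hypothetical Worker/RunOnDevice names unrelated to the Paddle code:

#include <iostream>
#include <thread>
#include <vector>

// A member function is bound to std::thread by passing the member-function
// pointer first and the object (this) second, followed by its arguments.
struct Worker {
  void RunOnDevice(int device_id) {
    std::cout << "worker running on device " << device_id << "\n";
  }

  void LaunchAll(const std::vector<int>& devices) {
    std::vector<std::thread> threads;
    for (int id : devices) {
      threads.emplace_back(&Worker::RunOnDevice, this, id);
    }
    // Join every per-device thread before returning, as the tests above do
    // with ths[i].join(); otherwise shared state could be torn down while
    // device threads are still running.
    for (auto& t : threads) t.join();
  }
};

int main() {
  Worker w;
  w.LaunchAll({0, 1, 2});  // hypothetical device ids
  return 0;
}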