From 94992a990b2716d19427b4758060a5196baf1c56 Mon Sep 17 00:00:00 2001
From: Dong Zhihong <dzhwinter@gmail.com>
Date: Wed, 25 Oct 2017 12:55:14 -0700
Subject: [PATCH] "add multiop testcase"

---
 paddle/operators/nccl_op.cc      |  4 ++
 paddle/operators/nccl_op_test.cu | 84 ++++++++++++++++++++++++++++++--
 2 files changed, 85 insertions(+), 3 deletions(-)
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc
index ec7a89d5f..5b6c9bec7 100644
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -93,6 +93,10 @@ class NCCLReduceOp : public framework::OperatorWithKernel {
         " Input(Communicator) of Reduce op input should not be NULL");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    " Input(X) of Reduce op input should not be NULL");
+    // Give Out the same dims and LoD as X.
+    auto x_dims = ctx->GetInputsDim("X");
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu
index 8c54a3dcb..0eda0c6b5 100644
--- a/paddle/operators/nccl_op_test.cu
+++ b/paddle/operators/nccl_op_test.cu
@@ -16,6 +16,7 @@
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <algorithm>
 #include <memory>
 #include <mutex>
 #include <thread>
@@ -150,16 +151,41 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
   op2->SetInput("Communicator", {"comm"});
   op2->SetOutput("Out", {"rt"});
 
+  std::vector<f::Scope *> dev_scopes;
+
   std::vector<std::thread> ths;
+
   for (size_t i = 0; i < gpu_list.size(); ++i) {
+    dev_scopes.emplace_back(&g_scope.NewScope());
     std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
-                   *op2.get(), &g_scope.NewScope());
+                   *op2.get(), dev_scopes[i]);
     ths.emplace_back(std::move(th));
   }
 
   for (size_t i = 0; i < gpu_list.size(); ++i) {
     ths[i].join();
   }
+
+  // Check results: each element of every device's "rt" must equal `result`.
+  // std::accumulate takes its init by value and returns the sum, so the
+  // return value has to be stored; discarding it left `result` at 0.
+  const float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0.0f);
+  for (size_t i = 0; i < dev_scopes.size(); ++i) {
+    auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
+    auto *rt = recv_tensor.data<float>();
+    // Stage the device output in a host tensor "ct" so it can be inspected.
+    p::CPUPlace cpu_place;
+    auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
+    result_tensor->Resize(kDims);
+    auto *ct = result_tensor->mutable_data<float>(cpu_place);
+    // NOTE(review): confirm this stream copy is complete before reading ct.
+    paddle::memory::Copy(
+        cpu_place, ct, p::GPUPlace(gpu_list[i]), rt,
+        recv_tensor.numel() * sizeof(float),
+        static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
+    for (size_t j = 0; j < f::product(kDims); ++j) {
+      ASSERT_NEAR(ct[j], result, 1e-5);
+    }
+  }
 }
 
 // ncclReduceOp with desc
@@ -170,24 +196,76 @@ TEST(NCCL, ncclReduceOp) {
   op2->SetInput("Communicator", {"comm"});
   op2->SetOutput("Out", {"rt"});
 
+  std::vector<f::Scope *> dev_scopes;
+
   std::vector<std::thread> ths;
   for (size_t i = 0; i < gpu_list.size(); ++i) {
+    dev_scopes.emplace_back(&g_scope.NewScope());
     std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
-                   *op2.get(), &g_scope.NewScope());
+                   *op2.get(), dev_scopes[i]);
     ths.emplace_back(std::move(th));
   }
 
   for (size_t i = 0; i < gpu_list.size(); ++i) {
     ths[i].join();
   }
+
+  // Check results: each element of every device's "rt" must equal `result`.
+  // std::accumulate takes its init by value and returns the sum, so the
+  // return value has to be stored; discarding it left `result` at 0.
+  const float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0.0f);
+  for (size_t i = 0; i < dev_scopes.size(); ++i) {
+    auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
+    auto *rt = recv_tensor.data<float>();
+    // Stage the device output in a host tensor "ct" so it can be inspected.
+    p::CPUPlace cpu_place;
+    auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
+    result_tensor->Resize(kDims);
+    auto *ct = result_tensor->mutable_data<float>(cpu_place);
+    // NOTE(review): confirm this stream copy is complete before reading ct.
+    paddle::memory::Copy(
+        cpu_place, ct, p::GPUPlace(gpu_list[i]), rt,
+        recv_tensor.numel() * sizeof(float),
+        static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
+    for (size_t j = 0; j < f::product(kDims); ++j) {
+      ASSERT_NEAR(ct[j], result, 1e-5);
+    }
+  }
 }
 
 // ncclBcastOp with desc
-// TEST(NCCL, ncclBcastOp) {
+TEST_F(NCCLTester, ncclBcastOp) {  // TEST_F: `this` must be an NCCLTester
+  std::unique_ptr<f::OpDescBind> op1(new f::OpDescBind);
+  op1->SetType("ncclBcastSend");
+  op1->SetInput("X", {"st"});
+  op1->SetInput("Communicator", {"comm"});
+  // NOTE(review): op1 (send side) is configured but never run in this test.
+  std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
+  op2->SetType("ncclBcastRecv");
+  op2->SetInput("Communicator", {"comm"});
+  op2->SetOutput("Out", {"rt"});
+  // Threads start from index 1, so ths is shorter than gpu_list.
+  std::vector<std::thread> ths;
+  for (size_t i = 1; i < gpu_list.size(); ++i) {
+    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+                   *op2.get(), &g_scope.NewScope());
+    ths.emplace_back(std::move(th));
+  }
+  // Join only the threads started above (gpu_list.size() would overrun).
+  for (auto &th : ths) {
+    th.join();
+  }
+}
+
+// joint ncclBcastOp and ncclReduceOp
+// TEST(NCCL, MultipleOp) {
 //   std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
 //   op2->SetType("ncclBcastSend");
 //   op2->SetInput("X", {"st"});
 //   op2->SetInput("Communicator", {"comm"});
+
+//   std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
+//   op2->SetType("ncclBcastRecv");
+//   op2->SetInput("Communicator", {"comm"});
 //   op2->SetOutput("Out", {"rt"});
 
 //   std::vector<std::thread> ths;
-- 
GitLab