From 423d7438a1960b4314fff0db873197acd92ec5c3 Mon Sep 17 00:00:00 2001
From: Dong Zhihong
Date: Mon, 23 Oct 2017 14:03:17 -0700
Subject: [PATCH] "add register gpu macro"

---
 paddle/framework/op_registry.h                    |  4 +
 paddle/operators/CMakeLists.txt                   |  4 +-
 paddle/operators/nccl/CMakeLists.txt              |  2 +-
 paddle/operators/nccl_op.cc                       | 81 +++++++++++++++++--
 paddle/operators/nccl_op.cu                       | 77 ++++++++++++++++++
 .../v2/framework/tests/test_nccl_reduce_op.py     |  6 ++
 6 files changed, 165 insertions(+), 9 deletions(-)
 create mode 100644 python/paddle/v2/framework/tests/test_nccl_reduce_op.py

diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 226e8ddcd4..6ab65ef5e7 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -233,6 +233,10 @@ class OpKernelRegistrar : public Registrar {
   USE_OP_ITSELF(op_type);        \
   USE_OP_DEVICE_KERNEL(op_type, CPU);
 
+#define USE_GPU_ONLY_OP(op_type) \
+  USE_OP_ITSELF(op_type);        \
+  USE_OP_DEVICE_KERNEL(op_type, GPU)
+
 #define USE_OP(op_type)   \
   USE_OP_ITSELF(op_type); \
   USE_OP_KERNEL(op_type)
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 4faf9bbb08..0ea1037a7b 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -80,8 +80,8 @@ function(op_library TARGET)
   if ("${TARGET}" STREQUAL "nccl_op")
     set(pybind_flag 1)
     # It's enough to just adding one operator to pybind
-    file(APPEND ${pybind_file} "USE_OP(ncclInit);\n")
-    # file(APPEND ${pybind_file} "USE_OP(ncclInit);\n")
+    file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclInit);\n")
+    file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
   endif()
 
   # reduce_op contains several operators
diff --git a/paddle/operators/nccl/CMakeLists.txt b/paddle/operators/nccl/CMakeLists.txt
index bdd873b3f3..21cc1d9ee9 100644
--- a/paddle/operators/nccl/CMakeLists.txt
+++ b/paddle/operators/nccl/CMakeLists.txt
@@ -1,4 +1,4 @@
 if(WITH_GPU)
-  nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator)
+  nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
   nv_test(nccl_gpu_common_test SRCS nccl_gpu_common_test.cc DEPS nccl_common)
 endif()
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc
index 91584a377e..f0f7b205b6 100644
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -67,6 +67,54 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
   }
 };
 
+// ReduceOp
+class NCCLReduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   " Input(X) of Reduce op input should not be NULL");
+    PADDLE_ENFORCE(
+        ctx->HasInput("Communicator"),
+        " Input(Communicator) of Reduce op input should not be NULL");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   " Output(Out) of Reduce op output should not be NULL");
+
+    auto x_dims = ctx->GetInputsDim("X");
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+// BcastSendOp
+class NCCLBcastSendOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   " Input(X) of Bcast op input should not be NULL");
+    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
+                   " Input(Communicator) of Bcast op input should not be NULL");
+  }
+};
+
+// BcastRecvOp
+class NCCLBcastRecvOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
+                   " Input(Communicator) of Bcast op input should not be NULL");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   " Output(Out) of Bcast op output should not be NULL");
+  }
+};
+
 // AllreduceOp
 class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
@@ -85,15 +133,31 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+// BcastSend should be in the root
+// BcastSendOp
+class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  NCCLBcastSendOpMaker(framework::OpProto *proto,
+                       framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of BcastSend op");
+    AddInput("Communicator", "Communicator for communicating between gpus");
+    AddAttr<int>("root", "root gpu of Bcast");
+    AddComment(R"DOC(
+        Bcast the tensors.
+    )DOC");
+  }
+};
+
 // BcastOp
-class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
+class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLAllBcastOpMaker(framework::OpProto *proto,
-                      framework::OpAttrChecker *op_checker)
+  NCCLBcastRecvOpMaker(framework::OpProto *proto,
+                       framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input of Bcast op");
     AddInput("Communicator", "Communicator for communicating between gpus");
-    AddInput("root", "root gpu of Bcast");
+    AddAttr<int>("root", "root gpu of BcastRecv");
+    AddOutput("Out", "The output of Bcast");
     AddComment(R"DOC(
         Bcast the tensors.
     )DOC");
@@ -108,7 +172,6 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of Reduce op");
     AddInput("Communicator", "Communicator for communicating between gpus");
-    AddInput("root", "root gpu of Reduce");
     AddOutput("Out", "The output of Reduce op");
     AddComment(R"DOC(
         Reduce the tensors.
@@ -123,4 +186,10 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp,
                              ops::NCCLAllReduceOpMaker);
 REGISTER_OP_WITHOUT_GRADIENT(ncclInit, ops::NCCLInitOp, ops::NCCLInitOpMaker);
+REGISTER_OP_WITHOUT_GRADIENT(ncclBcastSend, ops::NCCLBcastSendOp,
+                             ops::NCCLBcastSendOpMaker);
+REGISTER_OP_WITHOUT_GRADIENT(ncclBcastRecv, ops::NCCLBcastRecvOp,
+                             ops::NCCLBcastRecvOpMaker);
+REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, ops::NCCLReduceOp,
+                             ops::NCCLReduceOpMaker);
 REGISTER_OP_CPU_KERNEL(ncclInit, ops::NCCLInitKernel<float>);
diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu
index 6b0a325d17..4d91a3055f 100644
--- a/paddle/operators/nccl_op.cu
+++ b/paddle/operators/nccl_op.cu
@@ -10,6 +10,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
+#include <functional>
+
 #include "paddle/operators/nccl_op.h"
 
 namespace paddle {
@@ -59,8 +61,83 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename T>
+class NCCLReduceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+
+    auto ins = ctx.MultiInput<Tensor>("X");
+    auto outs = ctx.MultiOutput<Tensor>("Out");
+
+    auto* comm = ctx.Input<Communicator>("Communicator");
+
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                      ctx.device_context())
+                      .stream();
+    // device id
+    int device_id =
+        boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+    int idx = comm->GetCommId(device_id);
+
+    auto ins_names = ctx.Inputs("X");
+    for (size_t i = 0; i < ins.size(); ++i) {
+      // hash the input's name so that every device agrees on the same root
+      int root = std::hash<std::string>()(ins_names[i]) % comm->comms_.size();
+      T* recvbuffer = nullptr;
+      if (root == device_id) {
+        recvbuffer = outs[i]->mutable_data<T>(ctx.GetPlace());
+      }
+      PADDLE_ENFORCE(ncclReduce(ins[i]->data<T>(), recvbuffer, ins[i]->numel(),
+                                NCCLTypeWrapper<T>::type, ncclSum, root,
+                                comm->comms_[idx], stream));
+      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+    }
+  }
+};
+
+template <typename T>
+class NCCLBcastKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+
+    int root = ctx.Attr<int>("root");
+
+    auto* comm = ctx.Input<Communicator>("Communicator");
+
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                      ctx.device_context())
+                      .stream();
+    // device id
+    int device_id =
+        boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+    int idx = comm->GetCommId(device_id);
+    if (idx == root) {
+      auto ins = ctx.MultiInput<Tensor>("X");
+      for (size_t i = 0; i < ins.size(); ++i) {
+        PADDLE_ENFORCE(ncclBcast((void*)ins[i]->data<T>(), ins[i]->numel(),
+                                 NCCLTypeWrapper<T>::type, root,
+                                 comm->comms_[idx], stream));
+        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+      }
+    } else {
+      auto outs = ctx.MultiOutput<Tensor>("Out");
+      for (size_t i = 0; i < outs.size(); ++i) {
+        PADDLE_ENFORCE(ncclBcast((void*)outs[i]->mutable_data<T>(ctx.GetPlace()),
+                                 outs[i]->numel(), NCCLTypeWrapper<T>::type,
+                                 root, comm->comms_[idx], stream));
+        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+      }
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
+REGISTER_OP_GPU_KERNEL(ncclBcastSend, ops::NCCLBcastKernel<float>);
+REGISTER_OP_GPU_KERNEL(ncclReduce, ops::NCCLReduceKernel<float>);
+REGISTER_OP_GPU_KERNEL(ncclBcastRecv, ops::NCCLBcastKernel<float>);
diff --git a/python/paddle/v2/framework/tests/test_nccl_reduce_op.py b/python/paddle/v2/framework/tests/test_nccl_reduce_op.py
new file mode 100644
index 0000000000..675ad5766c
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_nccl_reduce_op.py
@@ -0,0 +1,6 @@
+import unittest, os
+import numpy as np
+import paddle.v2 as paddle
+from paddle.v2.framework.op import Operator
+import paddle.v2.framework.core as core
+from op_test import OpTest, create_op, set_input
-- 
GitLab
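
The new test_nccl_reduce_op.py above stops at the imports. As a starting point, a NumPy-only reference for the semantics the ncclReduce kernel is expected to provide (element-wise sum of the per-GPU buffers, delivered only to the root device) might look like the sketch below; the function and variable names are illustrative and not part of this patch.

import numpy as np


def reference_nccl_reduce(inputs_per_gpu, root):
    """Reference semantics of ncclReduce with ncclSum: every GPU contributes
    its buffer and only the root device receives the element-wise sum."""
    reduced = np.sum(np.stack(inputs_per_gpu), axis=0)
    return [reduced if dev == root else None
            for dev in range(len(inputs_per_gpu))]


if __name__ == "__main__":
    gpus = 2
    shape = (4, 32)
    inputs = [np.random.random(shape).astype("float32") for _ in range(gpus)]
    # the GPU kernel picks the root by hashing the input variable name;
    # the reference check simply fixes device 0 as the root
    outs = reference_nccl_reduce(inputs, root=0)
    np.testing.assert_allclose(outs[0], inputs[0] + inputs[1], rtol=1e-5)
    assert outs[1] is None
    print("ncclReduce reference check passed")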