Commit 423d7438 authored by Dong Zhihong

"add register gpu macro"

Parent 333045d7
@@ -233,6 +233,10 @@ class OpKernelRegistrar : public Registrar {
USE_OP_ITSELF(op_type); \
USE_OP_DEVICE_KERNEL(op_type, CPU);
#define USE_GPU_ONLY_OP(op_type) \
USE_OP_ITSELF(op_type); \
USE_OP_DEVICE_KERNEL(op_type, GPU)
#define USE_OP(op_type) \
USE_OP_ITSELF(op_type); \
USE_OP_KERNEL(op_type)
......
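For context, the new USE_GPU_ONLY_OP macro mirrors the CPU-only counterpart shown just above: it links the operator definition into the target but only requires a GPU kernel registration, so operators that ship no CPU kernel (such as ncclAllReduce) can still be pulled into the pybind target. A rough expansion of USE_GPU_ONLY_OP(ncclAllReduce), following the macro definition above:

USE_OP_ITSELF(ncclAllReduce);              // pull in the operator definition
USE_OP_DEVICE_KERNEL(ncclAllReduce, GPU);  // require only the GPU kernel registration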
@@ -80,8 +80,8 @@ function(op_library TARGET)
if ("${TARGET}" STREQUAL "nccl_op")
set(pybind_flag 1)
# It's enough to just add one operator to pybind
file(APPEND ${pybind_file} "USE_OP(ncclInit);\n")
# file(APPEND ${pybind_file} "USE_OP(ncclInit);\n")
file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclInit);\n")
file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
endif()
# reduce_op contains several operators
......
if(WITH_GPU)
nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator)
nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
nv_test(nccl_gpu_common_test SRCS nccl_gpu_common_test.cc DEPS nccl_common)
endif()
@@ -67,6 +67,54 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
}
};
// ReduceOp
class NCCLReduceOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
" Input(X) of Reduce op input should not be NULL");
PADDLE_ENFORCE(
ctx->HasInput("Communicator"),
" Input(Communicator) of Reduce op input should not be NULL");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
" Output(Out) of Reduce op output should not be NULL");
auto x_dims = ctx->GetInputsDim("X");
ctx->SetOutputsDim("Out", x_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
};
// BcastSendOp
class NCCLBcastSendOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
" Input(X) of Bcast op input should not be NULL");
PADDLE_ENFORCE(ctx->HasInput("Communicator"),
" Input(Communicator) of Bcast op input should not be NULL");
}
};
// BcastRecvOp
class NCCLBcastRecvOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Communicator"),
" Input(Communicator) of Bcast op input should not be NULL");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
" Output(Out) of Bcast op output should not be NULL");
}
};
// AllreduceOp
class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
public:
@@ -85,15 +133,31 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
}
};
// BcastSend should be in the root
// BcastSendOp
class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker {
public:
NCCLBcastSendOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input of BcastSend op");
AddInput("Communicator", "Communicator for communicating between gpus");
AddAttr<int>("root", "root gpu of Bcast");
AddComment(R"DOC(
Bcast the tensors.
)DOC");
}
};
// BcastOp
class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
NCCLAllBcastOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
NCCLBcastRecvOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input of Bcast op");
AddInput("Communicator", "Communicator for communicating between gpus");
AddInput("root", "root gpu of Bcast");
AddAttr<int>("root", "root gpu of BcastRecv");
AddOutput("Out", "The output of Bcast");
AddComment(R"DOC(
Bcast the tensors.
)DOC");
@@ -108,7 +172,6 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input of Reduce op");
AddInput("Communicator", "Communicator for communicating between gpus");
AddInput("root", "root gpu of Reduce");
AddOutput("Out", "The output of Reduce op");
AddComment(R"DOC(
Reduce the tensors.
@@ -123,4 +186,10 @@ namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp,
ops::NCCLAllReduceOpMaker);
REGISTER_OP_WITHOUT_GRADIENT(ncclInit, ops::NCCLInitOp, ops::NCCLInitOpMaker);
REGISTER_OP_WITHOUT_GRADIENT(ncclBcastSend, ops::NCCLBcastSendOp,
ops::NCCLBcastSendOpMaker);
REGISTER_OP_WITHOUT_GRADIENT(ncclBcastRecv, ops::NCCLBcastRecvOp,
ops::NCCLBcastRecvOpMaker);
REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, ops::NCCLReduceOp,
ops::NCCLReduceOpMaker);
REGISTER_OP_CPU_KERNEL(ncclInit, ops::NCCLInitKernel<float>);
@@ -10,6 +10,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include <functional>
#include "paddle/operators/nccl_op.h"
namespace paddle {
@@ -59,8 +61,83 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
}
};
template <typename T>
class NCCLReduceKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
auto ins = ctx.MultiInput<Tensor>("X");
auto outs = ctx.MultiOutput<Tensor>("Out");
auto* comm = ctx.Input<Communicator>("Communicator");
auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
ctx.device_context())
.stream();
// device id
int device_id =
boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
int idx = comm->GetCommId(device_id);
for (size_t i = 0; i < ins.size(); ++i) {
// pick a deterministic root for each input so every rank agrees on it
// (assumption: the original intent was hash-based root selection)
int root = std::hash<size_t>()(i) % comm->comms_.size();
T* recvbuffer = nullptr;
if (root == device_id) {
recvbuffer = outs[i]->mutable_data<T>(ctx.GetPlace());
}
PADDLE_ENFORCE(ncclReduce(ins[i]->data<T>(), recvbuffer, ins[i]->numel(),
NCCLTypeWrapper<T>::type, ncclSum, root,
comm->comms_[idx], stream));
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
}
}
};
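For reference, ncclReduce only needs a valid receive buffer on the root rank, and the reduction op precedes the root in its argument list. A minimal standalone sketch of the call, assuming NCCL 2-style signatures and an already-initialized communicator and stream (reduce_to_root is an illustrative helper name, not part of this patch):

#include <nccl.h>

// Reduce `count` floats from every rank onto `root`; non-root ranks may pass
// recvbuff == nullptr because only the root receives the result.
void reduce_to_root(const float* sendbuff, float* recvbuff, size_t count,
                    int root, ncclComm_t comm, cudaStream_t stream) {
  ncclReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, root, comm, stream);
}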
template <typename T>
class NCCLBcastKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
int root = ctx.Attr<int>("root");
auto* comm = ctx.Input<Communicator>("Communicator");
auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
ctx.device_context())
.stream();
// device id
int device_id =
boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
int idx = comm->GetCommId(device_id);
if (idx == root) {
auto ins = ctx.MultiInput<Tensor>("X");
for (size_t i = 0; i < ins.size(); ++i) {
PADDLE_ENFORCE(ncclBcast((void*)ins[i]->data<T>(), ins[i]->numel(),
NCCLTypeWrapper<T>::type, root,
comm->comms_[idx], stream));
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
}
} else {
auto outs = ctx.MultiOutput<Tensor>("Out");
for (size_t i = 0; i < outs.size(); ++i) {
PADDLE_ENFORCE(ncclBcast((void*)outs[i]->mutable_data<T>(ctx.GetPlace()),
outs[i]->numel(), NCCLTypeWrapper<T>::type,
root, comm->comms_[idx], stream));
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
REGISTER_OP_GPU_KERNEL(ncclBcastSend, ops::NCCLBcastKernel<float>);
REGISTER_OP_GPU_KERNEL(ncclReduce, ops::NCCLReduceKernel<float>);
REGISTER_OP_GPU_KERNEL(ncclBcastRecv, ops::NCCLBcastKernel<float>);
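Both Bcast registrations above share one kernel because ncclBcast is symmetric: every rank issues the same call, the root's buffer is read and every other rank's buffer is overwritten in place. A minimal sketch under the same assumptions as the reduce sketch above (bcast_from_root is an illustrative name):

#include <nccl.h>

// Broadcast `count` floats from `root`; the same call runs on every rank, and
// on non-root ranks `buff` is filled with the broadcast result.
void bcast_from_root(float* buff, size_t count, int root, ncclComm_t comm,
                     cudaStream_t stream) {
  ncclBcast(buff, count, ncclFloat, root, comm, stream);
}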
import unittest, os
import numpy as np
import paddle.v2 as paddle
from paddle.v2.framework.op import Operator
import paddle.v2.framework.core as core
from op_test import OpTest, create_op, set_input