From 423d7438a1960b4314fff0db873197acd92ec5c3 Mon Sep 17 00:00:00 2001
From: Dong Zhihong
Date: Mon, 23 Oct 2017 14:03:17 -0700
Subject: [PATCH] "add register gpu macro"

---
 paddle/framework/op_registry.h                    |  4 +
 paddle/operators/CMakeLists.txt                   |  4 +-
 paddle/operators/nccl/CMakeLists.txt              |  2 +-
 paddle/operators/nccl_op.cc                       | 81 +++++++++++++++++--
 paddle/operators/nccl_op.cu                       | 77 ++++++++++++++++++
 .../v2/framework/tests/test_nccl_reduce_op.py     |  6 ++
 6 files changed, 165 insertions(+), 9 deletions(-)
 create mode 100644 python/paddle/v2/framework/tests/test_nccl_reduce_op.py

diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 226e8ddcd4..6ab65ef5e7 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -233,6 +233,10 @@ class OpKernelRegistrar : public Registrar {
   USE_OP_ITSELF(op_type);        \
   USE_OP_DEVICE_KERNEL(op_type, CPU);
 
+#define USE_GPU_ONLY_OP(op_type) \
+  USE_OP_ITSELF(op_type);        \
+  USE_OP_DEVICE_KERNEL(op_type, GPU)
+
 #define USE_OP(op_type)   \
   USE_OP_ITSELF(op_type); \
   USE_OP_KERNEL(op_type)
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 4faf9bbb08..0ea1037a7b 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -80,8 +80,8 @@ function(op_library TARGET)
   if ("${TARGET}" STREQUAL "nccl_op")
     set(pybind_flag 1)
     # It's enough to just adding one operator to pybind
-    file(APPEND ${pybind_file} "USE_OP(ncclInit);\n")
-    # file(APPEND ${pybind_file} "USE_OP(ncclInit);\n")
+    file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclInit);\n")
+    file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
   endif()
 
   # reduce_op contains several operators
diff --git a/paddle/operators/nccl/CMakeLists.txt b/paddle/operators/nccl/CMakeLists.txt
index bdd873b3f3..21cc1d9ee9 100644
--- a/paddle/operators/nccl/CMakeLists.txt
+++ b/paddle/operators/nccl/CMakeLists.txt
@@ -1,4 +1,4 @@
 if(WITH_GPU)
-  nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator)
+  nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
   nv_test(nccl_gpu_common_test SRCS nccl_gpu_common_test.cc DEPS nccl_common)
 endif()
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc
index 91584a377e..f0f7b205b6 100644
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -67,6 +67,54 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
   }
 };
 
+// ReduceOp
+class NCCLReduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   " Input(X) of Reduce op input should not be NULL");
+    PADDLE_ENFORCE(
+        ctx->HasInput("Communicator"),
+        " Input(Communicator) of Reduce op input should not be NULL");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   " Output(Out) of Reduce op output should not be NULL");
+
+    auto x_dims = ctx->GetInputsDim("X");
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+// BcastSendOp
+class NCCLBcastSendOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   " Input(X) of Bcast op input should not be NULL");
+    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
+                   " Input(Communicator) of Bcast op input should not be NULL");
+  }
+};
+
+// BcastRecvOp
+class NCCLBcastRecvOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
+                   " Input(Communicator) of Bcast op input should not be NULL");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   " Output(Out) of Bcast op output should not be NULL");
+  }
+};
+
 // AllreduceOp
 class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
@@ -85,15 +133,31 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+// BcastSend should be in the root
+// BcastSendOp
+class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  NCCLBcastSendOpMaker(framework::OpProto *proto,
+                       framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of BcastSend op");
+    AddInput("Communicator", "Communicator for communicating between gpus");
+    AddAttr<int>("root", "root gpu of Bcast");
+    AddComment(R"DOC(
+        Bcast the tensors.
+    )DOC");
+  }
+};
+
 // BcastOp
-class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
+class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLAllBcastOpMaker(framework::OpProto *proto,
-                      framework::OpAttrChecker *op_checker)
+  NCCLBcastRecvOpMaker(framework::OpProto *proto,
+                       framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input of Bcast op");
     AddInput("Communicator", "Communicator for communicating between gpus");
-    AddInput("root", "root gpu of Bcast");
+    AddAttr<int>("root", "root gpu of BcastRecv");
+    AddOutput("Out", "The output of Bcast");
     AddComment(R"DOC(
         Bcast the tensors.
     )DOC");
@@ -108,7 +172,6 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of Reduce op");
     AddInput("Communicator", "Communicator for communicating between gpus");
-    AddInput("root", "root gpu of Reduce");
     AddOutput("Out", "The output of Reduce op");
     AddComment(R"DOC(
         Reduce the tensors.
@@ -123,4 +186,10 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp,
                              ops::NCCLAllReduceOpMaker);
 REGISTER_OP_WITHOUT_GRADIENT(ncclInit, ops::NCCLInitOp, ops::NCCLInitOpMaker);
+REGISTER_OP_WITHOUT_GRADIENT(ncclBcastSend, ops::NCCLBcastSendOp,
+                             ops::NCCLBcastSendOpMaker);
+REGISTER_OP_WITHOUT_GRADIENT(ncclBcastRecv, ops::NCCLBcastRecvOp,
+                             ops::NCCLBcastRecvOpMaker);
+REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, ops::NCCLReduceOp,
+                             ops::NCCLReduceOpMaker);
 REGISTER_OP_CPU_KERNEL(ncclInit, ops::NCCLInitKernel<float>);
diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu
index 6b0a325d17..4d91a3055f 100644
--- a/paddle/operators/nccl_op.cu
+++ b/paddle/operators/nccl_op.cu
@@ -10,6 +10,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
+#include <functional>
+
 #include "paddle/operators/nccl_op.h"
 
 namespace paddle {
@@ -59,8 +61,83 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename T>
+class NCCLReduceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+
+    auto ins = ctx.MultiInput<Tensor>("X");
+    auto outs = ctx.MultiOutput<Tensor>("Out");
+
+    auto* comm = ctx.Input<Communicator>("Communicator");
+
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                      ctx.device_context())
+                      .stream();
+    // device id
+    int device_id =
+        boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+    int idx = comm->GetCommId(device_id);
+
+    auto ins_names = ctx.Inputs("X");
+    for (size_t i = 0; i < ins.size(); ++i) {
+      // hash the input's name so that every device agrees on the same root
+      int root = std::hash<std::string>()(ins_names[i]) % comm->comms_.size();
+      T* recvbuffer = nullptr;
+      if (root == device_id) {
+        recvbuffer = outs[i]->mutable_data<T>(ctx.GetPlace());
+      }
+      PADDLE_ENFORCE(ncclReduce(ins[i]->data<T>(), recvbuffer, ins[i]->numel(),
+                                NCCLTypeWrapper<T>::type, ncclSum, root,
+                                comm->comms_[idx], stream));
+      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+    }
+  }
+};
+
+template <typename T>
+class NCCLBcastKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+
+    int root = ctx.Attr<int>("root");
+
+    auto* comm = ctx.Input<Communicator>("Communicator");
+
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                      ctx.device_context())
+                      .stream();
+    // device id
+    int device_id =
+        boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+    int idx = comm->GetCommId(device_id);
+    if (idx == root) {
+      auto ins = ctx.MultiInput<Tensor>("X");
+      for (size_t i = 0; i < ins.size(); ++i) {
+        PADDLE_ENFORCE(ncclBcast((void*)ins[i]->data<T>(), ins[i]->numel(),
+                                 NCCLTypeWrapper<T>::type, root,
+                                 comm->comms_[idx], stream));
+        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+      }
+    } else {
+      auto outs = ctx.MultiOutput<Tensor>("Out");
+      for (size_t i = 0; i < outs.size(); ++i) {
+        PADDLE_ENFORCE(ncclBcast((void*)outs[i]->mutable_data<T>(ctx.GetPlace()),
+                                 outs[i]->numel(), NCCLTypeWrapper<T>::type,
+                                 root, comm->comms_[idx], stream));
+        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+      }
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
+REGISTER_OP_GPU_KERNEL(ncclBcastSend, ops::NCCLBcastKernel<float>);
+REGISTER_OP_GPU_KERNEL(ncclReduce, ops::NCCLReduceKernel<float>);
+REGISTER_OP_GPU_KERNEL(ncclBcastRecv, ops::NCCLBcastKernel<float>);
diff --git a/python/paddle/v2/framework/tests/test_nccl_reduce_op.py b/python/paddle/v2/framework/tests/test_nccl_reduce_op.py
new file mode 100644
index 0000000000..675ad5766c
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_nccl_reduce_op.py
@@ -0,0 +1,6 @@
+import unittest, os
+import numpy as np
+import paddle.v2 as paddle
+from paddle.v2.framework.op import Operator
+import paddle.v2.framework.core as core
+from op_test import OpTest, create_op, set_input
-- 
GitLab
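
The new test_nccl_reduce_op.py above stops at the imports. As a starting point, a NumPy-only reference for the semantics the ncclReduce kernel is expected to provide (element-wise sum of the per-GPU buffers, delivered only to the root device) might look like the sketch below; the function and variable names are illustrative and not part of this patch.

import numpy as np


def reference_nccl_reduce(inputs_per_gpu, root):
    """Reference semantics of ncclReduce with ncclSum: every GPU contributes
    its buffer and only the root device receives the element-wise sum."""
    reduced = np.sum(np.stack(inputs_per_gpu), axis=0)
    return [reduced if dev == root else None
            for dev in range(len(inputs_per_gpu))]


if __name__ == "__main__":
    gpus = 2
    shape = (4, 32)
    inputs = [np.random.random(shape).astype("float32") for _ in range(gpus)]
    # the GPU kernel picks the root by hashing the input variable name;
    # the reference check simply fixes device 0 as the root
    outs = reference_nccl_reduce(inputs, root=0)
    np.testing.assert_allclose(outs[0], inputs[0] + inputs[1], rtol=1e-5)
    assert outs[1] is None
    print("ncclReduce reference check passed")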