Commit 423d7438 authored by Dong Zhihong

"add register gpu macro"

Parent 333045d7
@@ -233,6 +233,10 @@ class OpKernelRegistrar : public Registrar {
  USE_OP_ITSELF(op_type); \
  USE_OP_DEVICE_KERNEL(op_type, CPU);

#define USE_GPU_ONLY_OP(op_type) \
  USE_OP_ITSELF(op_type);        \
  USE_OP_DEVICE_KERNEL(op_type, GPU)

#define USE_OP(op_type)   \
  USE_OP_ITSELF(op_type); \
  USE_OP_KERNEL(op_type)
...
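The new macro mirrors the existing CPU-only variant: it pulls in the operator definition plus its GPU kernel only, so no CPU kernel symbol is needed at link time. A minimal usage sketch, taking ncclAllReduce (registered later in this commit) as the example:

  USE_GPU_ONLY_OP(ncclAllReduce);
  // expands to:
  //   USE_OP_ITSELF(ncclAllReduce);
  //   USE_OP_DEVICE_KERNEL(ncclAllReduce, GPU)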
@@ -80,8 +80,8 @@ function(op_library TARGET)
if ("${TARGET}" STREQUAL "nccl_op") if ("${TARGET}" STREQUAL "nccl_op")
set(pybind_flag 1) set(pybind_flag 1)
# It's enough to just adding one operator to pybind # It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(ncclInit);\n") file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclInit);\n")
# file(APPEND ${pybind_file} "USE_OP(ncclInit);\n") file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
endif() endif()
# reduce_op contains several operators # reduce_op contains several operators
...
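After this change, the generated pybind registration file ends up with GPU-only use declarations for the NCCL ops, i.e. it will contain:

  USE_GPU_ONLY_OP(ncclInit);
  USE_GPU_ONLY_OP(ncclAllReduce);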
if(WITH_GPU)
  nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator)
  nv_test(nccl_gpu_common_test SRCS nccl_gpu_common_test.cc DEPS nccl_common)
endif()
@@ -67,6 +67,54 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
  }
};
// ReduceOp
class NCCLReduceOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of Reduce op should not be NULL");
    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
                   "Input(Communicator) of Reduce op should not be NULL");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of Reduce op should not be NULL");
    // the outputs keep the input shapes and LoD
    auto x_dims = ctx->GetInputsDim("X");
    ctx->SetOutputsDim("Out", x_dims);
    ctx->ShareLoD("X", /*->*/ "Out");
  }
};
// BcastSendOp
class NCCLBcastSendOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of BcastSend op should not be NULL");
    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
                   "Input(Communicator) of BcastSend op should not be NULL");
  }
};
// BcastRecvOp
class NCCLBcastRecvOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
                   "Input(Communicator) of BcastRecv op should not be NULL");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of BcastRecv op should not be NULL");
  }
};
// AllreduceOp
class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
@@ -85,15 +133,31 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  }
};
// BcastSendOp: BcastSend must run on the root device
class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  NCCLBcastSendOpMaker(framework::OpProto *proto,
                       framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of BcastSend op");
    AddInput("Communicator", "Communicator for communicating between gpus");
    AddAttr<int>("root", "root gpu of Bcast");
    AddComment(R"DOC(
Bcast the tensors.
)DOC");
  }
};
// BcastOp
-class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
+class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  NCCLAllBcastOpMaker(framework::OpProto *proto,
+  NCCLBcastRecvOpMaker(framework::OpProto *proto,
                       framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input of Bcast op");
     AddInput("Communicator", "Communicator for communicating between gpus");
-    AddInput("root", "root gpu of Bcast");
+    AddAttr<int>("root", "root gpu of BcastRecv");
+    AddOutput("Out", "The output of Bcast");
     AddComment(R"DOC(
Bcast the tensors.
)DOC");
@@ -108,7 +172,6 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of Reduce op");
    AddInput("Communicator", "Communicator for communicating between gpus");
-   AddInput("root", "root gpu of Reduce");
    AddOutput("Out", "The output of Reduce op");
    AddComment(R"DOC(
Reduce the tensors.
@@ -123,4 +186,10 @@ namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp,
                             ops::NCCLAllReduceOpMaker);
REGISTER_OP_WITHOUT_GRADIENT(ncclInit, ops::NCCLInitOp, ops::NCCLInitOpMaker);
REGISTER_OP_WITHOUT_GRADIENT(ncclBcastSend, ops::NCCLBcastSendOp,
                             ops::NCCLBcastSendOpMaker);
REGISTER_OP_WITHOUT_GRADIENT(ncclBcastRecv, ops::NCCLBcastRecvOp,
                             ops::NCCLBcastRecvOpMaker);
REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, ops::NCCLReduceOp,
                             ops::NCCLReduceOpMaker);
REGISTER_OP_CPU_KERNEL(ncclInit, ops::NCCLInitKernel<float>);
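With the ops registered, a translation unit that needs them at link time can pull them in through the macro added above; a sketch, assuming only the GPU kernels below are registered for these three ops:

  USE_GPU_ONLY_OP(ncclReduce);
  USE_GPU_ONLY_OP(ncclBcastSend);
  USE_GPU_ONLY_OP(ncclBcastRecv);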
@@ -10,6 +10,8 @@ See the License for the specific language governing permissions and
limitations under the License. */

#define EIGEN_USE_GPU
#include <functional>
#include "paddle/operators/nccl_op.h" #include "paddle/operators/nccl_op.h"
namespace paddle { namespace paddle {
@@ -59,8 +61,83 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
  }
};
template <typename T>
class NCCLReduceKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "This kernel only runs on GPU device.");

    auto ins = ctx.MultiInput<Tensor>("X");
    auto outs = ctx.MultiOutput<Tensor>("Out");
    auto* comm = ctx.Input<Communicator>("Communicator");
    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
                      ctx.device_context())
                      .stream();
    // device id
    int device_id =
        boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
    int idx = comm->GetCommId(device_id);

    auto ins_names = ctx.op().Inputs("X");
    for (size_t i = 0; i < ins.size(); ++i) {
      // pick the root deterministically by hashing the input variable name;
      // only the root rank needs a receive buffer
      int root = std::hash<std::string>()(ins_names[i]) % comm->comms_.size();
      T* recvbuffer = nullptr;
      if (root == device_id) {
        recvbuffer = outs[i]->mutable_data<T>(ctx.GetPlace());
      }
      // note: ncclReduce takes the reduction op before the root argument
      PADDLE_ENFORCE(ncclReduce(ins[i]->data<T>(), recvbuffer, ins[i]->numel(),
                                NCCLTypeWrapper<T>::type, ncclSum, root,
                                comm->comms_[idx], stream));
      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
    }
  }
};
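// Root selection note: every rank hashes the same input variable name, so
// each rank computes the same root locally and they all agree without any
// extra communication (this assumes input variable names match across
// ranks), i.e.:
//
//   root = std::hash<std::string>()(name) % nranks;  // identical everywhere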
template <typename T>
class NCCLBcastKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "This kernel only runs on GPU device.");

    int root = ctx.Attr<int>("root");
    auto* comm = ctx.Input<Communicator>("Communicator");
    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
                      ctx.device_context())
                      .stream();
    // device id
    int device_id =
        boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
    int idx = comm->GetCommId(device_id);

    if (idx == root) {
      // the root broadcasts its input tensors
      auto ins = ctx.MultiInput<Tensor>("X");
      for (size_t i = 0; i < ins.size(); ++i) {
        PADDLE_ENFORCE(ncclBcast((void*)ins[i]->data<T>(), ins[i]->numel(),
                                 NCCLTypeWrapper<T>::type, root,
                                 comm->comms_[idx], stream));
        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
      }
    } else {
      // non-root devices receive into their output tensors
      auto outs = ctx.MultiOutput<Tensor>("Out");
      for (size_t i = 0; i < outs.size(); ++i) {
        PADDLE_ENFORCE(ncclBcast(outs[i]->mutable_data<T>(ctx.GetPlace()),
                                 outs[i]->numel(), NCCLTypeWrapper<T>::type,
                                 root, comm->comms_[idx], stream));
        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
      }
    }
  }
};
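// ncclBcast takes a single in/out buffer on every rank: the root reads from
// it while the other ranks write into it. That is why the root branch above
// passes the input ("X") buffers and non-root ranks pass freshly allocated
// output ("Out") buffers; the same kernel therefore backs both the
// ncclBcastSend and ncclBcastRecv registrations below.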
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
REGISTER_OP_GPU_KERNEL(ncclBcastSend, ops::NCCLBcastKernel<float>);
REGISTER_OP_GPU_KERNEL(ncclReduce, ops::NCCLReduceKernel<float>);
REGISTER_OP_GPU_KERNEL(ncclBcastRecv, ops::NCCLBcastKernel<float>);
import unittest, os
import numpy as np
import paddle.v2 as paddle
from paddle.v2.framework.op import Operator
import paddle.v2.framework.core as core
from op_test import OpTest, create_op, set_input