diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 09989c374c6ca3f4cebb7281fcb49b1a5f744ef5..32362503664eb217cba0f2bc8cfc8434cc9248a1 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -290,11 +290,12 @@ class ExecutionContext {
     return device_context_;
   }
 
-  //! Get a input which has multiple variables.
+  //! Get variables vector with same input name.
   const std::vector<std::string>& Inputs(const std::string& name) const {
     return op_.Inputs(name);
   }
-  //! Get an output which has multiple variables.
+
+  //! Get variables vector with same output name.
   const std::vector<std::string>& Outputs(const std::string& name) const {
     return op_.Outputs(name);
   }
diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h
index 0d71eddf021eab797e5afdbf3996697672f4f5e4..5858cd4839d367bb888b2b98cde2225751391162 100644
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@@ -30,6 +30,8 @@ namespace paddle {
 namespace platform {
 
+constexpr int kInvalidGPUId = -1;
+
 struct Communicator {
   std::vector<ncclComm_t> comms_;
   std::unordered_map<int, int> comm_id_map_;
 
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc
index 6a0589cb20823a23934fa2fcb9ad5c27945c24f4..4f3a2f2768f8cc1d1f257977d806ed9b7fa8b8cb 100644
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -69,10 +69,10 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
 
     auto x_dims = ctx->GetInputsDim("X");
 
-    // std::string reduction = ctx->Attrs().Get<std::string>("reduction");
-    // PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
-    //                 reduction == "ncclMin" || reduction == "ncclMax"),
-    //                "invalid reduction.");
+    std::string reduction = ctx->Attrs().Get<std::string>("reduction");
+    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
+                    reduction == "ncclMin" || reduction == "ncclMax"),
+                   "invalid reduction.");
 
     ctx->SetOutputsDim("Out", x_dims);
     ctx->ShareLoD("X", /*->*/ "Out");
@@ -115,7 +115,7 @@ class NCCLBcastOp : public framework::OperatorWithKernel {
                    " Output(Out) of Bcast op output should not be NULL");
 
     int root = ctx->Attrs().Get<int>("root");
-    PADDLE_ENFORCE(root != -1, "Bcast root must be set.");
+    PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set.");
 
     auto x_dims = ctx->GetInputsDim("X");
     ctx->SetOutputsDim("Out", x_dims);
@@ -132,9 +132,9 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "The input of AllReduce op");
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of AllReduce op");
-    // AddAttr<std::string>("reduction",
-    //                      "{'ncclmin', 'ncclmax', 'ncclprod', 'ncclsum'}.");
-    // AddAttr<std::vector<int>>("gpus", "gpu id lists");
+    AddAttr<std::string>("reduction",
+                         "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
+        .SetDefault("ncclSum");
     AddComment(R"DOC(
             AllReduce the input tensors.
         )DOC");
@@ -151,8 +151,9 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of Reduce op");
     AddAttr<int>("root",
-                 "root gpu of the parameter. if not set(-1). hashed by name.")
-        .SetDefault(-1);
+                 "root gpu of the parameter. if not "
+                 "set(platform::kInvalidGPUId). hashed by name.")
+        .SetDefault(platform::kInvalidGPUId);
     AddComment(R"DOC(
             Reduce the tensors)DOC");
   }
@@ -168,8 +169,9 @@ class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of Bcast");
     AddAttr<int>("root",
-                 "root gpu of the parameter. if not set(-1). hashed by name.")
-        .SetDefault(-1);
+                 "root gpu of the parameter. if not "
+                 "set(platform::kInvalidGPUId). hashed by name.")
+        .SetDefault(platform::kInvalidGPUId);
     AddComment(R"DOC(
             Bcast the tensors.
         )DOC");
diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu
index 1eef2f218f403fbecc367fd171dae4fc6ad86ba4..cc01db80ca1a14d1dce8f33ee766b402794d73b0 100644
--- a/paddle/operators/nccl_op.cu
+++ b/paddle/operators/nccl_op.cu
@@ -48,11 +48,28 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
     auto ins = ctx.MultiInput<LoDTensor>("X");
     auto outs = ctx.MultiOutput<LoDTensor>("Out");
 
+    std::string reduction = ctx.Attr<std::string>("reduction");
+
+    ncclRedOp_t reduction_op_ = ncclSum;
+
+    if (reduction == "ncclMin") {
+      reduction_op_ = ncclMin;
+    } else if (reduction == "ncclMax") {
+      reduction_op_ = ncclMax;
+    } else if (reduction == "ncclSum") {
+      reduction_op_ = ncclSum;
+    } else if (reduction == "ncclProd") {
+      reduction_op_ = ncclProd;
+    } else {
+      PADDLE_ENFORCE(false, "Invalid reduction. default ncclSum.");
+    }
+
     auto* comm = ctx.Input<Communicator>("Communicator");
 
     auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
                       ctx.device_context())
                       .stream();
+    // device id
     int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
     int idx = comm->GetCommId(gpu_id);
 
@@ -64,7 +81,7 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
 
       PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
           ins[i]->data<T>(), outs[i]->mutable_data<T>(ctx.GetPlace()),
-          outs[i]->numel(), NCCLTypeWrapper<T>::type, ncclSum,
+          outs[i]->numel(), NCCLTypeWrapper<T>::type, reduction_op_,
           comm->comms_[idx], stream));
       PADDLE_ENFORCE(cudaStreamSynchronize(stream));
 
@@ -98,7 +115,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
     auto ins_names = ctx.Inputs("X");
     std::hash<std::string> hasher;
     for (size_t i = 0; i < ins.size(); ++i) {
-      if (root == -1) {
+      if (root == platform::kInvalidGPUId) {
         root = hasher(ins_names[i]) % comm->comms_.size();
       }
       T* recvbuffer = nullptr;