diff --git a/paddle/fluid/operators/nccl_op.cc b/paddle/fluid/operators/nccl_op.cc
index 329656d26da0d32a4e30dd2aeecb9f7aa7f9a84d..5e4ed886b10bd48bf991ce84a9099611cf5d1d26 100644
--- a/paddle/fluid/operators/nccl_op.cc
+++ b/paddle/fluid/operators/nccl_op.cc
@@ -104,19 +104,38 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
                    " Input(Communicator) of AllReduce op input should not be NULL");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    " Output(Out) of AllReduce op output should not be NULL");
-
-    auto x_dims = ctx->GetInputsDim("X");
-
     std::string reduction = ctx->Attrs().Get<std::string>("reduction");
     PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
                     reduction == "ncclMin" || reduction == "ncclMax"),
                    "invalid reduction.");
 
+    auto x_dims = ctx->GetInputsDim("X");
     ctx->SetOutputsDim("Out", x_dims);
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
+// AllReduceOp
+class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of AllReduce op");
+    AddInput("Communicator", "Communicator for communicating between gpus");
+    AddOutput("Out", "The output of AllReduce op");
+    AddAttr<std::string>("reduction",
+                         "(string, default 'ncclSum') "
+                         "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
+        .SetDefault("ncclSum");
+    AddComment(R"DOC(
+NCCLAllReduce Operator.
+
+AllReduce the input tensors.
+
+)DOC");
+  }
+};
+
 // ReduceOp
 class NCCLReduceOp : public framework::OperatorWithKernel {
  public:
@@ -143,50 +162,6 @@ class NCCLReduceOp : public framework::OperatorWithKernel {
   }
 };
 
-// BcastOp
-class NCCLBcastOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of Bcast op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
-                   " Input(Communicator) of Bcast op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Output(Out) of Bcast op output should not be NULL");
-
-    int root = ctx->Attrs().Get<int>("root");
-    PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set.");
-
-    auto x_dims = ctx->GetInputsDim("X");
-    ctx->SetOutputsDim("Out", x_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-// AllreduceOp
-class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input of AllReduce op");
-    AddInput("Communicator", "Communicator for communicating between gpus");
-    AddOutput("Out", "The output of AllReduce op");
-    AddAttr<std::string>("reduction",
-                         "(string, default 'ncclSum') "
-                         "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
-        .SetDefault("ncclSum");
-    AddComment(R"DOC(
-NCCLAllReduce Operator.
-
-AllReduce the input tensors.
-
-)DOC");
-  }
-};
-
 // ReduceOp
 class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
@@ -213,6 +188,29 @@ Reduce the tensors.
   }
 };
 
+// BcastOp
+class NCCLBcastOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   " Input(X) of Bcast op input should not be NULL");
+    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
+                   " Input(Communicator) of Bcast op input should not be NULL");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   " Output(Out) of Bcast op output should not be NULL");
+
+    int root = ctx->Attrs().Get<int>("root");
+    PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set.");
+
+    auto x_dims = ctx->GetInputsDim("X");
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
 // BcastOp
 class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
diff --git a/paddle/fluid/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc
index 683a520e99fe72875d52393a18463324a8b6c3f2..4d83a70e7334a84bb98bd52f0172f6b7ecedb58d 100644
--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
@@ -43,13 +43,12 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-
-    auto ins = ctx.MultiInput<LoDTensor>("X");
-    auto outs = ctx.MultiOutput<LoDTensor>("Out");
-
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* out = ctx.Output<LoDTensor>("Out");
+    auto* comm = ctx.Input<Communicator>("Communicator");
     std::string reduction = ctx.Attr<std::string>("reduction");
-    ncclRedOp_t reduction_op_ = ncclSum;
 
+    ncclRedOp_t reduction_op_ = ncclSum;
     if (reduction == "ncclMin") {
       reduction_op_ = ncclMin;
     } else if (reduction == "ncclMax") {
@@ -61,30 +60,19 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
     } else {
       PADDLE_THROW("Invalid reduction. default ncclSum.");
     }
-
-    auto* comm = ctx.Input<Communicator>("Communicator");
-
-    auto stream = ctx.cuda_device_context().stream();
-
     // device id
     int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
     int idx = comm->GetCommId(gpu_id);
-
-    for (size_t i = 0; i < ins.size(); ++i) {
-      VLOG(1) << "gpu : "
-              << " invoke allreduce. send " << ins[i]->numel() << " recv "
-              << outs[i]->numel();
-
-      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-          ins[i]->data<T>(), outs[i]->mutable_data<T>(ctx.GetPlace()),
-          outs[i]->numel(), NCCLTypeWrapper<T>::type, reduction_op_,
-          comm->comms().at(idx), stream));
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
-
-      VLOG(1) << "gpu : "
-              << " finished allreduce. send " << ins[i]->numel() << " recv "
-              << outs[i]->numel();
-    }
+    VLOG(3) << "gpu : "
+            << " invoke allreduce. send " << x->numel() << " recv "
+            << out->numel();
+    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+        x->data<T>(), out->mutable_data<T>(ctx.GetPlace()), out->numel(),
+        NCCLTypeWrapper<T>::type, reduction_op_, comm->comms().at(idx),
+        ctx.cuda_device_context().stream()));
+    VLOG(3) << "gpu : "
+            << " finished allreduce. send " << x->numel() << " recv "
+            << out->numel();
   }
 };
 
@@ -94,13 +82,13 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-
-    auto ins = ctx.MultiInput<LoDTensor>("X");  // x0, x1, x2
-    auto outs = ctx.MultiOutput<LoDTensor>("Out");
-
+    auto x = ctx.Input<LoDTensor>("X");  // x0, x1, x2
+    auto out = ctx.Output<LoDTensor>("Out");
+    auto* comm = ctx.Input<Communicator>("Communicator");
+    int root = ctx.Attr<int>("root");
     std::string reduction = ctx.Attr<std::string>("reduction");
-    ncclRedOp_t reduction_op_ = ncclSum;
 
+    ncclRedOp_t reduction_op_ = ncclSum;
     if (reduction == "ncclMin") {
       reduction_op_ = ncclMin;
     } else if (reduction == "ncclMax") {
@@ -112,40 +100,21 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
     } else {
       PADDLE_THROW("Invalid reduction. default ncclSum.");
     }
-
-    int root = ctx.Attr<int>("root");
-    auto* comm = ctx.Input<Communicator>("Communicator");
-
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                      ctx.device_context())
-                      .stream();
     // device id
     int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
     int idx = comm->GetCommId(gpu_id);
-
-    auto ins_names = ctx.Inputs("X");
-    std::hash<std::string> hasher;
-    for (size_t i = 0; i < ins.size(); ++i) {
-      if (root == platform::kInvalidGPUId) {
-        root = hasher(ins_names[i]) % comm->comms().size();
-      }
-      T* recvbuffer = nullptr;
-      if (root == gpu_id) {
-        recvbuffer = outs[i]->mutable_data<T>(ctx.GetPlace());
-      }
-
-      VLOG(1) << "gpu : " << gpu_id << " invoke reduce. send "
-              << ins[i]->numel() << " recv " << outs[i]->numel();
-
-      PADDLE_ENFORCE(platform::dynload::ncclReduce(
-          ins[i]->data<T>(), recvbuffer, ins[i]->numel(),
-          NCCLTypeWrapper<T>::type, reduction_op_, root, comm->comms().at(idx),
-          stream));
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
-
-      VLOG(1) << "gpu : " << gpu_id << " finished reduce. send "
-              << ins[i]->numel() << " recv " << outs[i]->numel();
+    T* recvbuffer = nullptr;
+    if (root == gpu_id) {
+      recvbuffer = out->mutable_data<T>(ctx.GetPlace());
     }
+    VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
+            << " recv " << out->numel();
+    PADDLE_ENFORCE(platform::dynload::ncclReduce(
+        x->data<T>(), recvbuffer, x->numel(), NCCLTypeWrapper<T>::type,
+        reduction_op_, root, comm->comms().at(idx),
+        ctx.cuda_device_context().stream()));
+    VLOG(3) << "gpu : " << gpu_id << " finished reduce. send " << x->numel()
+            << " recv " << out->numel();
   }
 };
 
@@ -155,47 +124,27 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-
     int root = ctx.Attr<int>("root");
-
     auto* comm = ctx.Input<Communicator>("Communicator");
-
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                      ctx.device_context())
-                      .stream();
     // device id
     int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
     int idx = comm->GetCommId(gpu_id);
-
     if (idx == root) {
-      auto ins = ctx.MultiInput<LoDTensor>("X");
-      for (size_t i = 0; i < ins.size(); ++i) {
-        VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. send "
-                << ins[i]->numel();
-
-        VLOG(1) << " before ncclBcast";
-        PADDLE_ENFORCE(platform::dynload::ncclBcast(
-            (void*)ins[i]->data<T>(), ins[i]->numel(), NCCLTypeWrapper<T>::type,
-            root, comm->comms().at(idx), stream));
-        VLOG(1) << " after ncclBcast";
-        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
-
-        VLOG(1) << "gpu : " << gpu_id << " finished Bcast.";
-      }
+      auto* x = ctx.Input<LoDTensor>("X");
+      VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
+      PADDLE_ENFORCE(platform::dynload::ncclBcast(
+          (void*)x->data<T>(), x->numel(), NCCLTypeWrapper<T>::type, root,
+          comm->comms().at(idx), ctx.cuda_device_context().stream()));
+      VLOG(3) << "gpu : " << gpu_id << " finished Bcast.";
     } else {
-      auto outs = ctx.MultiOutput<LoDTensor>("Out");
-      for (size_t i = 0; i < outs.size(); ++i) {
-        VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
-                << framework::product(outs[i]->dims());
-
-        PADDLE_ENFORCE(platform::dynload::ncclBcast(
-            outs[i]->mutable_data<T>(ctx.GetPlace()), outs[i]->numel(),
-            NCCLTypeWrapper<T>::type, root, comm->comms().at(idx), stream));
-        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
-
-        VLOG(1) << "gpu : " << gpu_id << " finished Bcast. recv "
-                << outs[i]->numel();
-      }
+      auto* out = ctx.Output<LoDTensor>("Out");
+      VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
+              << framework::product(out->dims());
+      PADDLE_ENFORCE(platform::dynload::ncclBcast(
+          out->mutable_data<T>(ctx.GetPlace()), out->numel(),
+          NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
+          ctx.cuda_device_context().stream()));
+      VLOG(3) << "gpu : " << gpu_id << " finished Bcast. recv " << out->numel();
     }
   }
 };