"polish code based on comment"

52200523 · Dong Zhihong · 6cce5268 · 52200523 · 52200523
Show whitespace changes
Inline | Side-by-side

Showing 2 changed files with 26 additions and 3 deletions

paddle/operators/nccl_op.cc paddle/operators/nccl_op.cc +8 -0

paddle/operators/nccl_op.cu paddle/operators/nccl_op.cu +18 -3

No files found.
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -94,6 +94,11 @@ class NCCLReduceOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   " Input(X) of Reduce op input should not be NULL");

+    std::string reduction = ctx->Attrs().Get<std::string>("reduction");
+    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
+                    reduction == "ncclMin" || reduction == "ncclMax"),
+                   "invalid reduction.");
+
    auto x_dims = ctx->GetInputsDim("X");
    ctx->SetOutputsDim("Out", x_dims);
    ctx->ShareLoD("X", /*->*/ "Out");
@@ -150,6 +155,9 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "The input of Reduce op");
    AddInput("Communicator", "Communicator for communicating between gpus");
    AddOutput("Out", "The output of Reduce op");
+    AddAttr<std::string>("reduction",
+                         "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
+        .SetDefault("ncclSum");
    AddAttr<int>("root",
                 "root gpu of the parameter. if not "
                 "set(platform::kInvalidGPUId). hashed by name.")

--- a/paddle/operators/nccl_op.cu
+++ b/paddle/operators/nccl_op.cu
@@ -49,7 +49,6 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
    auto outs = ctx.MultiOutput<LoDTensor>("Out");

    std::string reduction = ctx.Attr<std::string>("reduction");
-
    ncclRedOp_t reduction_op_ = ncclSum;

    if (reduction == "ncclMin") {
@@ -101,8 +100,23 @@ class NCCLReduceKernel : public framework::OpKernel<T> {

    auto ins = ctx.MultiInput<LoDTensor>("X");  // x0, x1, x2
    auto outs = ctx.MultiOutput<LoDTensor>("Out");
-    int root = ctx.Attr<int>("root");

+    std::string reduction = ctx.Attr<std::string>("reduction");
+    ncclRedOp_t reduction_op_ = ncclSum;
+
+    if (reduction == "ncclMin") {
+      reduction_op_ = ncclMin;
+    } else if (reduction == "ncclMax") {
+      reduction_op_ = ncclMax;
+    } else if (reduction == "ncclSum") {
+      reduction_op_ = ncclSum;
+    } else if (reduction == "ncclProd") {
+      reduction_op_ = ncclProd;
+    } else {
+      PADDLE_ENFORCE(false, "Invalid reduction. default ncclSum.");
+    }
+
+    int root = ctx.Attr<int>("root");
    auto* comm = ctx.Input<Communicator>("Communicator");

    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
@@ -128,7 +142,8 @@ class NCCLReduceKernel : public framework::OpKernel<T> {

      PADDLE_ENFORCE(platform::dynload::ncclReduce(
          ins[i]->data<T>(), recvbuffer, ins[i]->numel(),
-          NCCLTypeWrapper<T>::type, ncclSum, root, comm->comms_[idx], stream));
+          NCCLTypeWrapper<T>::type, reduction_op_, root, comm->comms_[idx],
+          stream));
      PADDLE_ENFORCE(cudaStreamSynchronize(stream));

      VLOG(1) << "gpu : " << gpu_id << " finished reduce. send "