diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 09989c374c6ca3f4cebb7281fcb49b1a5f744ef5..32362503664eb217cba0f2bc8cfc8434cc9248a1 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -290,11 +290,12 @@ class ExecutionContext {
     return device_context_;
   }
 
-  //! Get a input which has multiple variables.
+  //! Get variables vector with same input name.
   const std::vector<std::string>& Inputs(const std::string& name) const {
     return op_.Inputs(name);
   }
-  //! Get an output which has multiple variables.
+
+  //! Get variables vector with same output name.
   const std::vector<std::string>& Outputs(const std::string& name) const {
     return op_.Outputs(name);
   }
diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h
index 0d71eddf021eab797e5afdbf3996697672f4f5e4..5858cd4839d367bb888b2b98cde2225751391162 100644
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@@ -30,6 +30,8 @@ namespace paddle {
 namespace platform {
 
+constexpr int kInvalidGPUId = -1;
+
 struct Communicator {
   std::vector<ncclComm_t> comms_;
   std::unordered_map<int, int> comm_id_map_;
 
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc
index 6a0589cb20823a23934fa2fcb9ad5c27945c24f4..4f3a2f2768f8cc1d1f257977d806ed9b7fa8b8cb 100644
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -69,10 +69,10 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
 
     auto x_dims = ctx->GetInputsDim("X");
 
-    // std::string reduction = ctx->Attrs().Get<std::string>("reduction");
-    // PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
-    //                 reduction == "ncclMin" || reduction == "ncclMax"),
-    //                "invalid reduction.");
+    std::string reduction = ctx->Attrs().Get<std::string>("reduction");
+    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
+                    reduction == "ncclMin" || reduction == "ncclMax"),
+                   "invalid reduction.");
 
     ctx->SetOutputsDim("Out", x_dims);
     ctx->ShareLoD("X", /*->*/ "Out");
@@ -115,7 +115,7 @@ class NCCLBcastOp : public framework::OperatorWithKernel {
                    " Output(Out) of Bcast op output should not be NULL");
 
     int root = ctx->Attrs().Get<int>("root");
-    PADDLE_ENFORCE(root != -1, "Bcast root must be set.");
+    PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set.");
 
     auto x_dims = ctx->GetInputsDim("X");
     ctx->SetOutputsDim("Out", x_dims);
@@ -132,9 +132,9 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "The input of AllReduce op");
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of AllReduce op");
-    // AddAttr<std::string>("reduction",
-    //                      "{'ncclmin', 'ncclmax', 'ncclprod', 'ncclsum'}.");
-    // AddAttr<std::vector<int>>("gpus", "gpu id lists");
+    AddAttr<std::string>("reduction",
+                         "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
+        .SetDefault("ncclSum");
     AddComment(R"DOC(
             AllReduce the input tensors.
         )DOC");
@@ -151,8 +151,9 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of Reduce op");
     AddAttr<int>("root",
-                 "root gpu of the parameter. if not set(-1). hashed by name.")
-        .SetDefault(-1);
+                 "root gpu of the parameter. if not "
+                 "set(platform::kInvalidGPUId). hashed by name.")
+        .SetDefault(platform::kInvalidGPUId);
     AddComment(R"DOC(
             Reduce the tensors)DOC");
   }
@@ -168,8 +169,9 @@ class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of Bcast");
     AddAttr<int>("root",
-                 "root gpu of the parameter. if not set(-1). hashed by name.")
-        .SetDefault(-1);
+                 "root gpu of the parameter. if not "
+                 "set(platform::kInvalidGPUId). hashed by name.")
+        .SetDefault(platform::kInvalidGPUId);
     AddComment(R"DOC(
             Bcast the tensors.
         )DOC");
diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu
index 1eef2f218f403fbecc367fd171dae4fc6ad86ba4..cc01db80ca1a14d1dce8f33ee766b402794d73b0 100644
--- a/paddle/operators/nccl_op.cu
+++ b/paddle/operators/nccl_op.cu
@@ -48,11 +48,28 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
     auto ins = ctx.MultiInput<LoDTensor>("X");
     auto outs = ctx.MultiOutput<LoDTensor>("Out");
 
+    std::string reduction = ctx.Attr<std::string>("reduction");
+
+    ncclRedOp_t reduction_op_ = ncclSum;
+
+    if (reduction == "ncclMin") {
+      reduction_op_ = ncclMin;
+    } else if (reduction == "ncclMax") {
+      reduction_op_ = ncclMax;
+    } else if (reduction == "ncclSum") {
+      reduction_op_ = ncclSum;
+    } else if (reduction == "ncclProd") {
+      reduction_op_ = ncclProd;
+    } else {
+      PADDLE_ENFORCE(false, "Invalid reduction. default ncclSum.");
+    }
+
     auto* comm = ctx.Input<Communicator>("Communicator");
 
     auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
                       ctx.device_context())
                       .stream();
+    // device id
     int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
     int idx = comm->GetCommId(gpu_id);
 
@@ -64,7 +81,7 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
 
       PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
           ins[i]->data<T>(), outs[i]->mutable_data<T>(ctx.GetPlace()),
-          outs[i]->numel(), NCCLTypeWrapper<T>::type, ncclSum,
+          outs[i]->numel(), NCCLTypeWrapper<T>::type, reduction_op_,
           comm->comms_[idx], stream));
       PADDLE_ENFORCE(cudaStreamSynchronize(stream));
 
@@ -98,7 +115,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
     auto ins_names = ctx.Inputs("X");
     std::hash<std::string> hasher;
     for (size_t i = 0; i < ins.size(); ++i) {
-      if (root == -1) {
+      if (root == platform::kInvalidGPUId) {
         root = hasher(ins_names[i]) % comm->comms_.size();
       }
       T* recvbuffer = nullptr;