diff --git a/paddle/fluid/operators/nccl_op.cc b/paddle/fluid/operators/nccl_op.cc
index 329656d26da0d32a4e30dd2aeecb9f7aa7f9a84d..5e4ed886b10bd48bf991ce84a9099611cf5d1d26 100644
--- a/paddle/fluid/operators/nccl_op.cc
+++ b/paddle/fluid/operators/nccl_op.cc
@@ -104,19 +104,38 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
                    " Input(Communicator) of AllReduce op input should not be NULL");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    " Output(Out) of AllReduce op output should not be NULL");
-
-    auto x_dims = ctx->GetInputsDim("X");
-
     std::string reduction = ctx->Attrs().Get<std::string>("reduction");
     PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
                     reduction == "ncclMin" || reduction == "ncclMax"),
                    "invalid reduction.");
 
+    auto x_dims = ctx->GetInputsDim("X");
     ctx->SetOutputsDim("Out", x_dims);
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
+// AllReduceOp
+class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of AllReduce op");
+    AddInput("Communicator", "Communicator for communicating between gpus");
+    AddOutput("Out", "The output of AllReduce op");
+    AddAttr<std::string>("reduction",
+                         "(string, default 'ncclSum') "
+                         "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
+        .SetDefault("ncclSum");
+    AddComment(R"DOC(
+NCCLAllReduce Operator.
+
+AllReduce the input tensors.
+
+)DOC");
+  }
+};
+
 // ReduceOp
 class NCCLReduceOp : public framework::OperatorWithKernel {
  public:
@@ -143,50 +162,6 @@ class NCCLReduceOp : public framework::OperatorWithKernel {
   }
 };
 
-// BcastOp
-class NCCLBcastOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of Bcast op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
-                   " Input(Communicator) of Bcast op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Output(Out) of Bcast op output should not be NULL");
-
-    int root = ctx->Attrs().Get<int>("root");
-    PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set.");
-
-    auto x_dims = ctx->GetInputsDim("X");
-    ctx->SetOutputsDim("Out", x_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-// AllreduceOp
-class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input of AllReduce op");
-    AddInput("Communicator", "Communicator for communicating between gpus");
-    AddOutput("Out", "The output of AllReduce op");
-    AddAttr<std::string>("reduction",
-                         "(string, default 'ncclSum') "
-                         "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
-        .SetDefault("ncclSum");
-    AddComment(R"DOC(
-NCCLAllReduce Operator.
-
-AllReduce the input tensors.
-
-)DOC");
-  }
-};
-
 // ReduceOp
 class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
@@ -213,6 +188,29 @@ Reduce the tensors.
   }
 };
 
+// BcastOp
+class NCCLBcastOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   " Input(X) of Bcast op input should not be NULL");
+    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
+                   " Input(Communicator) of Bcast op input should not be NULL");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   " Output(Out) of Bcast op output should not be NULL");
+
+    int root = ctx->Attrs().Get<int>("root");
+    PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set.");
+
+    auto x_dims = ctx->GetInputsDim("X");
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
 // BcastOp
 class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
diff --git a/paddle/fluid/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc
index 683a520e99fe72875d52393a18463324a8b6c3f2..4d83a70e7334a84bb98bd52f0172f6b7ecedb58d 100644
--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
@@ -43,13 +43,12 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-
-    auto ins = ctx.MultiInput<LoDTensor>("X");
-    auto outs = ctx.MultiOutput<LoDTensor>("Out");
-
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* out = ctx.Output<LoDTensor>("Out");
+    auto* comm = ctx.Input<Communicator>("Communicator");
     std::string reduction = ctx.Attr<std::string>("reduction");
-    ncclRedOp_t reduction_op_ = ncclSum;
 
+    ncclRedOp_t reduction_op_ = ncclSum;
     if (reduction == "ncclMin") {
       reduction_op_ = ncclMin;
     } else if (reduction == "ncclMax") {
@@ -61,30 +60,19 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
     } else {
       PADDLE_THROW("Invalid reduction. default ncclSum.");
     }
-
-    auto* comm = ctx.Input<Communicator>("Communicator");
-
-    auto stream = ctx.cuda_device_context().stream();
-
     // device id
     int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
     int idx = comm->GetCommId(gpu_id);
-
-    for (size_t i = 0; i < ins.size(); ++i) {
-      VLOG(1) << "gpu : "
-              << " invoke allreduce. send " << ins[i]->numel() << " recv "
-              << outs[i]->numel();
-
-      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-          ins[i]->data<T>(), outs[i]->mutable_data<T>(ctx.GetPlace()),
-          outs[i]->numel(), NCCLTypeWrapper<T>::type, reduction_op_,
-          comm->comms().at(idx), stream));
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
-
-      VLOG(1) << "gpu : "
-              << " finished allreduce. send " << ins[i]->numel() << " recv "
-              << outs[i]->numel();
-    }
+    VLOG(3) << "gpu : "
+            << " invoke allreduce. send " << x->numel() << " recv "
+            << out->numel();
+    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+        x->data<T>(), out->mutable_data<T>(ctx.GetPlace()), out->numel(),
+        NCCLTypeWrapper<T>::type, reduction_op_, comm->comms().at(idx),
+        ctx.cuda_device_context().stream()));
+    VLOG(3) << "gpu : "
+            << " finished allreduce. send " << x->numel() << " recv "
+            << out->numel();
   }
 };
 
@@ -94,13 +82,13 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-
-    auto ins = ctx.MultiInput<LoDTensor>("X");  // x0, x1, x2
-    auto outs = ctx.MultiOutput<LoDTensor>("Out");
-
+    auto x = ctx.Input<LoDTensor>("X");  // x0, x1, x2
+    auto out = ctx.Output<LoDTensor>("Out");
+    auto* comm = ctx.Input<Communicator>("Communicator");
+    int root = ctx.Attr<int>("root");
     std::string reduction = ctx.Attr<std::string>("reduction");
-    ncclRedOp_t reduction_op_ = ncclSum;
 
+    ncclRedOp_t reduction_op_ = ncclSum;
     if (reduction == "ncclMin") {
       reduction_op_ = ncclMin;
     } else if (reduction == "ncclMax") {
@@ -112,40 +100,21 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
     } else {
       PADDLE_THROW("Invalid reduction. default ncclSum.");
     }
-
-    int root = ctx.Attr<int>("root");
-    auto* comm = ctx.Input<Communicator>("Communicator");
-
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                      ctx.device_context())
-                      .stream();
     // device id
     int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
     int idx = comm->GetCommId(gpu_id);
-
-    auto ins_names = ctx.Inputs("X");
-    std::hash<std::string> hasher;
-    for (size_t i = 0; i < ins.size(); ++i) {
-      if (root == platform::kInvalidGPUId) {
-        root = hasher(ins_names[i]) % comm->comms().size();
-      }
-      T* recvbuffer = nullptr;
-      if (root == gpu_id) {
-        recvbuffer = outs[i]->mutable_data<T>(ctx.GetPlace());
-      }
-
-      VLOG(1) << "gpu : " << gpu_id << " invoke reduce. send "
-              << ins[i]->numel() << " recv " << outs[i]->numel();
-
-      PADDLE_ENFORCE(platform::dynload::ncclReduce(
-          ins[i]->data<T>(), recvbuffer, ins[i]->numel(),
-          NCCLTypeWrapper<T>::type, reduction_op_, root, comm->comms().at(idx),
-          stream));
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
-
-      VLOG(1) << "gpu : " << gpu_id << " finished reduce. send "
-              << ins[i]->numel() << " recv " << outs[i]->numel();
+    T* recvbuffer = nullptr;
+    if (root == gpu_id) {
+      recvbuffer = out->mutable_data<T>(ctx.GetPlace());
     }
+    VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
+            << " recv " << out->numel();
+    PADDLE_ENFORCE(platform::dynload::ncclReduce(
+        x->data<T>(), recvbuffer, x->numel(), NCCLTypeWrapper<T>::type,
+        reduction_op_, root, comm->comms().at(idx),
+        ctx.cuda_device_context().stream()));
+    VLOG(3) << "gpu : " << gpu_id << " finished reduce. send " << x->numel()
+            << " recv " << out->numel();
   }
 };
 
@@ -155,47 +124,27 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-
     int root = ctx.Attr<int>("root");
-
     auto* comm = ctx.Input<Communicator>("Communicator");
-
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                      ctx.device_context())
-                      .stream();
     // device id
     int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
     int idx = comm->GetCommId(gpu_id);
-
     if (idx == root) {
-      auto ins = ctx.MultiInput<LoDTensor>("X");
-      for (size_t i = 0; i < ins.size(); ++i) {
-        VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. send "
-                << ins[i]->numel();
-
-        VLOG(1) << " before ncclBcast";
-        PADDLE_ENFORCE(platform::dynload::ncclBcast(
-            (void*)ins[i]->data<T>(), ins[i]->numel(), NCCLTypeWrapper<T>::type,
-            root, comm->comms().at(idx), stream));
-        VLOG(1) << " after ncclBcast";
-        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
-
-        VLOG(1) << "gpu : " << gpu_id << " finished Bcast.";
-      }
+      auto* x = ctx.Input<LoDTensor>("X");
+      VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
+      PADDLE_ENFORCE(platform::dynload::ncclBcast(
+          (void*)x->data<T>(), x->numel(), NCCLTypeWrapper<T>::type, root,
+          comm->comms().at(idx), ctx.cuda_device_context().stream()));
+      VLOG(3) << "gpu : " << gpu_id << " finished Bcast.";
     } else {
-      auto outs = ctx.MultiOutput<LoDTensor>("Out");
-      for (size_t i = 0; i < outs.size(); ++i) {
-        VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
-                << framework::product(outs[i]->dims());
-
-        PADDLE_ENFORCE(platform::dynload::ncclBcast(
-            outs[i]->mutable_data<T>(ctx.GetPlace()), outs[i]->numel(),
-            NCCLTypeWrapper<T>::type, root, comm->comms().at(idx), stream));
-        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
-
-        VLOG(1) << "gpu : " << gpu_id << " finished Bcast. recv "
-                << outs[i]->numel();
-      }
+      auto* out = ctx.Output<LoDTensor>("Out");
+      VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
+              << framework::product(out->dims());
+      PADDLE_ENFORCE(platform::dynload::ncclBcast(
+          out->mutable_data<T>(ctx.GetPlace()), out->numel(),
+          NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
+          ctx.cuda_device_context().stream()));
+      VLOG(3) << "gpu : " << gpu_id << " finished Bcast. recv " << out->numel();
     }
   }
 };