diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3a21574b855bc6bc37fefe61de98d657e712cde7..37556a37a09f883c0b38a76bfb663b378924b760 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -141,7 +141,6 @@ include(external/boost) # download boost
 include(external/any) # download libn::any
 include(external/eigen) # download eigen3
 include(external/pybind11) # download pybind11
-include(external/nccl)
 include(external/cares)
 include(external/grpc)
 
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 2a88e5a92985fab7311c1edd266cb89f7d76d867..c604fdcc7b1fe3b16fc48edf4c13445cf26feb62 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/reader.h"
+#include "paddle/operators/nccl/nccl_gpu_common.h"  // platform::Communicator
 #include "paddle/platform/place.h"
 #include "paddle/platform/profiler.h"
 
@@ -53,6 +54,8 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
     var->GetMutable<LoDTensorArray>();
   } else if (var_type == proto::VarDesc::PLACE_LIST) {
     var->GetMutable<platform::PlaceList>();
+  } else if (var_type == proto::VarDesc::NCCL_COM) {
+    var->GetMutable<platform::Communicator>();
   } else if (var_type == proto::VarDesc::READER) {
     var->GetMutable<ReaderHolder>();
   } else {
@@ -118,13 +121,12 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
 
   for (auto& op_desc : block.AllOps()) {
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
-    VLOG(4) << op->DebugStringEx(local_scope);
+    VLOG(3) << op->DebugStringEx(local_scope);
 
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     platform::RecordEvent record_event(op->Type(), pool.Get(place_));
 
     op->Run(*local_scope, place_);
-    VLOG(3) << op->DebugStringEx(local_scope);
     if (FLAGS_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
               << memory::memory_usage(place_);
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index d7be1a7352da56e411396614e33919bb55bc3b0f..1e3db1a3bab192d51757040f882a3a3ead1a4f3f 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -129,6 +129,7 @@ message VarDesc {
     LOD_TENSOR_ARRAY = 7;
     PLACE_LIST = 8;
     READER = 9;
+    NCCL_COM = 10;
   }
   required string name = 1;
   required VarType type = 2;
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc
index 9d51153b0631b988c9297f395672be67e18ee3f9..83ac67f353dcd52eb7f63d8a72e3f166d1709882 100644
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -31,8 +31,13 @@ class NCCLInitOp : public framework::OperatorBase {
     const auto &name = Output("Communicator");
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
                             "Can not find variable '%s' in the scope.", name);
-    std::vector<int> gpus = Attr<std::vector<int>>("gpus");
-    PADDLE_ENFORCE(!gpus.empty(), "Attr(gpus) should not be empty.");
+
+    int count = platform::GetCUDADeviceCount();
+    std::vector<int> gpus(count);
+    for (int i = 0; i < count; ++i) {
+      gpus[i] = i;
+    }
+    PADDLE_ENFORCE(!gpus.empty(), "NCCL init with 0 gpus.");
 
     if (scope.FindVar(name) == nullptr) {
       PADDLE_THROW("Output(Communicator) is needed for ncclInit operator.");
@@ -50,11 +55,6 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddOutput("Communicator",
               "Create Communicator for communicating between gpus");
-    AddAttr<std::vector<int>>("gpus", "(vector) GPU id lists");
-    AddAttr<int>("dtype",
-                 "(int, default 5 (FP32)) "
-                 "Output data type")
-        .SetDefault(framework::proto::DataType::FP32);
     AddComment(R"DOC(
 NCCLInit Operator.
 
@@ -77,7 +77,7 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
         ctx->HasInput("Communicator"),
         " Input(Communicator) of AllReduce op input should not be NULL");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Input(X) of AllReduce op input should not be NULL");
+                   " Output(Out) of AllReduce op output should not be NULL");
 
     auto x_dims = ctx->GetInputsDim("X");
 
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 5ce4b3de39d93e1935c6349ae446dec11d2fa986..b91fd4cf5410b84a116ad15c3472cab614548c85 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,5 +1,5 @@
 if(WITH_GPU)
-  cc_library(enforce SRCS enforce.cc DEPS nccl)
+  cc_library(enforce SRCS enforce.cc DEPS)
 else()
   cc_library(enforce SRCS enforce.cc)
 endif()
diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt
index cf2081b434961c17c1b65509909699788d2b9ad9..264b4ebf2c06d9e688a32a223dff3ec079333fd9 100644
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
@@ -1,4 +1,4 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc
-           DEPS dynamic_loader nccl)
+           DEPS dynamic_loader)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
index 0a92e10927caf00be60fdd8107600b4033cf09ea..02aeae8b3d2bdb9a9ec851cd4dbdbec6e3d867f6 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -241,7 +241,8 @@ void BindVarDsec(py::module &m) {
       .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE)
      .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY)
       .value("PLACE_LIST", proto::VarDesc::PLACE_LIST)
-      .value("READER", proto::VarDesc::READER);
+      .value("READER", proto::VarDesc::READER)
+      .value("NCCL_COM", proto::VarDesc::NCCL_COM);
 }
 
 void BindOpDesc(py::module &m) {
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index ba496db5f834efe767bfe446a46877932faa81a0..26ecb128eb82a69c03484846e1aab3fa5ef914de 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -34,6 +34,7 @@ function cmake_gen() {
     Configuring cmake in /paddle/build ...
         -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
         ${PYTHON_FLAGS}
+        -DWITH_DSO=ON
         -DWITH_DOC=OFF
         -DWITH_GPU=${WITH_GPU:-OFF}
         -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
@@ -57,6 +58,7 @@ EOF
     cmake .. \
         -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
         ${PYTHON_FLAGS} \
+        -DWITH_DSO=ON \
        -DWITH_DOC=OFF \
         -DWITH_GPU=${WITH_GPU:-OFF} \
         -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
@@ -173,7 +175,7 @@ EOF
     if [[ ${WITH_GPU} == "ON" ]]; then
         NCCL_DEPS="apt-get install -y libnccl-dev &&"
     else
-        NCCL_DEPS=""
+        NCCL_DEPS=""
     fi
 
     cat >> /paddle/build/Dockerfile <<EOF