add nccl

672cdc21 · Yang Yang · 1c91574b · 672cdc21 · 672cdc21 · 672cdc21
8 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -141,7 +141,6 @@ include(external/boost)     # download boost
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
-include(external/nccl)
 include(external/cares)
 include(external/grpc)

--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/reader.h"
+#include "paddle/operators/nccl/nccl_gpu_common.h"  // platform::Communicator
 #include "paddle/platform/place.h"
 #include "paddle/platform/profiler.h"
@@ -53,6 +54,8 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
    var->GetMutable<LoDTensorArray>();
  } else if (var_type == proto::VarDesc::PLACE_LIST) {
    var->GetMutable<platform::PlaceList>();
+  } else if (var_type == proto::VarDesc::NCCL_COM) {
+    var->GetMutable<platform::Communicator>();
  } else if (var_type == proto::VarDesc::READER) {
    var->GetMutable<ReaderHolder>();
  } else {
@@ -118,13 +121,12 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
  for (auto& op_desc : block.AllOps()) {
    auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
-    VLOG(4) << op->DebugStringEx(local_scope);
+    VLOG(3) << op->DebugStringEx(local_scope);
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    platform::RecordEvent record_event(op->Type(), pool.Get(place_));
    op->Run(*local_scope, place_);
-    VLOG(3) << op->DebugStringEx(local_scope);
    if (FLAGS_benchmark) {
      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
              << memory::memory_usage(place_);

--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -129,6 +129,7 @@ message VarDesc {
    LOD_TENSOR_ARRAY = 7;
    PLACE_LIST = 8;
    READER = 9;
+    NCCL_COM = 10;
  }
  required string name = 1;
  required VarType type = 2;

--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -31,8 +31,13 @@ class NCCLInitOp : public framework::OperatorBase {
    const auto &name = Output("Communicator");
    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
                            "Can not find variable '%s' in the scope.", name);
-    std::vector<int> gpus = Attr<std::vector<int>>("gpus");
-    PADDLE_ENFORCE(!gpus.empty(), "Attr(gpus) should not be empty.");
+    int count = platform::GetCUDADeviceCount();
+    std::vector<int> gpus(count);
+    for (int i = 0; i < count; ++i) {
+      gpus[i] = i;
+    }
+    PADDLE_ENFORCE(!gpus.empty(), "NCCL init with 0 gpus.");
    if (scope.FindVar(name) == nullptr) {
      PADDLE_THROW("Output(Communicator) is needed for ncclInit operator.");
@@ -50,11 +55,6 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddOutput("Communicator",
              "Create Communicator for communicating between gpus");
-    AddAttr<std::vector<int>>("gpus", "(vector<int>) GPU id lists");
-    AddAttr<int>("dtype",
-                 "(int, default 5 (FP32)) "
-                 "Output data type")
-        .SetDefault(framework::proto::DataType::FP32);
    AddComment(R"DOC(
 NCCLInit Operator.
@@ -77,7 +77,7 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
        ctx->HasInput("Communicator"),
        " Input(Communicator) of AllReduce op input should not be NULL");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Input(X) of AllReduce op input should not be NULL");
+                   " Output(Out) of AllReduce op output should not be NULL");
    auto x_dims = ctx->GetInputsDim("X");

--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
 if(WITH_GPU)
-  cc_library(enforce SRCS enforce.cc DEPS nccl)
+  cc_library(enforce SRCS enforce.cc DEPS)
 else()
  cc_library(enforce SRCS enforce.cc)
 endif()

--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc
-        DEPS dynamic_loader nccl)
+        DEPS dynamic_loader)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -241,7 +241,8 @@ void BindVarDsec(py::module &m) {
      .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE)
      .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY)
      .value("PLACE_LIST", proto::VarDesc::PLACE_LIST)
-      .value("READER", proto::VarDesc::READER);
+      .value("READER", proto::VarDesc::READER)
+      .value("NCCL_COM", proto::VarDesc::NCCL_COM);
 }
 void BindOpDesc(py::module &m) {

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -34,6 +34,7 @@ function cmake_gen() {
    Configuring cmake in /paddle/build ...
        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
        ${PYTHON_FLAGS}
+        -DWITH_DSO=ON
        -DWITH_DOC=OFF
        -DWITH_GPU=${WITH_GPU:-OFF}
        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
@@ -57,6 +58,7 @@ EOF
    cmake .. \
        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
        ${PYTHON_FLAGS} \
+        -DWITH_DSO=ON \
        -DWITH_DOC=OFF \
        -DWITH_GPU=${WITH_GPU:-OFF} \
        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
@@ -189,6 +191,7 @@ EOF
        ldconfig
    ${DOCKERFILE_CUDNN_DSO}
    ${DOCKERFILE_GPU_ENV}
+    ENV NCCL_LAUNCH_MODE PARALLEL
    ADD go/cmd/pserver/pserver /usr/bin/
    ADD go/cmd/master/master /usr/bin/
    # default command shows the paddle version and exit