diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3a21574b855bc6bc37fefe61de98d657e712cde7..37556a37a09f883c0b38a76bfb663b378924b760 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -141,7 +141,6 @@ include(external/boost) # download boost
 include(external/any) # download libn::any
 include(external/eigen) # download eigen3
 include(external/pybind11) # download pybind11
-include(external/nccl)
 include(external/cares)
 include(external/grpc)
 
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 2a88e5a92985fab7311c1edd266cb89f7d76d867..c604fdcc7b1fe3b16fc48edf4c13445cf26feb62 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/reader.h"
+#include "paddle/operators/nccl/nccl_gpu_common.h"  // platform::Communicator
 #include "paddle/platform/place.h"
 #include "paddle/platform/profiler.h"
 
@@ -53,6 +54,8 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
     var->GetMutable<LoDTensorArray>();
   } else if (var_type == proto::VarDesc::PLACE_LIST) {
     var->GetMutable<platform::PlaceList>();
+  } else if (var_type == proto::VarDesc::NCCL_COM) {
+    var->GetMutable<platform::Communicator>();
   } else if (var_type == proto::VarDesc::READER) {
     var->GetMutable<ReaderHolder>();
   } else {
@@ -118,13 +121,12 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
 
   for (auto& op_desc : block.AllOps()) {
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
-    VLOG(4) << op->DebugStringEx(local_scope);
+    VLOG(3) << op->DebugStringEx(local_scope);
 
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     platform::RecordEvent record_event(op->Type(), pool.Get(place_));
 
     op->Run(*local_scope, place_);
-    VLOG(3) << op->DebugStringEx(local_scope);
     if (FLAGS_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
               << memory::memory_usage(place_);
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index d7be1a7352da56e411396614e33919bb55bc3b0f..1e3db1a3bab192d51757040f882a3a3ead1a4f3f 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -129,6 +129,7 @@ message VarDesc {
     LOD_TENSOR_ARRAY = 7;
     PLACE_LIST = 8;
     READER = 9;
+    NCCL_COM = 10;
   }
   required string name = 1;
   required VarType type = 2;
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc
index 9d51153b0631b988c9297f395672be67e18ee3f9..83ac67f353dcd52eb7f63d8a72e3f166d1709882 100644
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -31,8 +31,13 @@ class NCCLInitOp : public framework::OperatorBase {
     const auto &name = Output("Communicator");
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
                             "Can not find variable '%s' in the scope.", name);
-    std::vector<int> gpus = Attr<std::vector<int>>("gpus");
-    PADDLE_ENFORCE(!gpus.empty(), "Attr(gpus) should not be empty.");
+
+    int count = platform::GetCUDADeviceCount();
+    std::vector<int> gpus(count);
+    for (int i = 0; i < count; ++i) {
+      gpus[i] = i;
+    }
+    PADDLE_ENFORCE(!gpus.empty(), "NCCL init with 0 gpus.");
 
     if (scope.FindVar(name) == nullptr) {
       PADDLE_THROW("Output(Communicator) is needed for ncclInit operator.");
@@ -50,11 +55,6 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddOutput("Communicator",
               "Create Communicator for communicating between gpus");
-    AddAttr<std::vector<int>>("gpus", "(vector) GPU id lists");
-    AddAttr<int>("dtype",
-                 "(int, default 5 (FP32)) "
-                 "Output data type")
-        .SetDefault(framework::proto::DataType::FP32);
     AddComment(R"DOC(
 NCCLInit Operator.
 
@@ -77,7 +77,7 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
         ctx->HasInput("Communicator"),
         " Input(Communicator) of AllReduce op input should not be NULL");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Input(X) of AllReduce op input should not be NULL");
+                   " Output(Out) of AllReduce op output should not be NULL");
 
     auto x_dims = ctx->GetInputsDim("X");
 
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 5ce4b3de39d93e1935c6349ae446dec11d2fa986..b91fd4cf5410b84a116ad15c3472cab614548c85 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,5 +1,5 @@
 if(WITH_GPU)
-  cc_library(enforce SRCS enforce.cc DEPS nccl)
+  cc_library(enforce SRCS enforce.cc DEPS)
 else()
   cc_library(enforce SRCS enforce.cc)
 endif()
diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt
index cf2081b434961c17c1b65509909699788d2b9ad9..264b4ebf2c06d9e688a32a223dff3ec079333fd9 100644
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
@@ -1,4 +1,4 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc
-           DEPS dynamic_loader nccl)
+           DEPS dynamic_loader)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
index 0a92e10927caf00be60fdd8107600b4033cf09ea..02aeae8b3d2bdb9a9ec851cd4dbdbec6e3d867f6 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -241,7 +241,8 @@ void BindVarDsec(py::module &m) {
       .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE)
      .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY)
       .value("PLACE_LIST", proto::VarDesc::PLACE_LIST)
-      .value("READER", proto::VarDesc::READER);
+      .value("READER", proto::VarDesc::READER)
+      .value("NCCL_COM", proto::VarDesc::NCCL_COM);
 }
 
 void BindOpDesc(py::module &m) {
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index ba496db5f834efe767bfe446a46877932faa81a0..26ecb128eb82a69c03484846e1aab3fa5ef914de 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -34,6 +34,7 @@ function cmake_gen() {
     Configuring cmake in /paddle/build ...
         -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
         ${PYTHON_FLAGS}
+        -DWITH_DSO=ON
         -DWITH_DOC=OFF
         -DWITH_GPU=${WITH_GPU:-OFF}
         -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
@@ -57,6 +58,7 @@ EOF
     cmake .. \
         -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
         ${PYTHON_FLAGS} \
+        -DWITH_DSO=ON \
        -DWITH_DOC=OFF \
         -DWITH_GPU=${WITH_GPU:-OFF} \
         -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
@@ -173,7 +175,7 @@ EOF
     if [[ ${WITH_GPU} == "ON" ]]; then
         NCCL_DEPS="apt-get install -y libnccl-dev &&"
     else
-        NCCL_DEPS=""
+        NCCL_DEPS=""
     fi
 
     cat >> /paddle/build/Dockerfile <<EOF