Commit 672cdc21 authored by Yang Yang

add nccl

Parent 1c91574b
@@ -141,7 +141,6 @@ include(external/boost) # download boost
 include(external/any)      # download libn::any
 include(external/eigen)    # download eigen3
 include(external/pybind11) # download pybind11
-include(external/nccl)
 include(external/cares)
 include(external/grpc)
......
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/reader.h"
+#include "paddle/operators/nccl/nccl_gpu_common.h"  // platform::Communicator
 #include "paddle/platform/place.h"
 #include "paddle/platform/profiler.h"
@@ -53,6 +54,8 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
     var->GetMutable<LoDTensorArray>();
   } else if (var_type == proto::VarDesc::PLACE_LIST) {
     var->GetMutable<platform::PlaceList>();
+  } else if (var_type == proto::VarDesc::NCCL_COM) {
+    var->GetMutable<platform::Communicator>();
   } else if (var_type == proto::VarDesc::READER) {
     var->GetMutable<ReaderHolder>();
   } else {
@@ -118,13 +121,12 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
   for (auto& op_desc : block.AllOps()) {
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
-    VLOG(4) << op->DebugStringEx(local_scope);
+    VLOG(3) << op->DebugStringEx(local_scope);
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     platform::RecordEvent record_event(op->Type(), pool.Get(place_));
     op->Run(*local_scope, place_);
-    VLOG(3) << op->DebugStringEx(local_scope);
     if (FLAGS_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
               << memory::memory_usage(place_);
......
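With the new NCCL_COM branch in CreateTensor, the executor materializes a platform::Communicator variable like any other scope variable. A minimal sketch of how an operator could then fetch it from the scope; the variable name "nccl_com" and the surrounding code are illustrative, not part of this commit:

    // Assumes a variable of type NCCL_COM was declared in the ProgramDesc
    // and created by Executor::CreateTensor above.
    framework::Variable* var = scope.FindVar("nccl_com");  // hypothetical name
    PADDLE_ENFORCE_NOT_NULL(var, "communicator variable must exist");
    // GetMutable default-constructs the Communicator on first access.
    platform::Communicator* comm = var->GetMutable<platform::Communicator>();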
@@ -129,6 +129,7 @@ message VarDesc {
     LOD_TENSOR_ARRAY = 7;
     PLACE_LIST = 8;
     READER = 9;
+    NCCL_COM = 10;
   }
   required string name = 1;
   required VarType type = 2;
......
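The new enum value lets the program description itself declare a communicator variable. A hedged sketch of tagging a VarDesc with it from C++, using the protobuf-generated setters; the variable name is illustrative:

    framework::proto::VarDesc var_desc;
    var_desc.set_name("nccl_com");  // hypothetical name
    var_desc.set_type(framework::proto::VarDesc::NCCL_COM);
    // Executor::CreateTensor dispatches on this type, as in the hunk above.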
@@ -31,8 +31,13 @@ class NCCLInitOp : public framework::OperatorBase {
     const auto &name = Output("Communicator");
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
                             "Can not find variable '%s' in the scope.", name);
-    std::vector<int> gpus = Attr<std::vector<int>>("gpus");
-    PADDLE_ENFORCE(!gpus.empty(), "Attr(gpus) should not be empty.");
+    int count = platform::GetCUDADeviceCount();
+    std::vector<int> gpus(count);
+    for (int i = 0; i < count; ++i) {
+      gpus[i] = i;
+    }
+    PADDLE_ENFORCE(!gpus.empty(), "NCCL init with 0 gpus.");
     if (scope.FindVar(name) == nullptr) {
       PADDLE_THROW("Output(Communicator) is needed for ncclInit operator.");
@@ -50,11 +55,6 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddOutput("Communicator",
               "Create Communicator for communicating between gpus");
-    AddAttr<std::vector<int>>("gpus", "(vector<int>) GPU id lists");
-    AddAttr<int>("dtype",
-                 "(int, default 5 (FP32)) "
-                 "Output data type")
-        .SetDefault(framework::proto::DataType::FP32);
     AddComment(R"DOC(
 NCCLInit Operator.
@@ -77,7 +77,7 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
         ctx->HasInput("Communicator"),
         " Input(Communicator) of AllReduce op input should not be NULL");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Input(X) of AllReduce op input should not be NULL");
+                   " Output(Out) of AllReduce op output should not be NULL");
     auto x_dims = ctx->GetInputsDim("X");
......
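After this change NCCLInitOp always spans every visible device instead of a user-supplied "gpus" attribute, which is why the attribute and its maker entry are deleted above. A minimal sketch of the single-process communicator setup this implies, assuming NCCL's standard C API; InitAll here is a hypothetical helper, not Paddle's actual Communicator interface:

    #include <nccl.h>
    #include <vector>

    // Create one communicator per device for gpus = {0, 1, ..., count-1}.
    // ncclCommInitAll builds the whole clique in a single call, which is
    // what a one-process, many-GPUs executor needs.
    std::vector<ncclComm_t> InitAll(const std::vector<int>& gpus) {
      std::vector<ncclComm_t> comms(gpus.size());
      ncclCommInitAll(comms.data(), static_cast<int>(gpus.size()), gpus.data());
      return comms;
    }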
 if(WITH_GPU)
-  cc_library(enforce SRCS enforce.cc DEPS nccl)
+  cc_library(enforce SRCS enforce.cc DEPS)
 else()
   cc_library(enforce SRCS enforce.cc)
 endif()
......
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc
-           DEPS dynamic_loader nccl)
+           DEPS dynamic_loader)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
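Dropping nccl from DEPS is consistent with the -DWITH_DSO=ON builds below: the dynload_cuda target resolves NCCL symbols at runtime rather than linking the library directly. A hedged sketch of that pattern; the library name and helper are illustrative, not Paddle's actual dynamic_loader code:

    #include <dlfcn.h>
    #include <nccl.h>

    using ncclCommInitAll_t = decltype(&ncclCommInitAll);

    // Resolve ncclCommInitAll from a shared library found at runtime,
    // so the binary builds and starts even without NCCL installed.
    ncclCommInitAll_t LoadNcclCommInitAll() {
      void* handle = dlopen("libnccl.so", RTLD_LAZY);  // assumed soname
      if (handle == nullptr) return nullptr;
      return reinterpret_cast<ncclCommInitAll_t>(
          dlsym(handle, "ncclCommInitAll"));
    }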
@@ -241,7 +241,8 @@ void BindVarDsec(py::module &m) {
       .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE)
       .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY)
       .value("PLACE_LIST", proto::VarDesc::PLACE_LIST)
-      .value("READER", proto::VarDesc::READER);
+      .value("READER", proto::VarDesc::READER)
+      .value("NCCL_COM", proto::VarDesc::NCCL_COM);
 }
 void BindOpDesc(py::module &m) {
......
@@ -34,6 +34,7 @@ function cmake_gen() {
     Configuring cmake in /paddle/build ...
         -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
         ${PYTHON_FLAGS}
+        -DWITH_DSO=ON
         -DWITH_DOC=OFF
         -DWITH_GPU=${WITH_GPU:-OFF}
         -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
@@ -57,6 +58,7 @@ EOF
     cmake .. \
         -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
        ${PYTHON_FLAGS} \
+        -DWITH_DSO=ON \
        -DWITH_DOC=OFF \
        -DWITH_GPU=${WITH_GPU:-OFF} \
        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
@@ -173,7 +175,7 @@ EOF
     if [[ ${WITH_GPU} == "ON" ]]; then
         NCCL_DEPS="apt-get install -y libnccl-dev &&"
     else
        NCCL_DEPS=""
     fi
     cat >> /paddle/build/Dockerfile <<EOF
@@ -189,6 +191,7 @@ EOF
     ldconfig
     ${DOCKERFILE_CUDNN_DSO}
     ${DOCKERFILE_GPU_ENV}
+    ENV NCCL_LAUNCH_MODE PARALLEL
     ADD go/cmd/pserver/pserver /usr/bin/
     ADD go/cmd/master/master /usr/bin/
 # default command shows the paddle version and exit
......