From 7bc4b095008058fdde05e3e9337e31845f1ce9b5 Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 5 Feb 2020 14:01:10 +0800 Subject: [PATCH] add WITH_NCCL option for cmake. (#22384) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cmake选项中添加了WITH_NCCL,显式指定是否编译NCCL的部分代码,WITH_NCCL默认打开,但如果WITH_GPU为OFF,则关闭WITH_NCCL 添加了PADDLE_WITH_NCCL定义 单机单卡能够关闭NCCL编译,多卡的话需要默认打开NCCL,如果关闭NCCL,则只能使用单卡 Co-authored-by: 石晓伟 <39303645+Shixiaowei02@users.noreply.github.com> --- CMakeLists.txt | 22 +++++++++++++++++++ .../framework/details/all_reduce_op_handle.cc | 6 ++--- .../framework/details/all_reduce_op_handle.h | 8 +++---- .../framework/details/broadcast_op_handle.cc | 2 +- .../framework/details/broadcast_op_handle.h | 6 ++--- .../details/broadcast_op_handle_test.h | 12 +++++----- .../details/fused_all_reduce_op_handle.cc | 2 +- .../details/fused_all_reduce_op_handle.h | 4 ++-- .../details/fused_broadcast_op_handle.h | 4 ++-- .../details/fused_broadcast_op_handle_test.cc | 4 ++-- .../framework/details/reduce_op_handle.cc | 2 +- .../framework/details/reduce_op_handle.h | 4 ++-- .../fluid/framework/device_worker_factory.cc | 2 +- paddle/fluid/framework/fleet/CMakeLists.txt | 4 +++- .../all_reduce_deps_pass.cc | 2 +- .../fuse_all_reduce_op_pass.cc | 14 ++++++------ .../multi_devices_graph_pass.cc | 16 +++++++------- .../multi_devices_graph_pass.h | 2 +- paddle/fluid/framework/parallel_executor.cc | 6 ++--- paddle/fluid/framework/parallel_executor.h | 2 +- paddle/fluid/framework/pipeline_trainer.cc | 2 +- paddle/fluid/framework/section_worker.cc | 2 +- paddle/fluid/framework/trainer_factory.cc | 2 +- paddle/fluid/imperative/CMakeLists.txt | 4 +++- paddle/fluid/imperative/nccl_context.cc | 2 +- paddle/fluid/imperative/nccl_context.h | 4 ++-- paddle/fluid/imperative/tests/CMakeLists.txt | 4 +++- paddle/fluid/operators/CMakeLists.txt | 6 +++-- .../fluid/operators/collective/CMakeLists.txt | 2 +- paddle/fluid/operators/data_norm_op.cu | 4 ++-- 
.../operators/distributed_ops/CMakeLists.txt | 2 +- paddle/fluid/operators/nccl/CMakeLists.txt | 4 ++++ paddle/fluid/platform/device_context.cc | 2 +- paddle/fluid/platform/device_context.h | 6 ++--- paddle/fluid/platform/dynload/CMakeLists.txt | 5 ++++- paddle/fluid/platform/enforce_test.cc | 2 +- paddle/fluid/pybind/CMakeLists.txt | 20 ++++++++++++----- paddle/fluid/pybind/imperative.cc | 2 +- paddle/fluid/pybind/pybind.cc | 2 +- .../fluid/tests/unittests/CMakeLists.txt | 3 +++ 40 files changed, 127 insertions(+), 77 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c7a613c8e7..87fc787b74 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,6 +89,7 @@ option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VER option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}) option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF) option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) +option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) # PY_VERSION if(NOT PY_VERSION) @@ -121,6 +122,27 @@ if(WIN32) set(WITH_DISTRIBUTE OFF CACHE STRING "Disable DISTRIBUTE when compiling for Windows" FORCE) endif() + if(WITH_NCCL) + MESSAGE(WARNING + "Disable NCCL when compiling for Windows. Force WITH_NCCL=OFF.") + set(WITH_NCCL OFF CACHE STRING + "Disable NCCL when compiling for Windows" FORCE) + endif() +endif() + +if (NOT WITH_GPU AND WITH_NCCL) + MESSAGE(WARNING + "Disable NCCL when compiling without GPU. 
Force WITH_NCCL=OFF.") + set(WITH_NCCL OFF CACHE STRING + "Disable NCCL when compiling without GPU" FORCE) +endif() + +if(WITH_NCCL) + add_definitions("-DPADDLE_WITH_NCCL") +else() + if(WITH_GPU) + MESSAGE(WARNING "If the environment is multi-card, the WITH_NCCL option needs to be turned on, otherwise only a single card can be used.") + endif() endif() if(WITH_BRPC_RDMA) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 6c3b0923ed..3110e579d3 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -28,7 +28,7 @@ namespace paddle { namespace framework { namespace details { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, @@ -121,7 +121,7 @@ void AllReduceOpHandle::AllReduceFunc( const std::vector &places, const std::vector &out_var_names) { if (is_gpu_place(places[0])) { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr."); ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype); std::vector> all_reduce_calls; @@ -161,7 +161,7 @@ void AllReduceOpHandle::AllReduceFunc( VLOG(10) << Name() << " size:" << numel * SizeOfType(dtype); } -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) void AllReduceOpHandle::NCCLAllReduceFunc( const std::vector> &all_reduce_calls) { this->RunAndRecordEvent([&] { diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index c8ff151a88..36f5d3adfa 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -20,7 +20,7 @@ #include 
"paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -29,7 +29,7 @@ namespace paddle { namespace framework { namespace details { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) class AllReduceOpHandle : public NCCLOpHandleBase { public: AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, @@ -54,13 +54,13 @@ class AllReduceOpHandle : public OpHandleBase { std::vector local_scopes_; -#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) +#ifndef PADDLE_WITH_NCCL // NCCLOpHandleBase already have these attributes. // Will polish it by class inheritance framework. std::vector places_; #endif -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) void NCCLAllReduceFunc( const std::vector> &all_reduce_calls); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 3637625f7e..b3c79b63ce 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -73,7 +73,7 @@ void BroadcastOpHandle::BroadcastOneVar( }); } } else { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) VarHandle *out_handle = nullptr; int root_id = boost::get(in_tensor.place()).device; std::vector> broadcast_calls; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 45ccbb41e0..588248d645 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -24,7 +24,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include 
"paddle/fluid/platform/device_context.h" -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -34,7 +34,7 @@ namespace details { struct BroadcastOpHandle : public OpHandleBase { public: -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) BroadcastOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *nccl_ctxs) @@ -70,7 +70,7 @@ struct BroadcastOpHandle : public OpHandleBase { std::vector local_scopes_; std::vector places_; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) const platform::NCCLContextMap *nccl_ctxs_; #endif diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index abc3f39e68..6d14c7e4e7 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -44,7 +44,7 @@ struct TestBroadcastOpHandle { std::vector> nodes_; std::vector place_list_; bool use_gpu_; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) std::unique_ptr nccl_ctxs_; #endif @@ -52,7 +52,7 @@ struct TestBroadcastOpHandle { for (size_t j = 0; j < ctxs_.size(); ++j) { ctxs_[j]->Wait(); } -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) if (nccl_ctxs_) { nccl_ctxs_->WaitAll(); } @@ -62,7 +62,7 @@ struct TestBroadcastOpHandle { void InitCtxOnGpu(bool use_gpu) { use_gpu_ = use_gpu; if (use_gpu_) { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) int count = p::GetCUDADeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " @@ -86,7 +86,7 @@ struct TestBroadcastOpHandle { place_list_.push_back(p); ctxs_.emplace_back(new p::CPUDeviceContext(p)); } -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) 
+#if defined(PADDLE_WITH_NCCL) nccl_ctxs_.reset(nullptr); #endif } @@ -107,14 +107,14 @@ struct TestBroadcastOpHandle { nodes_.emplace_back( ir::CreateNodeForTest("node0", ir::Node::Type::kOperation)); if (use_gpu_) { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else PADDLE_THROW("CUDA is not support."); #endif } else { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index ddd6d10e5e..c67e21d5c4 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -28,7 +28,7 @@ namespace details { typedef std::vector>> GradientAndLoDTensor; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) FusedAllReduceOpHandle::FusedAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, const std::vector &places, const size_t num_of_all_reduce, diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h index f6a11c4e50..16c13ac1c0 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -21,7 +21,7 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -30,7 +30,7 @@ namespace paddle { namespace 
framework { namespace details { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) struct FusedAllReduceOpHandle : public AllReduceOpHandle { FusedAllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h index e43d545c9c..8a59d2bfa9 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h @@ -25,7 +25,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -35,7 +35,7 @@ namespace details { struct FusedBroadcastOpHandle : public BroadcastOpHandle { public: -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) FusedBroadcastOpHandle(ir::Node *node, const std::vector local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc index 49404509a6..cbded074f2 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc @@ -45,14 +45,14 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { nodes_.emplace_back( ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation)); if (use_gpu_) { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) op_handle_ = new FusedBroadcastOpHandle( nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else PADDLE_THROW("CUDA is not supported."); #endif } else { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) op_handle_ = new FusedBroadcastOpHandle( nodes_.back().get(), 
local_scopes_, place_list_, nccl_ctxs_.get()); #else diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index f524571367..80c0cda505 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -264,7 +264,7 @@ void ReduceOpHandle::RunImpl() { } }); } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) auto pre_in = pre_in_var->Get(); VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var); VariableVisitor::GetMutableTensor(out_var).mutable_data( diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index 15064a108e..8b92bdef47 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -24,7 +24,7 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -62,7 +62,7 @@ struct ReduceOpHandle : public OpHandleBase { std::vector local_scopes_; std::vector places_; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) const platform::NCCLContextMap *nccl_ctxs_; ReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index dc85941f57..e163b601d9 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -61,7 +61,7 @@ std::shared_ptr DeviceWorkerFactory::CreateDeviceWorker( REGISTER_DEVICE_WORKER_CLASS(HogwildWorker); REGISTER_DEVICE_WORKER_CLASS(DownpourWorker); -#if 
defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) REGISTER_DEVICE_WORKER_CLASS(SectionWorker); #endif } // namespace framework diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 1b031ab315..78e9cb10d5 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -4,7 +4,9 @@ else() cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) endif(WITH_PSLIB) -cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope) +if(WITH_NCCL) + cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope) +endif() if(WITH_BOX_PS) cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor box_ps) else() diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc index 4c00599680..8923dfc323 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc @@ -39,7 +39,7 @@ class AllReduceDepsPass : public ir::Pass { std::vector all_reduce_op_handles = GetSortedAllReduceOps(*graph); -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) auto use_hierarchical_allreduce = Get(details::kUseHierarchicalAllReduce); for (size_t i = 0; i < all_reduce_op_handles.size(); ++i) { diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc index 9b7bd8cc82..86fbbaf772 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc @@ -38,7 +38,7 @@ class FuseAllReduceOpPass : public ir::Pass { auto &places = 
Get>(details::kPlaces); auto &local_scopes = Get>(details::kLocalScopes); -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) auto *multi_nccl_ctxs = &Get(details::kNCCLCtxs); #endif @@ -85,7 +85,7 @@ class FuseAllReduceOpPass : public ir::Pass { for (auto &p_g : group_p_g) { group_all_reduce_ops.emplace_back(all_reduce_ops.at(p_g.second)); } -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) InsertFusedAllReduce(places, local_scopes, group_size, group_all_reduce_ops, multi_nccl_ctxs, &result); #else @@ -134,7 +134,7 @@ class FuseAllReduceOpPass : public ir::Pass { const std::vector &local_scopes, const size_t num_of_all_reduce, const std::vector &all_reduce_ops, -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) const platform::NCCLCommunicator *multi_nccl_ctxs, #endif ir::Graph *result) const { @@ -161,7 +161,7 @@ class FuseAllReduceOpPass : public ir::Pass { result->RemoveNode(op_handle.Node()); } -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places, local_scopes, multi_nccl_ctxs, result); #else @@ -177,11 +177,11 @@ class FuseAllReduceOpPass : public ir::Pass { const size_t num_of_all_reduce, const std::vector &places, const std::vector &local_scopes, -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) const platform::NCCLCommunicator *multi_nccl_ctxs, #endif ir::Graph *result) const { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) auto *op_handle = new details::FusedAllReduceOpHandle( result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation), local_scopes, places, num_of_all_reduce, multi_nccl_ctxs); @@ -199,7 +199,7 @@ class FuseAllReduceOpPass : public ir::Pass { op_handle->AddOutput(out); } -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) if (!multi_nccl_ctxs) { 
SetCommunicationContext(places, op_handle); } diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc index 7a04b44910..79b5099355 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -156,7 +156,7 @@ void MultiDevSSAGraphBuilderBase::Init() const { places_ = Get>(details::kPlaces); local_scopes_ = Get>(details::kLocalScopes); strategy_ = Get(kStrategy); -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) multi_nccl_ctxs_ = &Get(details::kNCCLCtxs); nccl_ctxs_ = nullptr; if (multi_nccl_ctxs_) { @@ -298,7 +298,7 @@ std::vector MultiDevSSAGraphBuilderBase::SortOperations( bool MultiDevSSAGraphBuilderBase::UseGPU() const { bool use_gpu = false; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) use_gpu = nccl_ctxs_ != nullptr; #endif return use_gpu; @@ -348,7 +348,7 @@ void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result, void MultiDevSSAGraphBuilderBase::SetCommunicationContext( details::OpHandleBase *op_handle, const platform::Place &p) const { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) if (nccl_ctxs_ == nullptr) { op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); @@ -362,7 +362,7 @@ void MultiDevSSAGraphBuilderBase::SetCommunicationContext( void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) auto *op_handle = new details::BroadcastOpHandle( result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_); @@ -395,7 +395,7 @@ void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph 
*result, void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) auto *op_handle = new details::FusedBroadcastOpHandle( result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_); @@ -451,7 +451,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, auto append_allreduce_op = [&]( const std::vector &scopes, const std::vector &places) -> details::OpHandleBase * { -#if defined(PADDLE_WITH_DGC) +#if defined(PADDLE_WITH_DGC) && defined(PADDLE_WITH_NCCL) if (is_encoded) { result->Get(kGraphOps).emplace_back( new details::SparseAllReduceOpHandle( @@ -464,7 +464,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), scopes, places, multi_nccl_ctxs_)); } -#elif defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#elif defined(PADDLE_WITH_NCCL) result->Get(kGraphOps).emplace_back( new details::AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -539,7 +539,7 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOps( details::VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp( ir::Graph *result, const std::string &og, size_t dst_dev_id) const { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) result->Get(kGraphOps).emplace_back(new details::ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h index ea0455b6a8..719df016d9 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h +++ 
b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h @@ -94,7 +94,7 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, size_t device_id) const; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) mutable platform::NCCLContextMap *nccl_ctxs_{nullptr}; mutable platform::NCCLCommunicator *multi_nccl_ctxs_{nullptr}; #endif diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 7dc13bae4b..7958d3f99c 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -109,7 +109,7 @@ class ParallelExecutorPrivate { } } -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) { VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_ << ", num_trainers:" << bst.num_trainers_ @@ -473,7 +473,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, } if (member_->use_cuda_ && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_); // Initialize device context's nccl comm, will be used by normal @@ -652,7 +652,7 @@ void ParallelExecutor::BCastParamsToDevices( } auto &dims = main_tensor.dims(); if (paddle::platform::is_gpu_place(main_tensor.place())) { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) std::vector buffers; buffers.reserve(member_->places_.size()); size_t numel = main_tensor.numel(); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 612a58de07..56aeb21531 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -32,7 +32,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 3a851f0920..88c1d83ff8 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index c1a404c1cb..cd5204b490 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/message.h" #include "google/protobuf/text_format.h" diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index ce0eb5ec30..23cfa11d4c 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -63,7 +63,7 @@ std::shared_ptr TrainerFactory::CreateTrainer( REGISTER_TRAINER_CLASS(MultiTrainer); REGISTER_TRAINER_CLASS(DistMultiTrainer); -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) REGISTER_TRAINER_CLASS(PipelineTrainer); #endif } // namespace framework diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 4250ba20ea..3b09c8402b 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -9,7 +9,9 @@ cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer) cc_library(engine SRCS engine.cc DEPS layer gradient_accumulator) cc_library(imperative_profiler SRCS profiler.cc) if(NOT WIN32) - cc_library(nccl_context SRCS nccl_context.cc DEPS device_context) + if(WITH_NCCL) + cc_library(nccl_context SRCS nccl_context.cc DEPS device_context) + endif() cc_library(data_loader SRCS data_loader.cc DEPS enforce) endif(NOT WIN32) diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index ab612b2f15..3c3634b0bd 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -16,7 +16,7 @@ namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) void NCCLParallelContext::RecvNCCLID(const std::string &ep, ncclUniqueId *nccl_id) { auto addr = paddle::string::Split(ep, ':'); diff --git a/paddle/fluid/imperative/nccl_context.h 
b/paddle/fluid/imperative/nccl_context.h index b4f44e5640..ac36ed77b4 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -26,7 +26,7 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/dynload/nccl.h" #endif #include "paddle/fluid/platform/place.h" @@ -57,7 +57,7 @@ class ParallelContext { platform::Place place_; }; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) class NCCLParallelContext : ParallelContext { public: explicit NCCLParallelContext(const ParallelStrategy& strategy, diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index a128a774d0..e3c82474e0 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -1,7 +1,9 @@ if(WIN32) cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS device_context) else() - cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) + if (WITH_NCCL) + cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) + endif() endif(WIN32) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 675466bfb8..9ad9227610 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -69,8 +69,10 @@ if (WITH_GPU) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() if (NOT WIN32) - op_library(sync_batch_norm_op) - file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n") + if (WITH_NCCL) + op_library(sync_batch_norm_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n") + endif() endif() else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) diff --git a/paddle/fluid/operators/collective/CMakeLists.txt 
b/paddle/fluid/operators/collective/CMakeLists.txt index c3b3f60eaf..3f9423ae5c 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -30,7 +30,7 @@ endforeach() register_operators(EXCLUDES c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) -if(WITH_GPU AND NOT WIN32) +if(WITH_NCCL) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper) op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} nccl_common) endif() diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index 643e7bced5..9afb5b9908 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/data_norm_op.h" #include "paddle/fluid/platform/cuda_primitives.h" -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -178,7 +178,7 @@ class DataNormGradKernel d_batch_sum, d_batch_square_sum); if (need_sync_stats) { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) auto comm = platform::NCCLCommContext::Instance().Get(0, ctx.GetPlace()); PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_size), diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index d1a9c65c9c..79f14d75d2 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -30,7 +30,7 @@ endforeach() register_operators(EXCLUDES gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS}) -if(WITH_GPU AND NOT WIN32) +if(WITH_NCCL) set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} nccl_common) op_library(gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS} nccl_common) endif() diff --git 
a/paddle/fluid/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt index 9b26e19cc7..4f1fe372f5 100644 --- a/paddle/fluid/operators/nccl/CMakeLists.txt +++ b/paddle/fluid/operators/nccl/CMakeLists.txt @@ -1,3 +1,7 @@ +if (NOT WITH_NCCL) + return() +endif() + if(WITH_GPU AND NOT WIN32) nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator ) endif() diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 62c83e0c4e..322a327967 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -305,7 +305,7 @@ CUDADeviceContext::~CUDADeviceContext() { PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_), "Failed to destory Cudnn handle"); } -#if !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) if (nccl_comm_) { PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 50e4538987..d6b8cda94f 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -22,7 +22,7 @@ limitations under the License. */ #include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" -#if !defined(__APPLE__) && !defined(_WIN32) +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/dynload/nccl.h" #endif #include "paddle/fluid/platform/gpu_info.h" @@ -144,7 +144,7 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return cuda stream in the device context. */ cudaStream_t stream() const; -#if !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) /*! \brief Return nccl communicators. 
*/ ncclComm_t nccl_comm() const { return nccl_comm_; } @@ -180,7 +180,7 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr cublas_handle_; std::unique_ptr cublas_tensor_core_handle_; -#if !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) // NCCL communicator (single process version) for NCCL collective operations. // NCCL collective operations provides fast collectives over multiple GPUs // both within and across nodes. diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 81312111ae..04597830a8 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -5,7 +5,10 @@ list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc) # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on MacOS and Windows, and only do a early test on Linux. if (NOT APPLE AND NOT WIN32) - list(APPEND CUDA_SRCS nccl.cc nvrtc.cc cuda_driver.cc) + list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc) + if (WITH_NCCL) + list(APPEND CUDA_SRCS nccl.cc) + endif() endif() if (TENSORRT_FOUND) diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 3e0f046cef..f1f24bb920 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -292,7 +292,7 @@ TEST(enforce, cuda_success) { EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS)); EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED)); EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE)); -#if !defined(__APPLE__) && !defined(_WIN32) +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError)); EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError)); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 171d7752a7..320951debb 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ 
b/paddle/fluid/pybind/CMakeLists.txt @@ -1,10 +1,17 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper nccl_wrapper prune +set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper) +if (WITH_NCCL) + set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper) +endif() + if(NOT WIN32) - set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context data_loader) + set(PYBIND_DEPS ${PYBIND_DEPS} data_loader) + if (WITH_NCCL) + set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context) + endif() endif(NOT WIN32) if(WITH_PYTHON) @@ -25,7 +32,6 @@ set(PYBIND_SRCS fleet_wrapper_py.cc gloo_wrapper_py.cc box_helper_py.cc - nccl_wrapper_py.cc data_set_py.cc imperative.cc ir.cc @@ -35,15 +41,19 @@ if (WITH_DISTRIBUTE) list(APPEND PYBIND_SRCS communicator_py.cc) endif() +if (WITH_NCCL) + list(APPEND PYBIND_SRCS nccl_wrapper_py.cc) +endif() + if(WITH_PYTHON) # generate op pybind functions automatically for dygraph. 
set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB}) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS}) - if(NOT WIN32) + if(WITH_NCCL) list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context) - endif(NOT WIN32) + endif(WITH_NCCL) add_executable(op_function_generator op_function_generator.cc) target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 84b6ff267e..dc1a104183 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -611,7 +611,7 @@ void BindImperative(py::module *m_ptr) { }, [](imperative::ParallelStrategy &self, const std::string &ep) { self.current_endpoint_ = ep; }); -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_NCCL) py::class_ nccl_ctx(m, "NCCLParallelContext"); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 90482d36cf..837088b4f8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2209,7 +2209,7 @@ All parameter, weight, gradient are variables in Paddle. 
BindFleetWrapper(&m); BindGlooWrapper(&m); BindBoxHelper(&m); -#ifndef _WIN32 +#ifdef PADDLE_WITH_NCCL BindNCCLWrapper(&m); #endif BindGraph(&m); diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index e7ddc7a7b8..ed3b2469b4 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -4,6 +4,9 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FL set(dist_ENVS http_proxy="" https_proxy="") file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py") +if(NOT WITH_NCCL) + list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl.py") +endif() string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}") list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist) list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext) -- GitLab