add WITH_NCCL option for cmake. (#22384)

cmake选项中添加了WITH_NCCL，显示指定是否编译NCCL的部分代码，WITH_NCCL默认打开，但如果WITH_GPU为OFF，则关闭WITH_NCCL 添加了PADDLE_WITH_NCCL定义单机单卡能够关闭NCCL编译，多卡的话需要默认打开NCCL，如果关闭NCCL，则只能使用单卡 Co-authored-by: N 石晓伟 <39303645+Shixiaowei02@users.noreply.github.com>

add WITH_NCCL option for cmake. (#22384)
cmake选项中添加了WITH_NCCL，显示指定是否编译NCCL的部分代码，WITH_NCCL默认打开，但如果WITH_GPU为OFF，则关闭WITH_NCCL 添加了PADDLE_WITH_NCCL定义单机单卡能够关闭NCCL编译，多卡的话需要默认打开NCCL，如果关闭NCCL，则只能使用单卡 Co-authored-by: N 石晓伟 <39303645+Shixiaowei02@users.noreply.github.com>
7bc4b095 · Wilber · GitHub · c8b90d8f · 7bc4b095 · 7bc4b095
40 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,6 +89,7 @@ option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VER
 option(WITH_DGC   "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE})
 option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
 option(WITH_LITE   "Compile Paddle Fluid with Lite Engine" OFF)
+option(WITH_NCCL   "Compile PaddlePaddle with NCCL support"             ON)

 # PY_VERSION
 if(NOT PY_VERSION)
@@ -121,6 +122,27 @@ if(WIN32)
        set(WITH_DISTRIBUTE OFF CACHE STRING
            "Disable DISTRIBUTE when compiling for Windows" FORCE)
    endif()
+    if(WITH_NCCL)
+        MESSAGE(WARNING
+            "Disable NCCL when compiling for Windows. Force WITH_NCCL=OFF.")
+        set(WITH_NCCL OFF CACHE STRING
+            "Disable NCCL when compiling for Windows" FORCE)
+    endif()
+endif()
+
+if (NOT WITH_GPU AND WITH_NCCL)
+    MESSAGE(WARNING
+        "Disable NCCL when compiling without GPU. Force WITH_NCCL=OFF.")
+    set(WITH_NCCL OFF CACHE STRING
+        "Disable NCCL when compiling without GPU" FORCE)
+endif()
+
+if(WITH_NCCL)
+     add_definitions("-DPADDLE_WITH_NCCL")
+else()
+     if(WITH_GPU)
+         MESSAGE(WARNING "If the environment is multi-card, the WITH_NCCL option needs to be turned on, otherwise only a single card can be used.")
+     endif()
 endif()

 if(WITH_BRPC_RDMA)

--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -28,7 +28,7 @@ namespace paddle {
 namespace framework {
 namespace details {

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                     const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places,
@@ -121,7 +121,7 @@ void AllReduceOpHandle::AllReduceFunc(
    const std::vector<platform::Place> &places,
    const std::vector<std::string> &out_var_names) {
  if (is_gpu_place(places[0])) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
    PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
    ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype);
    std::vector<std::function<void()>> all_reduce_calls;
@@ -161,7 +161,7 @@ void AllReduceOpHandle::AllReduceFunc(
  VLOG(10) << Name() << " size:" << numel * SizeOfType(dtype);
 }

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 void AllReduceOpHandle::NCCLAllReduceFunc(
    const std::vector<std::function<void()>> &all_reduce_calls) {
  this->RunAndRecordEvent([&] {

--- a/paddle/fluid/framework/details/all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -20,7 +20,7 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -29,7 +29,7 @@ namespace paddle {
 namespace framework {
 namespace details {

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 class AllReduceOpHandle : public NCCLOpHandleBase {
 public:
  AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
@@ -54,13 +54,13 @@ class AllReduceOpHandle : public OpHandleBase {

  std::vector<Scope *> local_scopes_;

-#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
+#ifndef PADDLE_WITH_NCCL
  // NCCLOpHandleBase already have these attributes.
  // Will polish it by class inheritance framework.
  std::vector<platform::Place> places_;
 #endif

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  void NCCLAllReduceFunc(
      const std::vector<std::function<void()>> &all_reduce_calls);


--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -73,7 +73,7 @@ void BroadcastOpHandle::BroadcastOneVar(
      });
    }
  } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
    VarHandle *out_handle = nullptr;
    int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
    std::vector<std::function<void()>> broadcast_calls;

--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -24,7 +24,7 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

@@ -34,7 +34,7 @@ namespace details {

 struct BroadcastOpHandle : public OpHandleBase {
 public:
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places,
                    const platform::NCCLContextMap *nccl_ctxs)
@@ -70,7 +70,7 @@ struct BroadcastOpHandle : public OpHandleBase {

  std::vector<Scope *> local_scopes_;
  std::vector<platform::Place> places_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  const platform::NCCLContextMap *nccl_ctxs_;
 #endif


--- a/paddle/fluid/framework/details/broadcast_op_handle_test.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -44,7 +44,7 @@ struct TestBroadcastOpHandle {
  std::vector<std::unique_ptr<ir::Node>> nodes_;
  std::vector<p::Place> place_list_;
  bool use_gpu_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif

@@ -52,7 +52,7 @@ struct TestBroadcastOpHandle {
    for (size_t j = 0; j < ctxs_.size(); ++j) {
      ctxs_[j]->Wait();
    }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
    if (nccl_ctxs_) {
      nccl_ctxs_->WaitAll();
    }
@@ -62,7 +62,7 @@ struct TestBroadcastOpHandle {
  void InitCtxOnGpu(bool use_gpu) {
    use_gpu_ = use_gpu;
    if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
      int count = p::GetCUDADeviceCount();
      if (count <= 1) {
        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -86,7 +86,7 @@ struct TestBroadcastOpHandle {
        place_list_.push_back(p);
        ctxs_.emplace_back(new p::CPUDeviceContext(p));
      }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
      nccl_ctxs_.reset(nullptr);
 #endif
    }
@@ -107,14 +107,14 @@ struct TestBroadcastOpHandle {
    nodes_.emplace_back(
        ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
    if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
      op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                         place_list_, nccl_ctxs_.get());
 #else
      PADDLE_THROW("CUDA is not support.");
 #endif
    } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
      op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                         place_list_, nccl_ctxs_.get());
 #else

--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -28,7 +28,7 @@ namespace details {
 typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
    GradientAndLoDTensor;

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 FusedAllReduceOpHandle::FusedAllReduceOpHandle(
    ir::Node *node, const std::vector<Scope *> &local_scopes,
    const std::vector<platform::Place> &places, const size_t num_of_all_reduce,

--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
@@ -21,7 +21,7 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -30,7 +30,7 @@ namespace paddle {
 namespace framework {
 namespace details {

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 struct FusedAllReduceOpHandle : public AllReduceOpHandle {
  FusedAllReduceOpHandle(ir::Node *node,
                         const std::vector<Scope *> &local_scopes,

--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
@@ -25,7 +25,7 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

@@ -35,7 +35,7 @@ namespace details {

 struct FusedBroadcastOpHandle : public BroadcastOpHandle {
 public:
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  FusedBroadcastOpHandle(ir::Node *node,
                         const std::vector<Scope *> local_scopes,
                         const std::vector<platform::Place> &places,

--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -45,14 +45,14 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
    nodes_.emplace_back(
        ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
    if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
      op_handle_ = new FusedBroadcastOpHandle(
          nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
      PADDLE_THROW("CUDA is not supported.");
 #endif
    } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
      op_handle_ = new FusedBroadcastOpHandle(
          nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else

--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -264,7 +264,7 @@ void ReduceOpHandle::RunImpl() {
        }
      });
    } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
      auto pre_in = pre_in_var->Get<framework::LoDTensor>();
      VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
      VariableVisitor::GetMutableTensor(out_var).mutable_data(

--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -24,7 +24,7 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

@@ -62,7 +62,7 @@ struct ReduceOpHandle : public OpHandleBase {
  std::vector<Scope *> local_scopes_;
  std::vector<platform::Place> places_;

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  const platform::NCCLContextMap *nccl_ctxs_;
  ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                 const std::vector<platform::Place> &places,

--- a/paddle/fluid/framework/device_worker_factory.cc
+++ b/paddle/fluid/framework/device_worker_factory.cc
@@ -61,7 +61,7 @@ std::shared_ptr<DeviceWorker> DeviceWorkerFactory::CreateDeviceWorker(

 REGISTER_DEVICE_WORKER_CLASS(HogwildWorker);
 REGISTER_DEVICE_WORKER_CLASS(DownpourWorker);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
 }  // namespace framework

--- a/paddle/fluid/framework/fleet/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/CMakeLists.txt
@@ -4,7 +4,9 @@ else()
    cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope)
 endif(WITH_PSLIB)

-cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
+if(WITH_NCCL)
+    cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
+endif()
 if(WITH_BOX_PS)
    cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor box_ps)
 else()

--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
@@ -39,7 +39,7 @@ class AllReduceDepsPass : public ir::Pass {
    std::vector<details::OpHandleBase*> all_reduce_op_handles =
        GetSortedAllReduceOps(*graph);

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
    auto use_hierarchical_allreduce =
        Get<bool>(details::kUseHierarchicalAllReduce);
    for (size_t i = 0; i < all_reduce_op_handles.size(); ++i) {

--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
@@ -38,7 +38,7 @@ class FuseAllReduceOpPass : public ir::Pass {
    auto &places = Get<const std::vector<platform::Place>>(details::kPlaces);
    auto &local_scopes = Get<const std::vector<Scope *>>(details::kLocalScopes);

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
    auto *multi_nccl_ctxs =
        &Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
 #endif
@@ -85,7 +85,7 @@ class FuseAllReduceOpPass : public ir::Pass {
      for (auto &p_g : group_p_g) {
        group_all_reduce_ops.emplace_back(all_reduce_ops.at(p_g.second));
      }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
      InsertFusedAllReduce(places, local_scopes, group_size,
                           group_all_reduce_ops, multi_nccl_ctxs, &result);
 #else
@@ -134,7 +134,7 @@ class FuseAllReduceOpPass : public ir::Pass {
                            const std::vector<Scope *> &local_scopes,
                            const size_t num_of_all_reduce,
                            const std::vector<ir::Node *> &all_reduce_ops,
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
                            const platform::NCCLCommunicator *multi_nccl_ctxs,
 #endif
                            ir::Graph *result) const {
@@ -161,7 +161,7 @@ class FuseAllReduceOpPass : public ir::Pass {
      result->RemoveNode(op_handle.Node());
    }

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
    CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
                           local_scopes, multi_nccl_ctxs, result);
 #else
@@ -177,11 +177,11 @@ class FuseAllReduceOpPass : public ir::Pass {
      const size_t num_of_all_reduce,
      const std::vector<platform::Place> &places,
      const std::vector<Scope *> &local_scopes,
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
      const platform::NCCLCommunicator *multi_nccl_ctxs,
 #endif
      ir::Graph *result) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
    auto *op_handle = new details::FusedAllReduceOpHandle(
        result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
        local_scopes, places, num_of_all_reduce, multi_nccl_ctxs);
@@ -199,7 +199,7 @@ class FuseAllReduceOpPass : public ir::Pass {
      op_handle->AddOutput(out);
    }

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
    if (!multi_nccl_ctxs) {
      SetCommunicationContext(places, op_handle);
    }

--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
@@ -156,7 +156,7 @@ void MultiDevSSAGraphBuilderBase::Init() const {
  places_ = Get<const std::vector<platform::Place>>(details::kPlaces);
  local_scopes_ = Get<const std::vector<Scope *>>(details::kLocalScopes);
  strategy_ = Get<const details::BuildStrategy>(kStrategy);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  multi_nccl_ctxs_ = &Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
  nccl_ctxs_ = nullptr;
  if (multi_nccl_ctxs_) {
@@ -298,7 +298,7 @@ std::vector<ir::Node *> MultiDevSSAGraphBuilderBase::SortOperations(

 bool MultiDevSSAGraphBuilderBase::UseGPU() const {
  bool use_gpu = false;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  use_gpu = nccl_ctxs_ != nullptr;
 #endif
  return use_gpu;
@@ -348,7 +348,7 @@ void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result,

 void MultiDevSSAGraphBuilderBase::SetCommunicationContext(
    details::OpHandleBase *op_handle, const platform::Place &p) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  if (nccl_ctxs_ == nullptr) {
    op_handle->SetDeviceContext(p,
                                platform::DeviceContextPool::Instance().Get(p));
@@ -362,7 +362,7 @@ void MultiDevSSAGraphBuilderBase::SetCommunicationContext(
 void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result,
                                                    const std::string &p_name,
                                                    size_t src_dev_id) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  auto *op_handle = new details::BroadcastOpHandle(
      result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
      local_scopes_, places_, nccl_ctxs_);
@@ -395,7 +395,7 @@ void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result,
 void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp(
    ir::Graph *result,
    const std::vector<std::unordered_set<std::string>> &bcast_varnames) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  auto *op_handle = new details::FusedBroadcastOpHandle(
      result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
      local_scopes_, places_, nccl_ctxs_);
@@ -451,7 +451,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
  auto append_allreduce_op = [&](
      const std::vector<Scope *> &scopes,
      const std::vector<platform::Place> &places) -> details::OpHandleBase * {
-#if defined(PADDLE_WITH_DGC)
+#if defined(PADDLE_WITH_DGC) && defined(PADDLE_WITH_NCCL)
    if (is_encoded) {
      result->Get<GraphOps>(kGraphOps).emplace_back(
          new details::SparseAllReduceOpHandle(
@@ -464,7 +464,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
              result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
              scopes, places, multi_nccl_ctxs_));
    }
-#elif defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#elif defined(PADDLE_WITH_NCCL)
    result->Get<GraphOps>(kGraphOps).emplace_back(
        new details::AllReduceOpHandle(
            result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
@@ -539,7 +539,7 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOps(

 details::VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(
    ir::Graph *result, const std::string &og, size_t dst_dev_id) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  result->Get<GraphOps>(kGraphOps).emplace_back(new details::ReduceOpHandle(
      result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
      local_scopes_, places_, nccl_ctxs_));

--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
@@ -94,7 +94,7 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
  void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
                         size_t device_id) const;

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  mutable platform::NCCLContextMap *nccl_ctxs_{nullptr};
  mutable platform::NCCLCommunicator *multi_nccl_ctxs_{nullptr};
 #endif

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -109,7 +109,7 @@ class ParallelExecutorPrivate {
    }
  }

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) {
    VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_
            << ", num_trainers:" << bst.num_trainers_
@@ -473,7 +473,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
  }

  if (member_->use_cuda_ && member_->nranks_ > 1) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
    member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_);

    // Initialize device context's nccl comm, will be used by normal
@@ -652,7 +652,7 @@ void ParallelExecutor::BCastParamsToDevices(
    }
    auto &dims = main_tensor.dims();
    if (paddle::platform::is_gpu_place(main_tensor.place())) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
      std::vector<void *> buffers;
      buffers.reserve(member_->places_.size());
      size_t numel = main_tensor.numel();

--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -32,7 +32,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif


--- a/paddle/fluid/framework/pipeline_trainer.cc
+++ b/paddle/fluid/framework/pipeline_trainer.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/trainer.h"

--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "google/protobuf/io/zero_copy_stream_impl.h"
 #include "google/protobuf/message.h"
 #include "google/protobuf/text_format.h"

--- a/paddle/fluid/framework/trainer_factory.cc
+++ b/paddle/fluid/framework/trainer_factory.cc
@@ -63,7 +63,7 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(

 REGISTER_TRAINER_CLASS(MultiTrainer);
 REGISTER_TRAINER_CLASS(DistMultiTrainer);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 REGISTER_TRAINER_CLASS(PipelineTrainer);
 #endif
 }  // namespace framework

--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -9,7 +9,9 @@ cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer)
 cc_library(engine SRCS engine.cc DEPS layer gradient_accumulator)
 cc_library(imperative_profiler SRCS profiler.cc)
 if(NOT WIN32)
+    if(WITH_NCCL)
        cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
+    endif()
    cc_library(data_loader SRCS data_loader.cc DEPS enforce)
 endif(NOT WIN32)


--- a/paddle/fluid/imperative/nccl_context.cc
+++ b/paddle/fluid/imperative/nccl_context.cc
@@ -16,7 +16,7 @@

 namespace paddle {
 namespace imperative {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 void NCCLParallelContext::RecvNCCLID(const std::string &ep,
                                     ncclUniqueId *nccl_id) {
  auto addr = paddle::string::Split(ep, ':');

--- a/paddle/fluid/imperative/nccl_context.h
+++ b/paddle/fluid/imperative/nccl_context.h
@@ -26,7 +26,7 @@

 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/device_context.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif
 #include "paddle/fluid/platform/place.h"
@@ -57,7 +57,7 @@ class ParallelContext {
  platform::Place place_;
 };

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 class NCCLParallelContext : ParallelContext {
 public:
  explicit NCCLParallelContext(const ParallelStrategy& strategy,

--- a/paddle/fluid/imperative/tests/CMakeLists.txt
+++ b/paddle/fluid/imperative/tests/CMakeLists.txt
 if(WIN32)
    cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS device_context)
 else()
+    if (WITH_NCCL)
        cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
+    endif()
 endif(WIN32)



--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -69,9 +69,11 @@ if (WITH_GPU)
        op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
    endif()
    if (NOT WIN32)
+        if (WITH_NCCL)
            op_library(sync_batch_norm_op)
            file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
        endif()
+    endif()
 else()
    op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()

--- a/paddle/fluid/operators/collective/CMakeLists.txt
+++ b/paddle/fluid/operators/collective/CMakeLists.txt
@@ -30,7 +30,7 @@ endforeach()

 register_operators(EXCLUDES c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})

-if(WITH_GPU AND NOT WIN32)
+if(WITH_NCCL)
    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
    op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} nccl_common)
 endif()

--- a/paddle/fluid/operators/data_norm_op.cu
+++ b/paddle/fluid/operators/data_norm_op.cu
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/operators/data_norm_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -178,7 +178,7 @@ class DataNormGradKernel<platform::CUDADeviceContext, T>
        d_batch_sum, d_batch_square_sum);

    if (need_sync_stats) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
      auto comm = platform::NCCLCommContext::Instance().Get(0, ctx.GetPlace());
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
          reinterpret_cast<const void *>(d_batch_size),

--- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt
@@ -30,7 +30,7 @@ endforeach()

 register_operators(EXCLUDES gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS})

-if(WITH_GPU AND NOT WIN32)
+if(WITH_NCCL)
    set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} nccl_common)
    op_library(gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS} nccl_common)
 endif()

--- a/paddle/fluid/operators/nccl/CMakeLists.txt
+++ b/paddle/fluid/operators/nccl/CMakeLists.txt
+if (NOT WITH_NCCL)
+  return()
+endif()
+
 if(WITH_GPU AND NOT WIN32)
  nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
 endif()

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -305,7 +305,7 @@ CUDADeviceContext::~CUDADeviceContext() {
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_),
                                "Failed to destory Cudnn handle");
  }
-#if !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  if (nccl_comm_) {
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
  }

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/cuda_helper.h"
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
-#if !defined(__APPLE__) && !defined(_WIN32)
+#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif
 #include "paddle/fluid/platform/gpu_info.h"
@@ -144,7 +144,7 @@ class CUDADeviceContext : public DeviceContext {
  /*! \brief  Return cuda stream in the device context. */
  cudaStream_t stream() const;

-#if !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  /*! \brief  Return nccl communicators. */
  ncclComm_t nccl_comm() const { return nccl_comm_; }

@@ -180,7 +180,7 @@ class CUDADeviceContext : public DeviceContext {
  std::unique_ptr<CublasHandleHolder> cublas_handle_;
  std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;

-#if !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  // NCCL communicator (single process version) for NCCL collective operations.
  // NCCL collective operations provides fast collectives over multiple GPUs
  // both within and across nodes.

--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -5,7 +5,10 @@ list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc)
 # There is no macOS version of NCCL.
 # Disable nvrtc and cuda_driver api on MacOS and Windows, and only do a early test on Linux.
 if (NOT APPLE AND NOT WIN32)
-  list(APPEND CUDA_SRCS nccl.cc nvrtc.cc cuda_driver.cc)
+  list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
+  if (WITH_NCCL)
+    list(APPEND CUDA_SRCS nccl.cc)
+  endif()
 endif()

 if (TENSORRT_FOUND)

--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
@@ -292,7 +292,7 @@ TEST(enforce, cuda_success) {
  EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS));
  EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED));
  EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE));
-#if !defined(__APPLE__) && !defined(_WIN32)
+#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
  EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
  EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError));
  EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError));

--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
-set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper nccl_wrapper prune
+set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune
  feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool
  analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
  gloo_wrapper)

+if (WITH_NCCL)
+  set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
+endif()
+
 if(NOT WIN32)
-  set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context data_loader)
+  set(PYBIND_DEPS ${PYBIND_DEPS} data_loader)
+  if (WITH_NCCL)
+    set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context)
+  endif()
 endif(NOT WIN32)

 if(WITH_PYTHON)
@@ -25,7 +32,6 @@ set(PYBIND_SRCS
  fleet_wrapper_py.cc
  gloo_wrapper_py.cc
  box_helper_py.cc
-  nccl_wrapper_py.cc
  data_set_py.cc
  imperative.cc
  ir.cc
@@ -35,15 +41,19 @@ if (WITH_DISTRIBUTE)
  list(APPEND PYBIND_SRCS communicator_py.cc)
 endif()

+if (WITH_NCCL)
+  list(APPEND PYBIND_SRCS nccl_wrapper_py.cc)
+endif()
+
 if(WITH_PYTHON)
  # generate op pybind functions automatically for dygraph.
  set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag)
  list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB})
  list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS})

-  if(NOT WIN32)
+  if(WITH_NCCL)
    list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context)
-  endif(NOT WIN32)
+  endif(WITH_NCCL)

  add_executable(op_function_generator op_function_generator.cc)
  target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS})

--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -611,7 +611,7 @@ void BindImperative(py::module *m_ptr) {
                    },
                    [](imperative::ParallelStrategy &self,
                       const std::string &ep) { self.current_endpoint_ = ep; });
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
  py::class_<imperative::NCCLParallelContext> nccl_ctx(m,
                                                       "NCCLParallelContext");


--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -2209,7 +2209,7 @@ All parameter, weight, gradient are variables in Paddle.
  BindFleetWrapper(&m);
  BindGlooWrapper(&m);
  BindBoxHelper(&m);
-#ifndef _WIN32
+#ifdef PADDLE_WITH_NCCL
  BindNCCLWrapper(&m);
 #endif
  BindGraph(&m);

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -4,6 +4,9 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FL
 set(dist_ENVS http_proxy="" https_proxy="")

 file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py")
+if(NOT WITH_NCCL)
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl")
+endif()
 string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}")
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)