diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 3110e579d30bb20693ec5ac4dc5e17501e2dbd58..1882bd73adaae53393640032307914d2e08c0404 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -20,7 +20,7 @@
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/profiler.h"
-#ifdef PADDLE_WITH_CUDA
+#ifdef PADDLE_WITH_NCCL
 DECLARE_bool(sync_nccl_allreduce);
 #endif
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 36e32a68633fabe9d09cd6b8b223aa1428c96383..0305081b814fa6dcd871f7ecd99200071cc82583 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -306,7 +306,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                                 const std::string &loss_var_name,
                                 const std::vector<Scope *> &local_scopes,
                                 const size_t &nranks,
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
                                 const bool use_cuda,
                                 platform::NCCLCommunicator *nccl_ctxs) const {
 #else
@@ -329,7 +329,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       pass->Erase(kNRanks);
       pass->Set<size_t>(kNRanks, new size_t(nranks));
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
       pass->Erase(kNCCLCtxs);
       pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
@@ -342,7 +342,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       pass->Erase(kLocalScopes);
       pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes, &local_scopes);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
       pass->Erase(kNCCLCtxs);
       pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
@@ -357,7 +357,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       LOG(INFO) << "set enable_sequential_execution:"
                 << enable_sequential_execution_;
     } else if (pass->Type() == "all_reduce_deps_pass") {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
      platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
      pass->Erase(kNCCLCtxs);
      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 0e59969989868b088e117b63aeb28f7ca7dd1754..738bbf51115fc126e072a3ca3519e234b22c6b4b 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -26,7 +26,7 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -163,7 +163,7 @@ struct BuildStrategy {
                    const std::string &loss_var_name,
                    const std::vector<Scope *> &local_scopes,
                    const size_t &nranks,
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
                    const bool use_cuda,
                    platform::NCCLCommunicator *nccl_ctxs) const;
 #else
diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc
index 664bd00fb68fc37c6d4e7624ed42a2a905f1bd25..d71251b76c75b08e35c6b2ba3af2f8ab2e53308c 100644
--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -36,7 +36,7 @@ struct TestReduceOpHandle {
   std::vector<p::Place> gpu_list_;
   std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
@@ -44,7 +44,7 @@ struct TestReduceOpHandle {
     for (size_t j = 0; j < ctxs_.size(); ++j) {
       ctxs_[j]->Wait();
     }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     if (nccl_ctxs_) {
       nccl_ctxs_->WaitAll();
     }
@@ -54,7 +54,7 @@ struct TestReduceOpHandle {
   void InitCtxOnGpu(bool use_gpu) {
     use_gpu_ = use_gpu;
     if (use_gpu) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       int count = p::GetCUDADeviceCount();
       if (count <= 1) {
         LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -78,7 +78,7 @@ struct TestReduceOpHandle {
         gpu_list_.push_back(p);
         ctxs_.emplace_back(new p::CPUDeviceContext(p));
       }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       nccl_ctxs_.reset(nullptr);
 #endif
     }
@@ -99,14 +99,14 @@ struct TestReduceOpHandle {
     nodes.emplace_back(new ir::Node("node"));
     if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
                                           gpu_list_, nccl_ctxs_.get()));
 #else
       PADDLE_THROW("CUDA is not support.");
 #endif
     } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
                                           gpu_list_, nccl_ctxs_.get()));
 #else
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 2589f6b9edd2f1195fea4656b5b6cae01d40c0ec..3dbdda14cb578bfead39f7e7b45e9eadf872021c 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -38,7 +38,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/timer.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -257,7 +257,7 @@ class DownpourWorker : public HogwildWorker {
   std::unordered_map<uint64_t, std::vector<uint64_t>> feasign_set_;
 };
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 using ScopeQueue = operators::reader::BlockingQueue<Scope *>;
 class SyncFunctor {
diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc
index 38c75b1df5a79bdd1a866480c3f12f953d26ad76..d5a25605cf81147b520bf541e38f4f75e53ae756 100644
--- a/paddle/fluid/framework/fleet/nccl_wrapper.cc
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc
@@ -24,7 +24,7 @@ std::shared_ptr<NCCLWrapper> NCCLWrapper::s_instance_ = NULL;
 bool NCCLWrapper::is_initialized_ = false;
 void NCCLWrapper::InitNCCL() {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
       &(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_,
       nccl_info_.my_global_rank_));
@@ -33,14 +33,14 @@ void NCCLWrapper::InitNCCL() {
 }
 void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   nccl_info_.nccl_id_ = nccl_info.nccl_id_;
 #endif
   return;
 }
 NCCLInfo NCCLWrapper::GetNCCLId() {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
 #endif
   return nccl_info_;
@@ -48,7 +48,7 @@ NCCLInfo NCCLWrapper::GetNCCLId() {
 void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
                               const int ranks) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   nccl_info_.local_rank_ = local_rank;
   nccl_info_.my_global_rank_ = global_rank;
   nccl_info_.global_ranks_ = ranks;
@@ -60,7 +60,7 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
 void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
                           const std::vector<std::string>& var_names) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   for (auto& name : var_names) {
     auto var = scope.FindVar(name);
     LoDTensor* tensor = var->GetMutable<LoDTensor>();
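For readers new to this wrapper, here is a minimal usage sketch of the NCCLWrapper API touched above. Only SetNCCLId, SetRankInfo, InitNCCL, and SyncVar appear in this patch; GetInstance() and the parameter name "fc_0.w_0" are assumptions for illustration, not part of the change.

// Hedged sketch: one trainer process wiring up its NCCL communicator.
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/framework/fleet/nccl_wrapper.h"

void InitTrainerComm(const paddle::framework::NCCLInfo& id_from_root,
                     int local_rank, int global_rank, int nranks,
                     const paddle::framework::Scope& scope) {
  auto nccl = paddle::framework::NCCLWrapper::GetInstance();  // assumed singleton accessor
  nccl->SetNCCLId(id_from_root);                        // unique id generated on rank 0
  nccl->SetRankInfo(local_rank, global_rank, nranks);   // local/global rank bookkeeping
  nccl->InitNCCL();                                     // ncclCommInitRank under the hood
  nccl->SyncVar(/*root_rank=*/0, scope, {"fc_0.w_0"});  // broadcast a parameter from rank 0
}
#endif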
diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.h b/paddle/fluid/framework/fleet/nccl_wrapper.h
index 84354308ea31a0ede9d16a95033346aefe587aa2..a55921f1ac2a1204965433cac12ba2ca2e19367e 100644
--- a/paddle/fluid/framework/fleet/nccl_wrapper.h
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.h
@@ -24,7 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable_helper.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
@@ -41,7 +41,7 @@ class NCCLInfo {
   int local_rank_;
   int global_ranks_;
   int my_global_rank_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   ncclUniqueId nccl_id_;
   ncclComm_t comm_;
   cudaStream_t stream_;
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 7958d3f99c8a51205b36ad9a36fe3868a7176c47..7c4fa248a4181605d0713c314d1b38938591b350 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -247,7 +247,7 @@ class ParallelExecutorPrivate {
   std::unordered_map<std::string, bool> is_persistable_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   platform::NCCLCommunicator *nccl_ctxs_{nullptr};
 #endif
   bool own_local_scope_;
@@ -427,6 +427,16 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   }
 #endif
+#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_NCCL)
+  PADDLE_ENFORCE_EQ(
+      places.size(), 1,
+      platform::errors::PermissionDenied(
+          "Your machine has multiple cards, "
+          "but the WITH_NCCL option is not turned on during compilation, "
+          "and you cannot use multi-card training or prediction. "
+          "Please recompile and turn on the WITH_NCCL option."));
+#endif
+
   LOG(INFO) << string::Sprintf(
       "The Program will be executed on %s using ParallelExecutor, %lu "
       "cards are used, so %lu programs are executed in parallel.",
@@ -516,7 +526,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
   // ncclOp
   std::vector<ir::Graph *> async_graphs(places.size());
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   if (member_->build_strategy_.async_mode_) {
     VLOG(3) << "use local async mode";
     graph = member_->build_strategy_.Apply(
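The check added above introduces the combination PADDLE_WITH_CUDA without PADDLE_WITH_NCCL (for example a Windows or WITH_NCCL=OFF build). Below is a small self-contained sketch of how the two macros are meant to layer after this change, mirroring the nesting used in var_type_traits.h further down; in a real build both macros come from CMake, so the -D flags here are only for demonstration.

// guard_demo.cc -- illustration only.
//   g++ guard_demo.cc                                       -> CPU-only view
//   g++ -DPADDLE_WITH_CUDA guard_demo.cc                    -> CUDA without NCCL
//   g++ -DPADDLE_WITH_CUDA -DPADDLE_WITH_NCCL guard_demo.cc -> full multi-card view
#include <cstdio>

int main() {
#ifdef PADDLE_WITH_CUDA
  std::printf("PADDLE_WITH_CUDA: generic GPU code paths are compiled in\n");
#if defined(PADDLE_WITH_NCCL)
  std::printf("PADDLE_WITH_NCCL: multi-card (NCCL) code paths are compiled in\n");
#else
  std::printf("No NCCL: ParallelExecutor accepts only a single place\n");
#endif
#else
  std::printf("CPU-only build\n");
#endif
  return 0;
}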
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
index 285125a89a2acd5056c0630de0f3a319fd916995..cbeb812ecdac8efb0c0540d8d83eeb4699c00baa 100644
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -110,7 +110,7 @@ class DistMultiTrainer : public MultiTrainer {
   int dump_file_num_;
 };
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 class PipelineTrainer : public TrainerBase {
  public:
   PipelineTrainer() {}
diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc
index 7cc2b3b42258942e6016486f7cf7ecfcae92b91c..5c90b07149ec5575f9907e41cc65a826421cf3ec 100644
--- a/paddle/fluid/framework/var_type_traits.cc
+++ b/paddle/fluid/framework/var_type_traits.cc
@@ -21,7 +21,7 @@
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 #include "paddle/fluid/platform/macros.h"
 #ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h
index 4ab01b9068e546022ec53325686b2349cd45f482..70f31d8bdb1e65394219d7d715767a2d37e5ffc5 100644
--- a/paddle/fluid/framework/var_type_traits.h
+++ b/paddle/fluid/framework/var_type_traits.h
@@ -24,7 +24,7 @@
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
 #include <cudnn.h>
-#ifndef _WIN32
+#if defined(PADDLE_WITH_NCCL)
 #include <nccl.h>
 #endif
 #endif
@@ -34,7 +34,7 @@ namespace paddle {
 namespace platform {
 #ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
+#if defined(PADDLE_WITH_NCCL)
 class Communicator;
 class NCCLCommunicator;
 #endif
@@ -140,7 +140,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
     LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
     operators::reader::LoDTensorBlockingQueueHolder,
 #ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
+#if defined(PADDLE_WITH_NCCL)
     ncclUniqueId, platform::Communicator, platform::NCCLCommunicator,
 #endif
     operators::CudnnRNNCache,
diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc
index 67dbfd740ed9b71fa06b684c14720ae2814fe11c..2d7172e801090ba20006a8e9fd90e9d3ccbc2971 100644
--- a/paddle/fluid/framework/var_type_traits_test.cc
+++ b/paddle/fluid/framework/var_type_traits_test.cc
@@ -24,7 +24,7 @@
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 #ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc
index 74a74ebe921378e2994a6a4cb2087d0acde950b1..8ce257a6c37d7d4b0fa5bc3610ee7283fdcf0659 100644
--- a/paddle/fluid/imperative/tests/nccl_context_test.cc
+++ b/paddle/fluid/imperative/tests/nccl_context_test.cc
@@ -29,7 +29,7 @@ imperative::ParallelStrategy GetStrategy(int local_rank) {
   return strategy;
 }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 void BcastNCCLId(int local_rank, ncclUniqueId *nccl_id) {
   auto strategy = GetStrategy(local_rank);
   platform::CUDAPlace gpu(local_rank);
diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc
index 14e2741e52e9cc11fd3de830d9224d8201898c77..b20e011f5cbf20575a80078fdddb97aeb292872b 100644
--- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -28,7 +28,7 @@ template <typename T>
 class CAllGatherOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto in = ctx.Input<framework::Tensor>("X");
     auto out = ctx.Output<framework::Tensor>("Out");
     ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index c661d4215988df57b801e6d0ff860b33f7646933..096a2f6a0959768bcb99d87b0d42edf71d98f481 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -58,7 +58,7 @@ template <ReduceType red_type, typename T>
 class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto in = ctx.Input<framework::Tensor>("X");
     auto out = ctx.Output<framework::Tensor>("Out");
diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
index a4433d0b3d1214808e42d6bb697ab6ff4b6ca149..f95d467345bda95968fd1e4f317547215c375b39 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -26,7 +26,7 @@ template <typename T>
 class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto x = ctx.Input<framework::LoDTensor>("X");
     auto out = ctx.Output<framework::LoDTensor>("Out");
     int numel = x->numel();
diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc
index 758affbd438af0261727162685def40fa277bad4..f72d2ac4d33fe8ee1de8905e2d154363a5d09875 100644
--- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_all_op.cc
@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include <nccl.h>
 #endif
 #include
@@ -25,7 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -52,7 +52,7 @@ class CCommInitAllOp : public framework::OperatorBase {
     PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
                       "CCommInitAllOp can run on gpu place only.");
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     std::vector<int> devices = Attr<std::vector<int>>("devices");
     if (devices.empty()) {
       devices = platform::GetSelectedDevices();
diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc
index 16ca6e5238e43c34cb45b6be4f8e310537dd4a88..2bfc6e55301526f2ff4fec373e330d5e5cf63ea9 100644
--- a/paddle/fluid/operators/collective/c_comm_init_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_op.cc
@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include <nccl.h>
 #endif
 #include
@@ -24,7 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -46,7 +46,7 @@ class CCommInitOp : public framework::OperatorBase {
     auto var = scope.FindVar(Input("X"));
     PADDLE_ENFORCE_NOT_NULL(var);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     ncclUniqueId* nccl_id = var->GetMutable<ncclUniqueId>();
     int nranks = Attr<int>("nranks");