From a90fa540926d832056aa1a0581372ad677dd960f Mon Sep 17 00:00:00 2001
From: Wilber
Date: Tue, 11 Feb 2020 11:13:53 +0800
Subject: [PATCH] Compile without nccl deps. [1/2] (#22509)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Support compiling without the nccl dependency. [1/2]

On a multi-card machine, if the build was not configured with the WITH_NCCL
option, the cards cannot communicate with each other, so only a single card
can be used.

Co-authored-by: 石晓伟 <39303645+Shixiaowei02@users.noreply.github.com>
---
 .../framework/details/all_reduce_op_handle.cc      |  2 +-
 paddle/fluid/framework/details/build_strategy.cc   |  8 ++++----
 paddle/fluid/framework/details/build_strategy.h    |  4 ++--
 .../framework/details/reduce_op_handle_test.cc     | 12 ++++++------
 paddle/fluid/framework/device_worker.h             |  4 ++--
 paddle/fluid/framework/fleet/nccl_wrapper.cc       | 10 +++++-----
 paddle/fluid/framework/fleet/nccl_wrapper.h        |  4 ++--
 paddle/fluid/framework/parallel_executor.cc        | 14 ++++++++++++--
 paddle/fluid/framework/trainer.h                   |  2 +-
 paddle/fluid/framework/var_type_traits.cc          |  2 +-
 paddle/fluid/framework/var_type_traits.h           |  6 +++---
 paddle/fluid/framework/var_type_traits_test.cc     |  2 +-
 paddle/fluid/imperative/tests/nccl_context_test.cc |  2 +-
 .../operators/collective/c_allgather_op.cu.cc      |  4 ++--
 paddle/fluid/operators/collective/c_allreduce_op.h |  4 ++--
 .../operators/collective/c_broadcast_op.cu.cc      |  4 ++--
 .../operators/collective/c_comm_init_all_op.cc     |  6 +++---
 .../fluid/operators/collective/c_comm_init_op.cc   |  6 +++---
 18 files changed, 53 insertions(+), 43 deletions(-)

diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 3110e579d30..1882bd73ada 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -20,7 +20,7 @@
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/profiler.h"
-#ifdef PADDLE_WITH_CUDA
+#ifdef PADDLE_WITH_NCCL
 DECLARE_bool(sync_nccl_allreduce);
 #endif
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 36e32a68633..0305081b814 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -306,7 +306,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                                 const std::string &loss_var_name,
                                 const std::vector &local_scopes,
                                 const size_t &nranks,
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
                                 const bool use_cuda,
                                 platform::NCCLCommunicator *nccl_ctxs) const {
 #else
@@ -329,7 +329,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       pass->Erase(kNRanks);
       pass->Set(kNRanks, new size_t(nranks));
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
       pass->Erase(kNCCLCtxs);
       pass->SetNotOwned(kNCCLCtxs, nctx);
@@ -342,7 +342,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       pass->Erase(kLocalScopes);
       pass->SetNotOwned>(kLocalScopes, &local_scopes);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
       pass->Erase(kNCCLCtxs);
       pass->SetNotOwned(kNCCLCtxs, nctx);
@@ -357,7 +357,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       LOG(INFO) << "set enable_sequential_execution:"
                 << enable_sequential_execution_;
     } else if (pass->Type() == "all_reduce_deps_pass") {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
      platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
      pass->Erase(kNCCLCtxs);
      pass->SetNotOwned(kNCCLCtxs, nctx);
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 0e599699898..738bbf51115 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -26,7 +26,7 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -163,7 +163,7 @@ struct BuildStrategy {
                   const std::string &loss_var_name,
                   const std::vector &local_scopes,
                   const size_t &nranks,
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
                   const bool use_cuda,
                   platform::NCCLCommunicator *nccl_ctxs) const;
 #else
diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc
index 664bd00fb68..d71251b76c7 100644
--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -36,7 +36,7 @@ struct TestReduceOpHandle {
   std::vector gpu_list_;
   std::vector> ctxs_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   std::unique_ptr nccl_ctxs_;
 #endif
@@ -44,7 +44,7 @@ struct TestReduceOpHandle {
     for (size_t j = 0; j < ctxs_.size(); ++j) {
       ctxs_[j]->Wait();
     }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     if (nccl_ctxs_) {
       nccl_ctxs_->WaitAll();
     }
@@ -54,7 +54,7 @@ struct TestReduceOpHandle {
   void InitCtxOnGpu(bool use_gpu) {
     use_gpu_ = use_gpu;
     if (use_gpu) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       int count = p::GetCUDADeviceCount();
       if (count <= 1) {
         LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -78,7 +78,7 @@ struct TestReduceOpHandle {
       gpu_list_.push_back(p);
       ctxs_.emplace_back(new p::CPUDeviceContext(p));
     }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     nccl_ctxs_.reset(nullptr);
 #endif
   }
@@ -99,14 +99,14 @@ struct TestReduceOpHandle {
     nodes.emplace_back(new ir::Node("node"));
     if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_, nccl_ctxs_.get()));
 #else
       PADDLE_THROW("CUDA is not support.");
 #endif
     } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_, nccl_ctxs_.get()));
 #else
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 2589f6b9edd..3dbdda14cb5 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -38,7 +38,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/timer.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -257,7 +257,7 @@ class DownpourWorker : public HogwildWorker {
   std::unordered_map> feasign_set_;
 };
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 using ScopeQueue = operators::reader::BlockingQueue;
 class SyncFunctor {
diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc
index 38c75b1df5a..d5a25605cf8 100644
--- a/paddle/fluid/framework/fleet/nccl_wrapper.cc
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc
@@ -24,7 +24,7 @@ std::shared_ptr NCCLWrapper::s_instance_ = NULL;
 bool NCCLWrapper::is_initialized_ = false;
 void NCCLWrapper::InitNCCL() {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
       &(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_,
       nccl_info_.my_global_rank_));
@@ -33,14 +33,14 @@ void NCCLWrapper::InitNCCL() {
 }
 void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   nccl_info_.nccl_id_ = nccl_info.nccl_id_;
 #endif
   return;
 }
 NCCLInfo NCCLWrapper::GetNCCLId() {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
 #endif
   return nccl_info_;
@@ -48,7 +48,7 @@ NCCLInfo NCCLWrapper::GetNCCLId() {
 void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
                               const int ranks) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   nccl_info_.local_rank_ = local_rank;
   nccl_info_.my_global_rank_ = global_rank;
   nccl_info_.global_ranks_ = ranks;
@@ -60,7 +60,7 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
 void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
                           const std::vector& var_names) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   for (auto& name : var_names) {
     auto var = scope.FindVar(name);
     LoDTensor* tensor = var->GetMutable();
diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.h b/paddle/fluid/framework/fleet/nccl_wrapper.h
index 84354308ea3..a55921f1ac2 100644
--- a/paddle/fluid/framework/fleet/nccl_wrapper.h
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.h
@@ -24,7 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable_helper.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
@@ -41,7 +41,7 @@ class NCCLInfo {
   int local_rank_;
   int global_ranks_;
   int my_global_rank_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   ncclUniqueId nccl_id_;
   ncclComm_t comm_;
   cudaStream_t stream_;
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 7958d3f99c8..7c4fa248a41 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -247,7 +247,7 @@ class ParallelExecutorPrivate {
   std::unordered_map is_persistable_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   platform::NCCLCommunicator *nccl_ctxs_{nullptr};
 #endif
   bool own_local_scope_;
@@ -427,6 +427,16 @@ ParallelExecutor::ParallelExecutor(const std::vector &places,
   }
 #endif
+#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_NCCL)
+  PADDLE_ENFORCE_EQ(
+      places.size(), 1,
+      platform::errors::PermissionDenied(
+          "Your machine has multiple cards, "
+          "but the WITH_NCCL option is not turned on during compilation, "
+          "and you cannot use multi-card training or prediction. "
+          "Please recompile and turn on the WITH_NCCL option."));
+#endif
+
   LOG(INFO) << string::Sprintf(
       "The Program will be executed on %s using ParallelExecutor, %lu "
       "cards are used, so %lu programs are executed in parallel.",
@@ -516,7 +526,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places,
   // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
   // ncclOp
   std::vector async_graphs(places.size());
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   if (member_->build_strategy_.async_mode_) {
     VLOG(3) << "use local async mode";
     graph = member_->build_strategy_.Apply(
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
index 285125a89a2..cbeb812ecda 100644
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -110,7 +110,7 @@ class DistMultiTrainer : public MultiTrainer {
   int dump_file_num_;
 };
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 class PipelineTrainer : public TrainerBase {
  public:
   PipelineTrainer() {}
diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc
index 7cc2b3b4225..5c90b07149e 100644
--- a/paddle/fluid/framework/var_type_traits.cc
+++ b/paddle/fluid/framework/var_type_traits.cc
@@ -21,7 +21,7 @@
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 #include "paddle/fluid/platform/macros.h"
 #ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h
index 4ab01b9068e..70f31d8bdb1 100644
--- a/paddle/fluid/framework/var_type_traits.h
+++ b/paddle/fluid/framework/var_type_traits.h
@@ -24,7 +24,7 @@
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
 #include
-#ifndef _WIN32
+#if defined(PADDLE_WITH_NCCL)
 #include
 #endif
 #endif
@@ -34,7 +34,7 @@ namespace paddle {
 namespace platform {
 #ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
+#if defined(PADDLE_WITH_NCCL)
 class Communicator;
 class NCCLCommunicator;
 #endif
@@ -140,7 +140,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
     LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
     operators::reader::LoDTensorBlockingQueueHolder,
 #ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
+#if defined(PADDLE_WITH_NCCL)
     ncclUniqueId, platform::Communicator, platform::NCCLCommunicator,
 #endif
     operators::CudnnRNNCache,
diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc
index 67dbfd740ed..2d7172e8010 100644
--- a/paddle/fluid/framework/var_type_traits_test.cc
+++ b/paddle/fluid/framework/var_type_traits_test.cc
@@ -24,7 +24,7 @@
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 #ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc
index 74a74ebe921..8ce257a6c37 100644
--- a/paddle/fluid/imperative/tests/nccl_context_test.cc
+++ b/paddle/fluid/imperative/tests/nccl_context_test.cc
@@ -29,7 +29,7 @@ imperative::ParallelStrategy GetStrategy(int local_rank) {
   return strategy;
 }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 void BcastNCCLId(int local_rank, ncclUniqueId *nccl_id) {
   auto strategy = GetStrategy(local_rank);
   platform::CUDAPlace gpu(local_rank);
diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc
index 14e2741e52e..b20e011f5cb 100644
--- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -28,7 +28,7 @@ template
 class CAllGatherOpCUDAKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto in = ctx.Input("X");
     auto out = ctx.Output("Out");
     ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index c661d421598..096a2f6a095 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -58,7 +58,7 @@ template
 class CAllReduceOpCUDAKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto in = ctx.Input("X");
     auto out = ctx.Output("Out");
diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
index a4433d0b3d1..f95d467345b 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -26,7 +26,7 @@ template
 class CBroadcastOpCUDAKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto x = ctx.Input("X");
     auto out = ctx.Output("Out");
     int numel = x->numel();
diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc
index 758affbd438..f72d2ac4d33 100644
--- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_all_op.cc
@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include
 #endif
 #include
@@ -25,7 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -52,7 +52,7 @@ class CCommInitAllOp : public framework::OperatorBase {
     PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
                       "CCommInitAllOp can run on gpu place only.");
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     std::vector devices = Attr>("devices");
     if (devices.empty()) {
       devices = platform::GetSelectedDevices();
diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc
index 16ca6e5238e..2bfc6e55301 100644
--- a/paddle/fluid/operators/collective/c_comm_init_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_op.cc
@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include
 #endif
 #include
@@ -24,7 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -46,7 +46,7 @@ class CCommInitOp : public framework::OperatorBase {
     auto var = scope.FindVar(Input("X"));
     PADDLE_ENFORCE_NOT_NULL(var);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     ncclUniqueId* nccl_id = var->GetMutable();
     int nranks = Attr("nranks");
--
GitLab
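
The change above is mechanical but consistent: every NCCL-touching region switches from a `defined(PADDLE_WITH_CUDA) && !defined(_WIN32)` guard to a dedicated `PADDLE_WITH_NCCL` guard, and `ParallelExecutor` additionally rejects multi-card runs when the build has CUDA but no NCCL. The standalone sketch below illustrates that guard-plus-runtime-check pattern; the function name `UsableDeviceCount` and the exception-based rejection are hypothetical stand-ins for illustration, not Paddle's actual API.

```cpp
// Minimal sketch of the guard pattern used in this patch (assumed names, not Paddle code):
// collective logic compiles only under an NCCL-specific macro, and a CUDA-only build
// that is asked for multiple cards refuses instead of silently mis-running.
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <vector>

// Hypothetical build flags; a real build would define them via CMake,
// e.g. -DWITH_NCCL=ON adding -DPADDLE_WITH_NCCL to the compile flags.
// #define PADDLE_WITH_CUDA
// #define PADDLE_WITH_NCCL

std::size_t UsableDeviceCount(const std::vector<int>& requested_devices) {
#if defined(PADDLE_WITH_NCCL)
  // NCCL available: all requested cards can participate in collective communication.
  return requested_devices.size();
#elif defined(PADDLE_WITH_CUDA)
  // CUDA without NCCL: cards cannot communicate, so reject multi-card requests.
  if (requested_devices.size() > 1) {
    throw std::runtime_error(
        "Multiple cards requested but the build has no NCCL support; "
        "recompile with WITH_NCCL=ON or use a single card.");
  }
  return 1;
#else
  // CPU-only build: the device list is ignored.
  (void)requested_devices;
  return 1;
#endif
}

int main() {
  try {
    std::cout << UsableDeviceCount({0, 1}) << " card(s) usable\n";
  } catch (const std::exception& e) {
    std::cout << "rejected: " << e.what() << "\n";
  }
  return 0;
}
```

Under this reading, the new `PADDLE_ENFORCE_EQ(places.size(), 1, ...)` check in `parallel_executor.cc` plays the role of the `#elif` branch above: a CUDA build without `WITH_NCCL` still compiles, but it fails fast with an explicit message rather than attempting multi-card communication it cannot perform.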