未验证 提交 49a80b45 编写于 作者: W Wilber 提交者: GitHub

cherry-pick 22509. test=develop test=release/1.7 (#22527)

[cherry-pick] #22509

支持在不依赖 NCCL 的情况下进行编译。

在多卡环境下,如果编译时没有打开 WITH_NCCL 开关,多卡之间无法通信,因此只能选择一张卡使用。
上级 59bb29db
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_NCCL
DECLARE_bool(sync_nccl_allreduce); DECLARE_bool(sync_nccl_allreduce);
#endif #endif
......
...@@ -303,7 +303,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, ...@@ -303,7 +303,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
const std::string &loss_var_name, const std::string &loss_var_name,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
const size_t &nranks, const size_t &nranks,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
const bool use_cuda, const bool use_cuda,
platform::NCCLCommunicator *nccl_ctxs) const { platform::NCCLCommunicator *nccl_ctxs) const {
#else #else
...@@ -326,7 +326,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, ...@@ -326,7 +326,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
pass->Erase(kNRanks); pass->Erase(kNRanks);
pass->Set<size_t>(kNRanks, new size_t(nranks)); pass->Set<size_t>(kNRanks, new size_t(nranks));
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr; platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs); pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx); pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
...@@ -339,7 +339,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, ...@@ -339,7 +339,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
pass->Erase(kLocalScopes); pass->Erase(kLocalScopes);
pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes, pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
&local_scopes); &local_scopes);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr; platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs); pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx); pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
...@@ -354,7 +354,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, ...@@ -354,7 +354,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
LOG(INFO) << "set enable_sequential_execution:" LOG(INFO) << "set enable_sequential_execution:"
<< enable_sequential_execution_; << enable_sequential_execution_;
} else if (pass->Type() == "all_reduce_deps_pass") { } else if (pass->Type() == "all_reduce_deps_pass") {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr; platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs); pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx); pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
......
...@@ -26,7 +26,7 @@ ...@@ -26,7 +26,7 @@
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
...@@ -162,7 +162,7 @@ struct BuildStrategy { ...@@ -162,7 +162,7 @@ struct BuildStrategy {
const std::string &loss_var_name, const std::string &loss_var_name,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
const size_t &nranks, const size_t &nranks,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
const bool use_cuda, const bool use_cuda,
platform::NCCLCommunicator *nccl_ctxs) const; platform::NCCLCommunicator *nccl_ctxs) const;
#else #else
......
...@@ -36,7 +36,7 @@ struct TestReduceOpHandle { ...@@ -36,7 +36,7 @@ struct TestReduceOpHandle {
std::vector<p::Place> gpu_list_; std::vector<p::Place> gpu_list_;
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_; std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_; std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif #endif
...@@ -44,7 +44,7 @@ struct TestReduceOpHandle { ...@@ -44,7 +44,7 @@ struct TestReduceOpHandle {
for (size_t j = 0; j < ctxs_.size(); ++j) { for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait(); ctxs_[j]->Wait();
} }
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
if (nccl_ctxs_) { if (nccl_ctxs_) {
nccl_ctxs_->WaitAll(); nccl_ctxs_->WaitAll();
} }
...@@ -54,7 +54,7 @@ struct TestReduceOpHandle { ...@@ -54,7 +54,7 @@ struct TestReduceOpHandle {
void InitCtxOnGpu(bool use_gpu) { void InitCtxOnGpu(bool use_gpu) {
use_gpu_ = use_gpu; use_gpu_ = use_gpu;
if (use_gpu) { if (use_gpu) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
int count = p::GetCUDADeviceCount(); int count = p::GetCUDADeviceCount();
if (count <= 1) { if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
...@@ -78,7 +78,7 @@ struct TestReduceOpHandle { ...@@ -78,7 +78,7 @@ struct TestReduceOpHandle {
gpu_list_.push_back(p); gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p)); ctxs_.emplace_back(new p::CPUDeviceContext(p));
} }
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
nccl_ctxs_.reset(nullptr); nccl_ctxs_.reset(nullptr);
#endif #endif
} }
...@@ -99,14 +99,14 @@ struct TestReduceOpHandle { ...@@ -99,14 +99,14 @@ struct TestReduceOpHandle {
nodes.emplace_back(new ir::Node("node")); nodes.emplace_back(new ir::Node("node"));
if (use_gpu_) { if (use_gpu_) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get())); gpu_list_, nccl_ctxs_.get()));
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW("CUDA is not support.");
#endif #endif
} else { } else {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get())); gpu_list_, nccl_ctxs_.get()));
#else #else
......
...@@ -38,7 +38,7 @@ limitations under the License. */ ...@@ -38,7 +38,7 @@ limitations under the License. */
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/platform/timer.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
...@@ -257,7 +257,7 @@ class DownpourWorker : public HogwildWorker { ...@@ -257,7 +257,7 @@ class DownpourWorker : public HogwildWorker {
std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_; std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
}; };
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
using ScopeQueue = operators::reader::BlockingQueue<Scope*>; using ScopeQueue = operators::reader::BlockingQueue<Scope*>;
class SyncFunctor { class SyncFunctor {
......
...@@ -24,7 +24,7 @@ std::shared_ptr<NCCLWrapper> NCCLWrapper::s_instance_ = NULL; ...@@ -24,7 +24,7 @@ std::shared_ptr<NCCLWrapper> NCCLWrapper::s_instance_ = NULL;
bool NCCLWrapper::is_initialized_ = false; bool NCCLWrapper::is_initialized_ = false;
void NCCLWrapper::InitNCCL() { void NCCLWrapper::InitNCCL() {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
&(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_, &(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_,
nccl_info_.my_global_rank_)); nccl_info_.my_global_rank_));
...@@ -33,14 +33,14 @@ void NCCLWrapper::InitNCCL() { ...@@ -33,14 +33,14 @@ void NCCLWrapper::InitNCCL() {
} }
void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) { void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
nccl_info_.nccl_id_ = nccl_info.nccl_id_; nccl_info_.nccl_id_ = nccl_info.nccl_id_;
#endif #endif
return; return;
} }
NCCLInfo NCCLWrapper::GetNCCLId() { NCCLInfo NCCLWrapper::GetNCCLId() {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_))); PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
#endif #endif
return nccl_info_; return nccl_info_;
...@@ -48,7 +48,7 @@ NCCLInfo NCCLWrapper::GetNCCLId() { ...@@ -48,7 +48,7 @@ NCCLInfo NCCLWrapper::GetNCCLId() {
void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank, void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
const int ranks) { const int ranks) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
nccl_info_.local_rank_ = local_rank; nccl_info_.local_rank_ = local_rank;
nccl_info_.my_global_rank_ = global_rank; nccl_info_.my_global_rank_ = global_rank;
nccl_info_.global_ranks_ = ranks; nccl_info_.global_ranks_ = ranks;
...@@ -60,7 +60,7 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank, ...@@ -60,7 +60,7 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope, void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
const std::vector<std::string>& var_names) { const std::vector<std::string>& var_names) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
for (auto& name : var_names) { for (auto& name : var_names) {
auto var = scope.FindVar(name); auto var = scope.FindVar(name);
LoDTensor* tensor = var->GetMutable<LoDTensor>(); LoDTensor* tensor = var->GetMutable<LoDTensor>();
......
...@@ -24,7 +24,7 @@ limitations under the License. */ ...@@ -24,7 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/framework/variable_helper.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/dynload/nccl.h"
#endif #endif
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
...@@ -41,7 +41,7 @@ class NCCLInfo { ...@@ -41,7 +41,7 @@ class NCCLInfo {
int local_rank_; int local_rank_;
int global_ranks_; int global_ranks_;
int my_global_rank_; int my_global_rank_;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
ncclUniqueId nccl_id_; ncclUniqueId nccl_id_;
ncclComm_t comm_; ncclComm_t comm_;
cudaStream_t stream_; cudaStream_t stream_;
......
...@@ -247,7 +247,7 @@ class ParallelExecutorPrivate { ...@@ -247,7 +247,7 @@ class ParallelExecutorPrivate {
std::unordered_map<std::string, bool> is_persistable_; std::unordered_map<std::string, bool> is_persistable_;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
platform::NCCLCommunicator *nccl_ctxs_{nullptr}; platform::NCCLCommunicator *nccl_ctxs_{nullptr};
#endif #endif
bool own_local_scope_; bool own_local_scope_;
...@@ -427,6 +427,16 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places, ...@@ -427,6 +427,16 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
} }
#endif #endif
#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_NCCL)
PADDLE_ENFORCE_EQ(
places.size(), 1,
platform::errors::PermissionDenied(
"Your machine has multiple cards, "
"but the WITH_NCCL option is not turned on during compilation, "
"and you cannot use multi-card training or prediction. "
"Please recompile and turn on the WITH_NCCL option."));
#endif
LOG(INFO) << string::Sprintf( LOG(INFO) << string::Sprintf(
"The Program will be executed on %s using ParallelExecutor, %lu " "The Program will be executed on %s using ParallelExecutor, %lu "
"cards are used, so %lu programs are executed in parallel.", "cards are used, so %lu programs are executed in parallel.",
...@@ -516,7 +526,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places, ...@@ -516,7 +526,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp // ncclOp
std::vector<ir::Graph *> async_graphs(places.size()); std::vector<ir::Graph *> async_graphs(places.size());
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
if (member_->build_strategy_.async_mode_) { if (member_->build_strategy_.async_mode_) {
VLOG(3) << "use local async mode"; VLOG(3) << "use local async mode";
graph = member_->build_strategy_.Apply( graph = member_->build_strategy_.Apply(
......
...@@ -110,7 +110,7 @@ class DistMultiTrainer : public MultiTrainer { ...@@ -110,7 +110,7 @@ class DistMultiTrainer : public MultiTrainer {
int dump_file_num_; int dump_file_num_;
}; };
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
class PipelineTrainer : public TrainerBase { class PipelineTrainer : public TrainerBase {
public: public:
PipelineTrainer() {} PipelineTrainer() {}
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#ifndef _WIN32 #if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cudnn.h> #include <cudnn.h>
#ifndef _WIN32 #if defined(PADDLE_WITH_NCCL)
#include <nccl.h> #include <nccl.h>
#endif #endif
#endif #endif
...@@ -34,7 +34,7 @@ namespace paddle { ...@@ -34,7 +34,7 @@ namespace paddle {
namespace platform { namespace platform {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#ifndef _WIN32 #if defined(PADDLE_WITH_NCCL)
class Communicator; class Communicator;
class NCCLCommunicator; class NCCLCommunicator;
#endif #endif
...@@ -140,7 +140,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< ...@@ -140,7 +140,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *, LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
operators::reader::LoDTensorBlockingQueueHolder, operators::reader::LoDTensorBlockingQueueHolder,
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#ifndef _WIN32 #if defined(PADDLE_WITH_NCCL)
ncclUniqueId, platform::Communicator, platform::NCCLCommunicator, ncclUniqueId, platform::Communicator, platform::NCCLCommunicator,
#endif #endif
operators::CudnnRNNCache, operators::CudnnRNNCache,
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#ifndef _WIN32 #if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
......
...@@ -29,7 +29,7 @@ imperative::ParallelStrategy GetStrategy(int local_rank) { ...@@ -29,7 +29,7 @@ imperative::ParallelStrategy GetStrategy(int local_rank) {
return strategy; return strategy;
} }
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
void BcastNCCLId(int local_rank, ncclUniqueId *nccl_id) { void BcastNCCLId(int local_rank, ncclUniqueId *nccl_id) {
auto strategy = GetStrategy(local_rank); auto strategy = GetStrategy(local_rank);
platform::CUDAPlace gpu(local_rank); platform::CUDAPlace gpu(local_rank);
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include <memory> #include <memory>
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
...@@ -28,7 +28,7 @@ template <typename T> ...@@ -28,7 +28,7 @@ template <typename T>
class CAllGatherOpCUDAKernel : public framework::OpKernel<T> { class CAllGatherOpCUDAKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
auto in = ctx.Input<framework::Tensor>("X"); auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out"); auto out = ctx.Output<framework::Tensor>("Out");
ncclDataType_t dtype = platform::ToNCCLDataType(in->type()); ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
......
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
...@@ -58,7 +58,7 @@ template <ReduceType red_type, typename T> ...@@ -58,7 +58,7 @@ template <ReduceType red_type, typename T>
class CAllReduceOpCUDAKernel : public framework::OpKernel<T> { class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
auto in = ctx.Input<framework::Tensor>("X"); auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out"); auto out = ctx.Output<framework::Tensor>("Out");
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_broadcast_op.h" #include "paddle/fluid/operators/collective/c_broadcast_op.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
...@@ -26,7 +26,7 @@ template <typename T> ...@@ -26,7 +26,7 @@ template <typename T>
class CBroadcastOpCUDAKernel : public framework::OpKernel<T> { class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
auto x = ctx.Input<framework::LoDTensor>("X"); auto x = ctx.Input<framework::LoDTensor>("X");
auto out = ctx.Output<framework::LoDTensor>("Out"); auto out = ctx.Output<framework::LoDTensor>("Out");
int numel = x->numel(); int numel = x->numel();
......
...@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
#include <nccl.h> #include <nccl.h>
#endif #endif
#include <stdint.h> #include <stdint.h>
...@@ -25,7 +25,7 @@ limitations under the License. */ ...@@ -25,7 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
...@@ -52,7 +52,7 @@ class CCommInitAllOp : public framework::OperatorBase { ...@@ -52,7 +52,7 @@ class CCommInitAllOp : public framework::OperatorBase {
PADDLE_ENFORCE_EQ(is_gpu_place(place), true, PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
"CCommInitAllOp can run on gpu place only."); "CCommInitAllOp can run on gpu place only.");
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
std::vector<int> devices = Attr<std::vector<int>>("devices"); std::vector<int> devices = Attr<std::vector<int>>("devices");
if (devices.empty()) { if (devices.empty()) {
devices = platform::GetSelectedDevices(); devices = platform::GetSelectedDevices();
......
...@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
#include <nccl.h> #include <nccl.h>
#endif #endif
#include <stdint.h> #include <stdint.h>
...@@ -24,7 +24,7 @@ limitations under the License. */ ...@@ -24,7 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
...@@ -46,7 +46,7 @@ class CCommInitOp : public framework::OperatorBase { ...@@ -46,7 +46,7 @@ class CCommInitOp : public framework::OperatorBase {
auto var = scope.FindVar(Input("X")); auto var = scope.FindVar(Input("X"));
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(var);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_NCCL)
ncclUniqueId* nccl_id = var->GetMutable<ncclUniqueId>(); ncclUniqueId* nccl_id = var->GetMutable<ncclUniqueId>();
int nranks = Attr<int>("nranks"); int nranks = Attr<int>("nranks");
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册