Unverified commit 7bc4b095, authored by Wilber, committed by GitHub

add WITH_NCCL option for cmake. (#22384)

Added a WITH_NCCL option to CMake to explicitly control whether the NCCL-related code is compiled. WITH_NCCL defaults to ON, but it is forced OFF when WITH_GPU is OFF.

Added the PADDLE_WITH_NCCL define.

A single-machine, single-card build can disable NCCL compilation; multi-card setups need NCCL to stay on by default. With NCCL disabled, only a single card can be used.
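For example, a single-card build can drop the NCCL code while a default GPU build keeps it (hypothetical configure invocations sketching the new option; only the WITH_NCCL/WITH_GPU semantics come from this PR):

```bash
# Hypothetical out-of-tree configure runs illustrating the new option.
# Single-machine single-card: NCCL compilation can be switched off explicitly.
cmake .. -DWITH_GPU=ON -DWITH_NCCL=OFF

# WITH_NCCL defaults to ON; it is forced OFF whenever WITH_GPU=OFF,
# and multi-card training requires it to stay ON.
cmake .. -DWITH_GPU=ON
```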
Co-authored-by: 石晓伟 <39303645+Shixiaowei02@users.noreply.github.com>
Parent c8b90d8f
@@ -89,6 +89,7 @@ option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VER
 option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE})
 option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
 option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF)
+option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)

 # PY_VERSION
 if(NOT PY_VERSION)
@@ -121,6 +122,27 @@ if(WIN32)
     set(WITH_DISTRIBUTE OFF CACHE STRING
             "Disable DISTRIBUTE when compiling for Windows" FORCE)
   endif()
+  if(WITH_NCCL)
+    MESSAGE(WARNING
+      "Disable NCCL when compiling for Windows. Force WITH_NCCL=OFF.")
+    set(WITH_NCCL OFF CACHE STRING
+            "Disable NCCL when compiling for Windows" FORCE)
+  endif()
+endif()
+
+if (NOT WITH_GPU AND WITH_NCCL)
+  MESSAGE(WARNING
+    "Disable NCCL when compiling without GPU. Force WITH_NCCL=OFF.")
+  set(WITH_NCCL OFF CACHE STRING
+          "Disable NCCL when compiling without GPU" FORCE)
+endif()
+
+if(WITH_NCCL)
+  add_definitions("-DPADDLE_WITH_NCCL")
+else()
+  if(WITH_GPU)
+    MESSAGE(WARNING "If the environment is multi-card, the WITH_NCCL option needs to be turned on, otherwise only a single card can be used.")
+  endif()
 endif()

 if(WITH_BRPC_RDMA)
......
@@ -28,7 +28,7 @@ namespace paddle {
 namespace framework {
 namespace details {

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places,
@@ -121,7 +121,7 @@ void AllReduceOpHandle::AllReduceFunc(
     const std::vector<platform::Place> &places,
     const std::vector<std::string> &out_var_names) {
   if (is_gpu_place(places[0])) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
     ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype);
     std::vector<std::function<void()>> all_reduce_calls;
@@ -161,7 +161,7 @@ void AllReduceOpHandle::AllReduceFunc(
   VLOG(10) << Name() << " size:" << numel * SizeOfType(dtype);
 }

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 void AllReduceOpHandle::NCCLAllReduceFunc(
     const std::vector<std::function<void()>> &all_reduce_calls) {
   this->RunAndRecordEvent([&] {
......
@@ -20,7 +20,7 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -29,7 +29,7 @@ namespace paddle {
 namespace framework {
 namespace details {

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 class AllReduceOpHandle : public NCCLOpHandleBase {
  public:
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
@@ -54,13 +54,13 @@ class AllReduceOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;

-#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
+#ifndef PADDLE_WITH_NCCL
   // NCCLOpHandleBase already have these attributes.
   // Will polish it by class inheritance framework.
   std::vector<platform::Place> places_;
 #endif

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   void NCCLAllReduceFunc(
       const std::vector<std::function<void()>> &all_reduce_calls);
......
@@ -73,7 +73,7 @@ void BroadcastOpHandle::BroadcastOneVar(
       });
     }
   } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     VarHandle *out_handle = nullptr;
     int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
     std::vector<std::function<void()>> broadcast_calls;
......
@@ -24,7 +24,7 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -34,7 +34,7 @@ namespace details {
 struct BroadcastOpHandle : public OpHandleBase {
  public:
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
                     const platform::NCCLContextMap *nccl_ctxs)
@@ -70,7 +70,7 @@ struct BroadcastOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   const platform::NCCLContextMap *nccl_ctxs_;
 #endif
......
@@ -44,7 +44,7 @@ struct TestBroadcastOpHandle {
   std::vector<std::unique_ptr<ir::Node>> nodes_;
   std::vector<p::Place> place_list_;
   bool use_gpu_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
@@ -52,7 +52,7 @@ struct TestBroadcastOpHandle {
     for (size_t j = 0; j < ctxs_.size(); ++j) {
       ctxs_[j]->Wait();
     }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     if (nccl_ctxs_) {
       nccl_ctxs_->WaitAll();
     }
@@ -62,7 +62,7 @@ struct TestBroadcastOpHandle {
   void InitCtxOnGpu(bool use_gpu) {
     use_gpu_ = use_gpu;
     if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       int count = p::GetCUDADeviceCount();
       if (count <= 1) {
         LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -86,7 +86,7 @@ struct TestBroadcastOpHandle {
         place_list_.push_back(p);
         ctxs_.emplace_back(new p::CPUDeviceContext(p));
       }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       nccl_ctxs_.reset(nullptr);
 #endif
     }
@@ -107,14 +107,14 @@ struct TestBroadcastOpHandle {
     nodes_.emplace_back(
         ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
     if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                          place_list_, nccl_ctxs_.get());
 #else
       PADDLE_THROW("CUDA is not support.");
 #endif
     } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                          place_list_, nccl_ctxs_.get());
 #else
......
@@ -28,7 +28,7 @@ namespace details {
 typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
     GradientAndLoDTensor;

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 FusedAllReduceOpHandle::FusedAllReduceOpHandle(
     ir::Node *node, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
......
@@ -21,7 +21,7 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -30,7 +30,7 @@ namespace paddle {
 namespace framework {
 namespace details {

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 struct FusedAllReduceOpHandle : public AllReduceOpHandle {
   FusedAllReduceOpHandle(ir::Node *node,
                          const std::vector<Scope *> &local_scopes,
......
@@ -25,7 +25,7 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -35,7 +35,7 @@ namespace details {
 struct FusedBroadcastOpHandle : public BroadcastOpHandle {
  public:
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   FusedBroadcastOpHandle(ir::Node *node,
                          const std::vector<Scope *> local_scopes,
                          const std::vector<platform::Place> &places,
......
@@ -45,14 +45,14 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
     nodes_.emplace_back(
         ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
     if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       op_handle_ = new FusedBroadcastOpHandle(
           nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
       PADDLE_THROW("CUDA is not supported.");
 #endif
     } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       op_handle_ = new FusedBroadcastOpHandle(
           nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
......
@@ -264,7 +264,7 @@ void ReduceOpHandle::RunImpl() {
       }
     });
   } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto pre_in = pre_in_var->Get<framework::LoDTensor>();
     VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
     VariableVisitor::GetMutableTensor(out_var).mutable_data(
......
@@ -24,7 +24,7 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -62,7 +62,7 @@ struct ReduceOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   const platform::NCCLContextMap *nccl_ctxs_;
   ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                  const std::vector<platform::Place> &places,
......
@@ -61,7 +61,7 @@ std::shared_ptr<DeviceWorker> DeviceWorkerFactory::CreateDeviceWorker(
 REGISTER_DEVICE_WORKER_CLASS(HogwildWorker);
 REGISTER_DEVICE_WORKER_CLASS(DownpourWorker);

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
 }  // namespace framework
......
@@ -4,7 +4,9 @@ else()
   cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope)
 endif(WITH_PSLIB)

-cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
+if(WITH_NCCL)
+  cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
+endif()

 if(WITH_BOX_PS)
   cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor box_ps)
 else()
......
@@ -39,7 +39,7 @@ class AllReduceDepsPass : public ir::Pass {
     std::vector<details::OpHandleBase*> all_reduce_op_handles =
         GetSortedAllReduceOps(*graph);

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto use_hierarchical_allreduce =
         Get<bool>(details::kUseHierarchicalAllReduce);
     for (size_t i = 0; i < all_reduce_op_handles.size(); ++i) {
......
@@ -38,7 +38,7 @@ class FuseAllReduceOpPass : public ir::Pass {
     auto &places = Get<const std::vector<platform::Place>>(details::kPlaces);
     auto &local_scopes = Get<const std::vector<Scope *>>(details::kLocalScopes);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto *multi_nccl_ctxs =
         &Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
 #endif
@@ -85,7 +85,7 @@ class FuseAllReduceOpPass : public ir::Pass {
       for (auto &p_g : group_p_g) {
         group_all_reduce_ops.emplace_back(all_reduce_ops.at(p_g.second));
       }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       InsertFusedAllReduce(places, local_scopes, group_size,
                            group_all_reduce_ops, multi_nccl_ctxs, &result);
 #else
@@ -134,7 +134,7 @@ class FuseAllReduceOpPass : public ir::Pass {
                             const std::vector<Scope *> &local_scopes,
                             const size_t num_of_all_reduce,
                             const std::vector<ir::Node *> &all_reduce_ops,
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
                             const platform::NCCLCommunicator *multi_nccl_ctxs,
 #endif
                             ir::Graph *result) const {
@@ -161,7 +161,7 @@ class FuseAllReduceOpPass : public ir::Pass {
       result->RemoveNode(op_handle.Node());
     }

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
                            local_scopes, multi_nccl_ctxs, result);
 #else
@@ -177,11 +177,11 @@ class FuseAllReduceOpPass : public ir::Pass {
                               const size_t num_of_all_reduce,
                               const std::vector<platform::Place> &places,
                               const std::vector<Scope *> &local_scopes,
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
                               const platform::NCCLCommunicator *multi_nccl_ctxs,
 #endif
                               ir::Graph *result) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto *op_handle = new details::FusedAllReduceOpHandle(
         result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
         local_scopes, places, num_of_all_reduce, multi_nccl_ctxs);
@@ -199,7 +199,7 @@ class FuseAllReduceOpPass : public ir::Pass {
       op_handle->AddOutput(out);
     }

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     if (!multi_nccl_ctxs) {
       SetCommunicationContext(places, op_handle);
     }
......
@@ -156,7 +156,7 @@ void MultiDevSSAGraphBuilderBase::Init() const {
   places_ = Get<const std::vector<platform::Place>>(details::kPlaces);
   local_scopes_ = Get<const std::vector<Scope *>>(details::kLocalScopes);
   strategy_ = Get<const details::BuildStrategy>(kStrategy);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   multi_nccl_ctxs_ = &Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
   nccl_ctxs_ = nullptr;
   if (multi_nccl_ctxs_) {
@@ -298,7 +298,7 @@ std::vector<ir::Node *> MultiDevSSAGraphBuilderBase::SortOperations(
 bool MultiDevSSAGraphBuilderBase::UseGPU() const {
   bool use_gpu = false;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   use_gpu = nccl_ctxs_ != nullptr;
 #endif
   return use_gpu;
@@ -348,7 +348,7 @@ void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result,
 void MultiDevSSAGraphBuilderBase::SetCommunicationContext(
     details::OpHandleBase *op_handle, const platform::Place &p) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   if (nccl_ctxs_ == nullptr) {
     op_handle->SetDeviceContext(p,
                                 platform::DeviceContextPool::Instance().Get(p));
@@ -362,7 +362,7 @@ void MultiDevSSAGraphBuilderBase::SetCommunicationContext(
 void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result,
                                                     const std::string &p_name,
                                                     size_t src_dev_id) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   auto *op_handle = new details::BroadcastOpHandle(
       result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_);
@@ -395,7 +395,7 @@ void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result,
 void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp(
     ir::Graph *result,
     const std::vector<std::unordered_set<std::string>> &bcast_varnames) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   auto *op_handle = new details::FusedBroadcastOpHandle(
       result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_);
@@ -451,7 +451,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
   auto append_allreduce_op = [&](
       const std::vector<Scope *> &scopes,
       const std::vector<platform::Place> &places) -> details::OpHandleBase * {
-#if defined(PADDLE_WITH_DGC)
+#if defined(PADDLE_WITH_DGC) && defined(PADDLE_WITH_NCCL)
     if (is_encoded) {
       result->Get<GraphOps>(kGraphOps).emplace_back(
           new details::SparseAllReduceOpHandle(
@@ -464,7 +464,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
               result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
               scopes, places, multi_nccl_ctxs_));
     }
-#elif defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#elif defined(PADDLE_WITH_NCCL)
     result->Get<GraphOps>(kGraphOps).emplace_back(
         new details::AllReduceOpHandle(
             result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
@@ -539,7 +539,7 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOps(
 details::VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(
     ir::Graph *result, const std::string &og, size_t dst_dev_id) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   result->Get<GraphOps>(kGraphOps).emplace_back(new details::ReduceOpHandle(
       result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_));
......
@@ -94,7 +94,7 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
   void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
                          size_t device_id) const;

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   mutable platform::NCCLContextMap *nccl_ctxs_{nullptr};
   mutable platform::NCCLCommunicator *multi_nccl_ctxs_{nullptr};
 #endif
......
@@ -109,7 +109,7 @@ class ParallelExecutorPrivate {
     }
   }

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) {
     VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_
             << ", num_trainers:" << bst.num_trainers_
@@ -473,7 +473,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   }

   if (member_->use_cuda_ && member_->nranks_ > 1) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_);

     // Initialize device context's nccl comm, will be used by normal
@@ -652,7 +652,7 @@ void ParallelExecutor::BCastParamsToDevices(
     }
     auto &dims = main_tensor.dims();
     if (paddle::platform::is_gpu_place(main_tensor.place())) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       std::vector<void *> buffers;
       buffers.reserve(member_->places_.size());
       size_t numel = main_tensor.numel();
......
@@ -32,7 +32,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
......
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/trainer.h"
......
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "google/protobuf/io/zero_copy_stream_impl.h"
 #include "google/protobuf/message.h"
 #include "google/protobuf/text_format.h"
......
@@ -63,7 +63,7 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
 REGISTER_TRAINER_CLASS(MultiTrainer);
 REGISTER_TRAINER_CLASS(DistMultiTrainer);

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 REGISTER_TRAINER_CLASS(PipelineTrainer);
 #endif
 }  // namespace framework
......
@@ -9,7 +9,9 @@ cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer)
 cc_library(engine SRCS engine.cc DEPS layer gradient_accumulator)
 cc_library(imperative_profiler SRCS profiler.cc)
 if(NOT WIN32)
-  cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
+  if(WITH_NCCL)
+    cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
+  endif()
   cc_library(data_loader SRCS data_loader.cc DEPS enforce)
 endif(NOT WIN32)
......
@@ -16,7 +16,7 @@
 namespace paddle {
 namespace imperative {

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 void NCCLParallelContext::RecvNCCLID(const std::string &ep,
                                      ncclUniqueId *nccl_id) {
   auto addr = paddle::string::Split(ep, ':');
......
@@ -26,7 +26,7 @@
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/device_context.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif
 #include "paddle/fluid/platform/place.h"
@@ -57,7 +57,7 @@ class ParallelContext {
   platform::Place place_;
 };

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 class NCCLParallelContext : ParallelContext {
  public:
   explicit NCCLParallelContext(const ParallelStrategy& strategy,
......
 if(WIN32)
   cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS device_context)
 else()
-  cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
+  if (WITH_NCCL)
+    cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
+  endif()
 endif(WIN32)
......
@@ -69,8 +69,10 @@ if (WITH_GPU)
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
   endif()
   if (NOT WIN32)
-    op_library(sync_batch_norm_op)
-    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
+    if (WITH_NCCL)
+      op_library(sync_batch_norm_op)
+      file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
+    endif()
   endif()
 else()
   op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
......
@@ -30,7 +30,7 @@ endforeach()
 register_operators(EXCLUDES c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})

-if(WITH_GPU AND NOT WIN32)
+if(WITH_NCCL)
   set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
   op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} nccl_common)
 endif()
......
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/operators/data_norm_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -178,7 +178,7 @@ class DataNormGradKernel<platform::CUDADeviceContext, T>
         d_batch_sum, d_batch_square_sum);

     if (need_sync_stats) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       auto comm = platform::NCCLCommContext::Instance().Get(0, ctx.GetPlace());
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
           reinterpret_cast<const void *>(d_batch_size),
......
@@ -30,7 +30,7 @@ endforeach()
 register_operators(EXCLUDES gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS})

-if(WITH_GPU AND NOT WIN32)
+if(WITH_NCCL)
   set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} nccl_common)
   op_library(gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS} nccl_common)
 endif()
......
+if (NOT WITH_NCCL)
+  return()
+endif()
+
 if(WITH_GPU AND NOT WIN32)
   nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
 endif()
......
@@ -305,7 +305,7 @@ CUDADeviceContext::~CUDADeviceContext() {
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_),
                                 "Failed to destory Cudnn handle");
   }
-#if !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   if (nccl_comm_) {
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
   }
......
@@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/cuda_helper.h"
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
-#if !defined(__APPLE__) && !defined(_WIN32)
+#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif
 #include "paddle/fluid/platform/gpu_info.h"
@@ -144,7 +144,7 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief Return cuda stream in the device context. */
   cudaStream_t stream() const;

-#if !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   /*! \brief Return nccl communicators. */
   ncclComm_t nccl_comm() const { return nccl_comm_; }
@@ -180,7 +180,7 @@ class CUDADeviceContext : public DeviceContext {
   std::unique_ptr<CublasHandleHolder> cublas_handle_;
   std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;

-#if !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   // NCCL communicator (single process version) for NCCL collective operations.
   // NCCL collective operations provides fast collectives over multiple GPUs
   // both within and across nodes.
......
@@ -5,7 +5,10 @@ list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc)
 # There is no macOS version of NCCL.
 # Disable nvrtc and cuda_driver api on MacOS and Windows, and only do a early test on Linux.
 if (NOT APPLE AND NOT WIN32)
-  list(APPEND CUDA_SRCS nccl.cc nvrtc.cc cuda_driver.cc)
+  list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
+  if (WITH_NCCL)
+    list(APPEND CUDA_SRCS nccl.cc)
+  endif()
 endif()

 if (TENSORRT_FOUND)
......
@@ -292,7 +292,7 @@ TEST(enforce, cuda_success) {
   EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS));
   EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED));
   EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE));
-#if !defined(__APPLE__) && !defined(_WIN32)
+#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
   EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
   EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError));
   EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError));
......
-set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper nccl_wrapper prune
+set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune
   feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool
   analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
   gloo_wrapper)
+
+if (WITH_NCCL)
+  set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
+endif()

 if(NOT WIN32)
-  set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context data_loader)
+  set(PYBIND_DEPS ${PYBIND_DEPS} data_loader)
+  if (WITH_NCCL)
+    set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context)
+  endif()
 endif(NOT WIN32)

 if(WITH_PYTHON)
@@ -25,7 +32,6 @@ set(PYBIND_SRCS
     fleet_wrapper_py.cc
     gloo_wrapper_py.cc
     box_helper_py.cc
-    nccl_wrapper_py.cc
     data_set_py.cc
     imperative.cc
     ir.cc
@@ -35,15 +41,19 @@ if (WITH_DISTRIBUTE)
   list(APPEND PYBIND_SRCS communicator_py.cc)
 endif()

+if (WITH_NCCL)
+  list(APPEND PYBIND_SRCS nccl_wrapper_py.cc)
+endif()
+
 if(WITH_PYTHON)
   # generate op pybind functions automatically for dygraph.
   set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag)
   list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB})
   list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS})
-  if(NOT WIN32)
+  if(WITH_NCCL)
     list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context)
-  endif(NOT WIN32)
+  endif(WITH_NCCL)

   add_executable(op_function_generator op_function_generator.cc)
   target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS})
......
@@ -611,7 +611,7 @@ void BindImperative(py::module *m_ptr) {
           },
           [](imperative::ParallelStrategy &self,
              const std::string &ep) { self.current_endpoint_ = ep; });
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   py::class_<imperative::NCCLParallelContext> nccl_ctx(m,
                                                        "NCCLParallelContext");
......
@@ -2209,7 +2209,7 @@ All parameter, weight, gradient are variables in Paddle.
   BindFleetWrapper(&m);
   BindGlooWrapper(&m);
   BindBoxHelper(&m);
-#ifndef _WIN32
+#ifdef PADDLE_WITH_NCCL
   BindNCCLWrapper(&m);
 #endif
   BindGraph(&m);
......
@@ -4,6 +4,9 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FL
 set(dist_ENVS http_proxy="" https_proxy="")
 file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py")
+if(NOT WITH_NCCL)
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl")
+endif()
 string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}")
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
......