Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
65bbf950
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
65bbf950
编写于
5月 27, 2019
作者:
G
gongweibao
提交者:
GitHub
5月 27, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add multi-ncclcomm and 2D ncclallreduce support. (#17263)
上级
b1bd483a
变更
27
隐藏空白更改
内联
并排
Showing
27 changed file
with
862 addition
and
166 deletion
+862
-166
paddle/fluid/framework/details/all_reduce_op_handle.cc
paddle/fluid/framework/details/all_reduce_op_handle.cc
+8
-23
paddle/fluid/framework/details/all_reduce_op_handle.h
paddle/fluid/framework/details/all_reduce_op_handle.h
+12
-5
paddle/fluid/framework/details/build_strategy.cc
paddle/fluid/framework/details/build_strategy.cc
+21
-12
paddle/fluid/framework/details/build_strategy.h
paddle/fluid/framework/details/build_strategy.h
+12
-1
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
...uid/framework/details/fast_threaded_ssa_graph_executor.cc
+1
-0
paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+5
-17
paddle/fluid/framework/details/fused_all_reduce_op_handle.h
paddle/fluid/framework/details/fused_all_reduce_op_handle.h
+8
-5
paddle/fluid/framework/details/multi_devices_helper.h
paddle/fluid/framework/details/multi_devices_helper.h
+1
-0
paddle/fluid/framework/details/nccl_op_handle.h
paddle/fluid/framework/details/nccl_op_handle.h
+234
-0
paddle/fluid/framework/details/op_handle_base.cc
paddle/fluid/framework/details/op_handle_base.cc
+5
-0
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
...le/fluid/framework/details/parallel_ssa_graph_executor.cc
+1
-0
paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
...le/fluid/framework/details/sparse_all_reduce_op_handle.cc
+3
-2
paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
+1
-1
paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
...ework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
+27
-13
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
...rk/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
+13
-12
paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
...k/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
+8
-4
paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
...rk/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
+2
-1
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+97
-35
paddle/fluid/framework/parallel_executor.h
paddle/fluid/framework/parallel_executor.h
+0
-3
paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
+166
-26
paddle/fluid/platform/nccl_helper.h
paddle/fluid/platform/nccl_helper.h
+130
-2
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+28
-0
python/paddle/fluid/compiler.py
python/paddle/fluid/compiler.py
+8
-0
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+4
-0
python/paddle/fluid/tests/unittests/test_dist_base.py
python/paddle/fluid/tests/unittests/test_dist_base.py
+13
-1
python/paddle/fluid/tests/unittests/test_dist_mnist.py
python/paddle/fluid/tests/unittests/test_dist_mnist.py
+14
-0
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+40
-3
未找到文件。
paddle/fluid/framework/details/all_reduce_op_handle.cc
浏览文件 @
65bbf950
...
...
@@ -35,16 +35,9 @@ namespace details {
AllReduceOpHandle
::
AllReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
platform
::
NCCLContextMap
*
ctxs
)
:
OpHandleBase
(
node
),
local_scopes_
(
local_scopes
),
places_
(
places
),
nccl_ctxs_
(
ctxs
)
{
if
(
nccl_ctxs_
)
{
for
(
auto
&
p
:
places_
)
{
this
->
SetDeviceContext
(
p
,
nccl_ctxs_
->
DevCtx
(
p
));
}
}
const
platform
::
MultiNCCLContextMap
*
ctxs
)
:
NCCLOpHandleBase
(
node
,
places
,
ctxs
),
local_scopes_
(
local_scopes
)
{
PADDLE_ENFORCE_EQ
(
places_
.
size
(),
local_scopes_
.
size
());
}
#else
AllReduceOpHandle
::
AllReduceOpHandle
(
ir
::
Node
*
node
,
...
...
@@ -71,7 +64,9 @@ void AllReduceOpHandle::RunAllReduceFuncs(
if
(
FLAGS_sync_nccl_allreduce
)
{
for
(
auto
&
p
:
places_
)
{
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
p
).
device
;
auto
&
nccl_ctx
=
nccl_ctxs_
->
at
(
dev_id
);
auto
*
nccl_ctxs
=
nccl_ctxs_
->
GetRunEnvNCCLCtx
(
run_order_
,
use_hierarchical_allreduce_
);
auto
&
nccl_ctx
=
nccl_ctxs
->
at
(
dev_id
);
auto
stream
=
nccl_ctx
.
stream
();
cudaError_t
e_sync
=
cudaStreamSynchronize
(
stream
);
if
(
e_sync
!=
0
)
{
...
...
@@ -134,19 +129,9 @@ void AllReduceOpHandle::RunImpl() {
numel
=
static_cast
<
size_t
>
(
lod_tensor
.
numel
());
}
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
p
).
device
;
auto
&
nccl_ctx
=
nccl_ctxs_
->
at
(
dev_id
);
auto
stream
=
nccl_ctx
.
stream
();
auto
comm
=
nccl_ctx
.
comm_
;
VLOG
(
10
)
<<
"before all reduce buffer:"
<<
buffer
<<
", numel:"
<<
numel
<<
", dev_id:"
<<
dev_id
<<
", dtype:"
<<
dtype
<<
", place:"
<<
p
;
all_reduce_calls
.
emplace_back
([
=
]
{
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclAllReduce
(
buffer
,
buffer
,
numel
,
static_cast
<
ncclDataType_t
>
(
dtype
),
ncclSum
,
comm
,
stream
));
NCCLAllReduce
(
p
,
buffer
,
buffer
,
numel
,
static_cast
<
ncclDataType_t
>
(
dtype
),
ncclSum
);
});
}
RunAllReduceFuncs
(
all_reduce_calls
);
...
...
paddle/fluid/framework/details/all_reduce_op_handle.h
浏览文件 @
65bbf950
...
...
@@ -21,6 +21,7 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/framework/details/nccl_op_handle.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
...
...
@@ -28,13 +29,15 @@ namespace paddle {
namespace
framework
{
namespace
details
{
class
AllReduceOpHandle
:
public
OpHandleBase
{
public:
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
class
AllReduceOpHandle
:
public
NCCLOpHandleBase
{
public:
AllReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
platform
::
NCCLContextMap
*
ctxs
);
const
platform
::
Multi
NCCLContextMap
*
ctxs
);
#else
class
AllReduceOpHandle
:
public
OpHandleBase
{
public:
AllReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
);
#endif
...
...
@@ -46,13 +49,17 @@ class AllReduceOpHandle : public OpHandleBase {
protected:
void
RunImpl
()
override
;
std
::
vector
<
Scope
*>
local_scopes_
;
#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
// NCCLOpHandleBase already have these attributes.
// Will polish it by class inheritance framework.
std
::
vector
<
platform
::
Place
>
places_
;
#endif
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void
RunAllReduceFuncs
(
const
std
::
vector
<
std
::
function
<
void
()
>>
&
all_reduce_calls
);
const
platform
::
NCCLContextMap
*
nccl_ctxs_
;
#endif
};
...
...
paddle/fluid/framework/details/build_strategy.cc
浏览文件 @
65bbf950
...
...
@@ -256,16 +256,14 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
return
framework
::
ir
::
MultiDevSSAGraphBuilder
().
count
(
pass_name
)
>
0
;
}
ir
::
Graph
*
BuildStrategy
::
Apply
(
ir
::
Graph
*
graph
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
string
&
loss_var_name
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
size_t
&
nranks
,
ir
::
Graph
*
BuildStrategy
::
Apply
(
ir
::
Graph
*
graph
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
string
&
loss_var_name
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
size_t
&
nranks
,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const
bool
use_cuda
,
platform
::
NCCLContextMap
*
nccl_ctxs
)
const
{
const
bool
use_cuda
,
platform
::
MultiNCCLContextMap
*
nccl_ctxs
)
const
{
#else
const
bool
use_cuda
)
const
{
const
bool
use_cuda
)
const
{
#endif
VLOG
(
3
)
<<
"apply all passes"
;
// Create a default one if not finalized by user.
...
...
@@ -285,9 +283,9 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
pass
->
Set
<
size_t
>
(
ir
::
kNRanks
,
new
size_t
(
nranks
));
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform
::
NCCLContextMap
*
nctx
=
use_cuda
?
nccl_ctxs
:
nullptr
;
platform
::
Multi
NCCLContextMap
*
nctx
=
use_cuda
?
nccl_ctxs
:
nullptr
;
pass
->
Erase
(
kNCCLCtxs
);
pass
->
SetNotOwned
<
platform
::
NCCLContextMap
>
(
kNCCLCtxs
,
nctx
);
pass
->
SetNotOwned
<
platform
::
Multi
NCCLContextMap
>
(
kNCCLCtxs
,
nctx
);
#endif
}
else
if
(
pass
->
Type
()
==
"alloc_continuous_space_for_grad_pass"
||
pass
->
Type
()
==
"fuse_adam_op_pass"
||
...
...
@@ -301,9 +299,12 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
&
local_scopes
);
if
(
pass
->
Type
()
==
"fuse_all_reduce_op_pass"
)
{
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform
::
NCCLContextMap
*
nctx
=
use_cuda
?
nccl_ctxs
:
nullptr
;
platform
::
Multi
NCCLContextMap
*
nctx
=
use_cuda
?
nccl_ctxs
:
nullptr
;
pass
->
Erase
(
kNCCLCtxs
);
pass
->
SetNotOwned
<
platform
::
NCCLContextMap
>
(
kNCCLCtxs
,
nctx
);
pass
->
SetNotOwned
<
platform
::
MultiNCCLContextMap
>
(
kNCCLCtxs
,
nctx
);
pass
->
Erase
(
kUseHierarchicalAllReduce
);
pass
->
Set
<
bool
>
(
kUseHierarchicalAllReduce
,
new
bool
(
use_hierarchical_allreduce_
));
#endif
}
}
else
if
(
pass
->
Type
()
==
"alloc_continuous_space_for_grad_pass"
)
{
...
...
@@ -316,6 +317,14 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
LOG
(
INFO
)
<<
"set enable_sequential_execution:"
<<
enable_sequential_execution_
;
}
else
if
(
pass
->
Type
()
==
"all_reduce_deps_pass"
)
{
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform
::
MultiNCCLContextMap
*
nctx
=
use_cuda
?
nccl_ctxs
:
nullptr
;
pass
->
Erase
(
kNCCLCtxs
);
pass
->
SetNotOwned
<
platform
::
MultiNCCLContextMap
>
(
kNCCLCtxs
,
nctx
);
pass
->
Erase
(
kUseHierarchicalAllReduce
);
pass
->
Set
<
bool
>
(
kUseHierarchicalAllReduce
,
new
bool
(
use_hierarchical_allreduce_
));
#endif
LOG
(
INFO
)
<<
"SeqOnlyAllReduceOps:"
<<
SeqOnlyAllReduceOps
(
*
this
)
<<
", num_trainers:"
<<
num_trainers_
;
}
else
if
(
pass
->
Type
()
==
"fuse_relu_depthwise_conv_pass"
)
{
...
...
paddle/fluid/framework/details/build_strategy.h
浏览文件 @
65bbf950
...
...
@@ -111,6 +111,17 @@ struct BuildStrategy {
bool
cache_runtime_context_
{
false
};
std
::
unordered_set
<
std
::
string
>
mkldnn_enabled_op_types_
;
size_t
nccl_comm_num_
{
1
};
// The picture is here:
// https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396
bool
use_hierarchical_allreduce_
{
false
};
// Nccl ranks in a node when use hierarchical allreduce, it's setted to gpu
// cards' number in most cases.
size_t
hierarchical_allreduce_inter_nranks_
{
0
};
// Nccl ranks bewteen nodes when use hierarchical allreduce, it's setted to
// nodes number.
size_t
hierarchical_allreduce_exter_nranks_
{
0
};
// NOTE:
// Before you add new options, think if it's a general strategy that works
// with other strategy. If not, the strategy should be created through
...
...
@@ -136,7 +147,7 @@ struct BuildStrategy {
const
size_t
&
nranks
,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const
bool
use_cuda
,
platform
::
NCCLContextMap
*
nccl_ctxs
)
const
;
platform
::
Multi
NCCLContextMap
*
nccl_ctxs
)
const
;
#else
const
bool
use_cuda
)
const
;
#endif
...
...
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
浏览文件 @
65bbf950
...
...
@@ -49,6 +49,7 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
FeedFetchList
FastThreadedSSAGraphExecutor
::
Run
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
)
{
VLOG
(
3
)
<<
"enter FastThreadedSSAGraphExecutor Run"
;
std
::
unique_ptr
<
std
::
unordered_map
<
OpHandleBase
*
,
std
::
atomic
<
int
>>>
op_deps
=
atomic_op_deps_
.
get
();
PrepareAtomicOpDeps
();
...
...
paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
浏览文件 @
65bbf950
...
...
@@ -44,17 +44,10 @@ typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
FusedAllReduceOpHandle
::
FusedAllReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
size_t
num_of_all_reduce
,
const
platform
::
NCCLContextMap
*
ctxs
)
:
OpHandleBase
(
node
),
const
platform
::
Multi
NCCLContextMap
*
ctxs
)
:
NCCLOpHandleBase
(
node
,
places
,
ctxs
),
local_scopes_
(
local_scopes
),
places_
(
places
),
num_of_all_reduce_
(
num_of_all_reduce
),
nccl_ctxs_
(
ctxs
)
{
if
(
nccl_ctxs_
)
{
for
(
auto
&
p
:
places_
)
{
this
->
SetDeviceContext
(
p
,
nccl_ctxs_
->
DevCtx
(
p
));
}
}
num_of_all_reduce_
(
num_of_all_reduce
)
{
PADDLE_ENFORCE_EQ
(
places_
.
size
(),
local_scopes_
.
size
());
}
#else
...
...
@@ -167,14 +160,9 @@ void FusedAllReduceOpHandle::RunImpl() {
auto
&
p
=
places_
[
i
];
void
*
buffer
=
const_cast
<
void
*>
(
lod_tensor_data
.
at
(
i
));
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
p
).
device
;
auto
&
nccl_ctx
=
nccl_ctxs_
->
at
(
dev_id
);
auto
stream
=
nccl_ctx
.
stream
();
auto
comm
=
nccl_ctx
.
comm_
;
all_reduce_calls
.
emplace_back
([
=
]
{
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclAllReduce
(
buffer
,
buffer
,
numel
,
static_cast
<
ncclDataType_t
>
(
nccl_dtype
),
ncclSum
,
comm
,
stream
));
NCCLAllReduce
(
p
,
buffer
,
buffer
,
numel
,
static_cast
<
ncclDataType_t
>
(
nccl_dtype
),
ncclSum
);
});
}
...
...
paddle/fluid/framework/details/fused_all_reduce_op_handle.h
浏览文件 @
65bbf950
...
...
@@ -21,6 +21,7 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/framework/details/nccl_op_handle.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
...
...
@@ -28,14 +29,15 @@ namespace paddle {
namespace
framework
{
namespace
details
{
struct
FusedAllReduceOpHandle
:
public
OpHandleBase
{
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
struct
FusedAllReduceOpHandle
:
public
NCCLOpHandleBase
{
FusedAllReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
size_t
num_of_all_reduce
,
const
platform
::
NCCLContextMap
*
ctxs
);
const
platform
::
Multi
NCCLContextMap
*
ctxs
);
#else
struct
FusedAllReduceOpHandle
:
public
OpHandleBase
{
FusedAllReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
...
...
@@ -52,11 +54,12 @@ struct FusedAllReduceOpHandle : public OpHandleBase {
private:
std
::
vector
<
Scope
*>
local_scopes_
;
#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
// NCCLOpHandleBase already have these attributes.
// Will polish it by class inheritance framework.
std
::
vector
<
platform
::
Place
>
places_
;
size_t
num_of_all_reduce_
;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const
platform
::
NCCLContextMap
*
nccl_ctxs_
;
#endif
size_t
num_of_all_reduce_
;
// Check the dtype of the input
void
GetDTypeAndNumel
(
...
...
paddle/fluid/framework/details/multi_devices_helper.h
浏览文件 @
65bbf950
...
...
@@ -45,6 +45,7 @@ constexpr char kGraphVars[] = "vars";
constexpr
char
kPlaces
[]
=
"places"
;
constexpr
char
kLocalScopes
[]
=
"local_scopes"
;
constexpr
char
kNCCLCtxs
[]
=
"nccl_ctxs"
;
constexpr
char
kUseHierarchicalAllReduce
[]
=
"use_hierarchical_allreduce"
;
// aux variables to represent dependency. Useful to resolve data hazard.
typedef
std
::
unordered_set
<
VarHandleBase
*>
GraphDepVars
;
...
...
paddle/fluid/framework/details/nccl_op_handle.h
0 → 100644
浏览文件 @
65bbf950
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/nccl_helper.h"
DECLARE_bool
(
sync_nccl_allreduce
);
namespace
paddle
{
namespace
framework
{
namespace
details
{
class
NCCLOpHandleBase
:
public
OpHandleBase
{
public:
NCCLOpHandleBase
(
ir
::
Node
*
node
,
const
std
::
vector
<
platform
::
Place
>&
places
,
const
platform
::
MultiNCCLContextMap
*
nccl_ctxs
)
:
OpHandleBase
(
node
),
places_
(
places
),
nccl_ctxs_
(
nccl_ctxs
)
{
if
(
nccl_ctxs
==
nullptr
)
{
return
;
}
// init device context
auto
default_nccl_ctxs
=
nccl_ctxs_
->
DefaultFlatCtx
();
for
(
auto
&
p
:
places_
)
{
this
->
SetDeviceContext
(
p
,
default_nccl_ctxs
->
DevCtx
(
p
));
}
}
virtual
~
NCCLOpHandleBase
()
{
for
(
auto
&
ev
:
inter_events_
)
{
PADDLE_ENFORCE
(
cudaEventDestroy
(
ev
.
second
));
}
for
(
auto
&
ev
:
exter_events_
)
{
PADDLE_ENFORCE
(
cudaEventDestroy
(
ev
.
second
));
}
}
void
SetRunEnv
(
int
run_order
,
bool
use_hierarchical_allreduce
)
{
PADDLE_ENFORCE
(
run_order
>=
0
,
"run_order must >= 0"
);
run_order_
=
run_order
;
use_hierarchical_allreduce_
=
use_hierarchical_allreduce
;
VLOG
(
10
)
<<
"SetRunEnv "
<<
" run_order:"
<<
run_order
<<
", use_hierarchical_allreduce:"
<<
use_hierarchical_allreduce
;
if
(
nccl_ctxs_
==
nullptr
)
{
return
;
}
if
(
!
use_hierarchical_allreduce_
)
{
auto
ctxs
=
nccl_ctxs_
->
GetFlatCtx
(
run_order
);
for
(
auto
&
p
:
places_
)
{
this
->
SetDeviceContext
(
p
,
ctxs
->
DevCtx
(
p
));
}
return
;
}
PADDLE_ENFORCE
(
places_
.
size
()
==
1
,
"HierarchicalAllReduce run one proc with one card mode."
);
for
(
auto
&
p
:
places_
)
{
auto
ctxs
=
nccl_ctxs_
->
GetHierarchicalInterCtx
(
run_order
);
this
->
SetDeviceContext
(
p
,
ctxs
->
DevCtx
(
p
));
}
for
(
auto
&
p
:
dev_ctxes_
)
{
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
p
.
first
).
device
;
if
(
inter_events_
.
find
(
dev_id
)
!=
inter_events_
.
end
())
{
continue
;
}
PADDLE_ENFORCE
(
cudaSetDevice
(
dev_id
));
PADDLE_ENFORCE
(
cudaEventCreateWithFlags
(
&
inter_events_
[
dev_id
],
cudaEventDisableTiming
));
PADDLE_ENFORCE
(
cudaEventCreateWithFlags
(
&
exter_events_
[
dev_id
],
cudaEventDisableTiming
));
VLOG
(
10
)
<<
"Create events on dev_id:"
<<
dev_id
<<
", inter_event:"
<<
&
inter_events_
[
dev_id
]
<<
", exter_event:"
<<
&
exter_events_
[
dev_id
];
}
}
void
FlatNCCLAllReduce
(
platform
::
Place
place
,
const
void
*
sendbuff
,
void
*
recvbuff
,
size_t
count
,
ncclDataType_t
datatype
,
ncclRedOp_t
op
)
{
PADDLE_ENFORCE
(
run_order_
>=
0
,
"run_order must > 0"
);
auto
flat_nccl_ctxs
=
nccl_ctxs_
->
GetFlatCtx
(
run_order_
);
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
place
).
device
;
auto
&
nccl_ctx
=
flat_nccl_ctxs
->
at
(
dev_id
);
auto
stream
=
nccl_ctx
.
stream
();
auto
comm
=
nccl_ctx
.
comm_
;
VLOG
(
10
)
<<
"before all reduce buffer:"
<<
sendbuff
<<
", numel:"
<<
count
<<
", dev_id:"
<<
dev_id
<<
", dtype:"
<<
datatype
<<
", place:"
<<
place
;
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclAllReduce
(
sendbuff
,
recvbuff
,
count
,
datatype
,
op
,
comm
,
stream
));
}
void
NCCLAllReduce
(
platform
::
Place
place
,
const
void
*
sendbuff
,
void
*
recvbuff
,
size_t
count
,
ncclDataType_t
datatype
,
ncclRedOp_t
op
)
{
PADDLE_ENFORCE
(
run_order_
>=
0
,
"run_order must > 0"
);
if
(
!
use_hierarchical_allreduce_
)
{
FlatNCCLAllReduce
(
place
,
sendbuff
,
recvbuff
,
count
,
datatype
,
op
);
return
;
}
HierarchicalAllReduce
(
place
,
sendbuff
,
recvbuff
,
count
,
datatype
,
op
);
}
void
HierarchicalAllReduce
(
platform
::
Place
place
,
const
void
*
sendbuff
,
void
*
recvbuff
,
size_t
count
,
ncclDataType_t
datatype
,
ncclRedOp_t
op
)
{
PADDLE_ENFORCE
(
run_order_
>=
0
,
"run_order must > 0"
);
InterReduce
(
place
,
sendbuff
,
recvbuff
,
count
,
datatype
,
op
);
// When a trainer is not in exter allreduce ring
// they need not to call this.
if
(
nccl_ctxs_
->
NeedExterAllReduce
())
{
ExterAllReduce
(
place
,
recvbuff
,
recvbuff
,
count
,
datatype
,
op
);
}
InterBroadCast
(
place
,
recvbuff
,
count
,
datatype
,
op
);
}
protected:
void
InterReduce
(
platform
::
Place
place
,
const
void
*
sendbuff
,
void
*
recvbuff
,
size_t
count
,
ncclDataType_t
datatype
,
ncclRedOp_t
op
)
{
auto
nccl_ctxs
=
nccl_ctxs_
->
GetHierarchicalInterCtx
(
run_order_
);
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
place
).
device
;
auto
&
nccl_ctx
=
nccl_ctxs
->
at
(
dev_id
);
auto
stream
=
nccl_ctx
.
stream
();
auto
comm
=
nccl_ctx
.
comm_
;
VLOG
(
10
)
<<
"before all reduce"
<<
" run_order:"
<<
run_order_
<<
", buffer:"
<<
sendbuff
<<
", numel:"
<<
count
<<
", dev_id:"
<<
dev_id
<<
", dtype:"
<<
datatype
<<
", place:"
<<
place
<<
", stream:"
<<
stream
;
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclReduce
(
sendbuff
,
recvbuff
,
count
,
datatype
,
ncclSum
,
0
,
comm
,
stream
));
cudaEventRecord
(
inter_events_
.
at
(
dev_id
),
stream
);
if
(
FLAGS_sync_nccl_allreduce
)
{
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
),
"sync HierarchicalAllReduce inter stream error"
);
}
}
void
ExterAllReduce
(
platform
::
Place
place
,
const
void
*
sendbuff
,
void
*
recvbuff
,
size_t
count
,
ncclDataType_t
datatype
,
ncclRedOp_t
op
)
{
auto
nccl_ctxs
=
nccl_ctxs_
->
GetHierarchicalExterCtx
(
run_order_
);
PADDLE_ENFORCE
(
nccl_ctxs_
,
"can't get exter %d nccl_ctxs"
,
run_order_
);
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
place
).
device
;
auto
&
nccl_ctx
=
nccl_ctxs
->
at
(
dev_id
);
auto
stream
=
nccl_ctx
.
stream
();
auto
comm
=
nccl_ctx
.
comm_
;
VLOG
(
10
)
<<
"before all reduce run_order:"
<<
run_order_
<<
"buffer:"
<<
sendbuff
<<
", numel:"
<<
count
<<
", dev_id:"
<<
dev_id
<<
", dtype:"
<<
datatype
<<
", place:"
<<
place
<<
", stream:"
<<
stream
;
cudaStreamWaitEvent
(
stream
,
inter_events_
.
at
(
dev_id
),
0
);
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclAllReduce
(
sendbuff
,
recvbuff
,
count
,
datatype
,
op
,
comm
,
stream
));
cudaEventRecord
(
exter_events_
.
at
(
dev_id
),
stream
);
if
(
FLAGS_sync_nccl_allreduce
)
{
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
),
"sync HierarchicalAllReduce exter stream error"
);
}
}
void
InterBroadCast
(
platform
::
Place
place
,
void
*
sendbuff
,
size_t
count
,
ncclDataType_t
datatype
,
ncclRedOp_t
op
)
{
auto
nccl_ctxs
=
nccl_ctxs_
->
GetHierarchicalInterCtx
(
run_order_
);
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
place
).
device
;
auto
&
nccl_ctx
=
nccl_ctxs
->
at
(
dev_id
);
auto
stream
=
nccl_ctx
.
stream
();
auto
comm
=
nccl_ctx
.
comm_
;
VLOG
(
10
)
<<
"before InterBroadCast buffer:"
<<
sendbuff
<<
", numel:"
<<
count
<<
", dev_id:"
<<
dev_id
<<
", dtype:"
<<
datatype
<<
", place:"
<<
place
<<
", stream:"
<<
stream
;
cudaStreamWaitEvent
(
stream
,
exter_events_
.
at
(
dev_id
),
0
);
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
sendbuff
,
count
,
datatype
,
0
,
comm
,
stream
));
}
protected:
std
::
vector
<
platform
::
Place
>
places_
;
const
platform
::
MultiNCCLContextMap
*
nccl_ctxs_
{
nullptr
};
// When multi trainer call collective function, they need run the same order.
// Or the program will hang.So we use allreduce_deps_pass to set this
// run_order_.
int
run_order_
{
0
};
// Use 2d allreduce or not.
bool
use_hierarchical_allreduce_
{
false
};
private:
// hierarchical needed events
std
::
unordered_map
<
int
,
cudaEvent_t
>
inter_events_
;
std
::
unordered_map
<
int
,
cudaEvent_t
>
exter_events_
;
};
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/op_handle_base.cc
浏览文件 @
65bbf950
...
...
@@ -187,6 +187,11 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
std
::
function
<
void
()
>
method
=
callback
;
for
(
auto
&
p
:
dev_ctxes_
)
{
method
=
[
method
,
p
,
this
]()
{
VLOG
(
10
)
<<
"cudadevicecontext:"
<<
static_cast
<
platform
::
CUDADeviceContext
*>
(
p
.
second
)
<<
", dev_id:"
<<
boost
::
get
<
platform
::
CUDAPlace
>
(
p
.
first
).
device
;
static_cast
<
platform
::
CUDADeviceContext
*>
(
p
.
second
)
->
RecordEvent
(
events_
.
at
(
boost
::
get
<
platform
::
CUDAPlace
>
(
p
.
first
).
device
),
method
);
...
...
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
浏览文件 @
65bbf950
...
...
@@ -95,6 +95,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
auto
seq_allreduce_pass
=
ir
::
PassRegistry
::
Instance
().
Get
(
"all_reduce_deps_pass"
);
seq_allreduce_pass
->
Set
<
bool
>
(
kUseHierarchicalAllReduce
,
new
bool
(
false
));
for
(
size_t
i
=
0
;
i
<
graphs_
.
size
();
++
i
)
{
graphs_
[
i
].
reset
(
seq_allreduce_pass
->
Apply
(
graphs_
[
i
].
release
()));
}
...
...
paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
浏览文件 @
65bbf950
...
...
@@ -30,7 +30,7 @@ namespace details {
SparseAllReduceOpHandle
::
SparseAllReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
platform
::
NCCLContextMap
*
ctxs
,
bool
is_encoded
,
int
nranks
)
const
platform
::
Multi
NCCLContextMap
*
ctxs
,
bool
is_encoded
,
int
nranks
)
:
AllReduceOpHandle
(
node
,
local_scopes
,
places
,
ctxs
),
is_encoded_
(
is_encoded
),
nranks_
(
nranks
)
{
...
...
@@ -102,7 +102,8 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
out_numel
=
(
out_numel
==
0
)
?
static_cast
<
size_t
>
(
out
.
numel
())
:
out_numel
;
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
place
).
device
;
auto
&
nccl_ctx
=
nccl_ctxs_
->
at
(
dev_id
);
auto
*
nccl_ctxs
=
nccl_ctxs_
->
GetRunEnvNCCLCtx
(
run_order_
,
false
);
auto
&
nccl_ctx
=
nccl_ctxs
->
at
(
dev_id
);
auto
stream
=
nccl_ctx
.
stream
();
auto
comm
=
nccl_ctx
.
comm_
;
...
...
paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
浏览文件 @
65bbf950
...
...
@@ -32,7 +32,7 @@ class SparseAllReduceOpHandle : public AllReduceOpHandle {
SparseAllReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
platform
::
NCCLContextMap
*
ctxs
,
const
platform
::
Multi
NCCLContextMap
*
ctxs
,
bool
is_encoded
=
false
,
int
nranks
=
-
1
);
std
::
string
Name
()
const
override
;
...
...
paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
浏览文件 @
65bbf950
...
...
@@ -22,6 +22,7 @@
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
...
...
@@ -35,9 +36,20 @@ namespace ir {
class
AllReduceDepsPass
:
public
ir
::
Pass
{
protected:
void
ApplyImpl
(
ir
::
Graph
*
graph
)
const
override
{
std
::
vector
<
details
::
AllReduceOpHandl
e
*>
all_reduce_op_handles
=
std
::
vector
<
details
::
OpHandleBas
e
*>
all_reduce_op_handles
=
GetSortedAllReduceOps
(
*
graph
);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto
use_hierarchical_allreduce
=
Get
<
bool
>
(
details
::
kUseHierarchicalAllReduce
);
for
(
size_t
i
=
0
;
i
<
all_reduce_op_handles
.
size
();
++
i
)
{
auto
op_handle
=
dynamic_cast
<
details
::
NCCLOpHandleBase
*>
(
all_reduce_op_handles
[
i
]);
PADDLE_ENFORCE
(
op_handle
,
"op_handle must be NCCLOpHandleBase"
);
op_handle
->
SetRunEnv
(
i
,
use_hierarchical_allreduce
);
}
#endif
for
(
size_t
i
=
1
;
i
<
all_reduce_op_handles
.
size
();
++
i
)
{
auto
*
dep_var
=
new
details
::
DummyVarHandle
(
graph
->
CreateControlDepVar
());
graph
->
Get
<
details
::
GraphDepVars
>
(
details
::
kGraphDepVars
)
...
...
@@ -51,13 +63,12 @@ class AllReduceDepsPass : public ir::Pass {
}
}
std
::
vector
<
details
::
AllReduceOpHandl
e
*>
GetSortedAllReduceOps
(
std
::
vector
<
details
::
OpHandleBas
e
*>
GetSortedAllReduceOps
(
const
ir
::
Graph
&
graph
)
const
{
std
::
vector
<
details
::
AllReduceOpHandl
e
*>
all_reduce_op_handles
;
std
::
vector
<
details
::
OpHandleBas
e
*>
all_reduce_op_handles
;
std
::
unordered_map
<
details
::
OpHandleBase
*
,
size_t
>
pending_ops
;
std
::
unordered_set
<
details
::
OpHandleBase
*>
ready_ops
;
std
::
unordered_set
<
details
::
OpHandleBase
*>
next_ready_ops
;
auto
op_handles
=
ir
::
FilterByNodeWrapper
<
details
::
OpHandleBase
>
(
graph
);
size_t
num_of_ops
=
op_handles
.
size
();
for
(
details
::
OpHandleBase
*
op
:
op_handles
)
{
...
...
@@ -95,13 +106,16 @@ class AllReduceDepsPass : public ir::Pass {
void
GetSortedAllReduceOps
(
const
std
::
unordered_set
<
details
::
OpHandleBase
*>&
ready_ops
,
std
::
vector
<
details
::
AllReduceOpHandl
e
*>*
all_reduce_op_handles
)
const
{
std
::
vector
<
details
::
AllReduceOpHandl
e
*>
current_all_reduce_op_handles
;
std
::
vector
<
details
::
OpHandleBas
e
*>*
all_reduce_op_handles
)
const
{
std
::
vector
<
details
::
OpHandleBas
e
*>
current_all_reduce_op_handles
;
for
(
auto
&
op_handle
:
ready_ops
)
{
auto
all_reduce_op_handle
=
dynamic_cast
<
details
::
AllReduceOpHandle
*>
(
op_handle
);
if
(
all_reduce_op_handle
)
{
current_all_reduce_op_handles
.
emplace_back
(
all_reduce_op_handle
);
auto
fused_all_reduce_op_handle
=
dynamic_cast
<
details
::
FusedAllReduceOpHandle
*>
(
op_handle
);
if
(
all_reduce_op_handle
||
fused_all_reduce_op_handle
)
{
current_all_reduce_op_handles
.
emplace_back
(
op_handle
);
}
}
...
...
@@ -110,8 +124,8 @@ class AllReduceDepsPass : public ir::Pass {
// Sort the current_all_reduce_op_handles according to the name of input.
sort
(
current_all_reduce_op_handles
.
begin
(),
current_all_reduce_op_handles
.
end
(),
[](
const
details
::
AllReduceOpHandl
e
*
left
,
const
details
::
AllReduceOpHandl
e
*
right
)
->
bool
{
[](
const
details
::
OpHandleBas
e
*
left
,
const
details
::
OpHandleBas
e
*
right
)
->
bool
{
auto
left_in_vars
=
details
::
DynamicCast
<
details
::
VarHandle
>
(
left
->
Inputs
());
auto
right_in_vars
=
...
...
@@ -126,9 +140,9 @@ class AllReduceDepsPass : public ir::Pass {
current_all_reduce_op_handles
.
end
());
}
void
DebugString
(
const
ir
::
Graph
&
graph
,
const
std
::
vector
<
details
::
AllReduceOpHandle
*>&
all_reduce_op_handles
)
const
{
void
DebugString
(
const
ir
::
Graph
&
graph
,
const
std
::
vector
<
details
::
OpHandleBase
*>&
all_reduce_op_handles
)
const
{
// get vars order
std
::
map
<
int
,
std
::
vector
<
std
::
string
>>
vars
=
GetSoredGradientsFromStaleProgram
(
graph
);
...
...
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
浏览文件 @
65bbf950
...
...
@@ -34,7 +34,8 @@ class FuseAllReduceOpPass : public ir::Pass {
auto
&
places
=
Get
<
const
std
::
vector
<
platform
::
Place
>>
(
details
::
kPlaces
);
auto
&
local_scopes
=
Get
<
const
std
::
vector
<
Scope
*>>
(
details
::
kLocalScopes
);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto
*
nccl_ctxs
=
&
Get
<
platform
::
NCCLContextMap
>
(
details
::
kNCCLCtxs
);
auto
*
multi_nccl_ctxs
=
&
Get
<
platform
::
MultiNCCLContextMap
>
(
details
::
kNCCLCtxs
);
#endif
std
::
unordered_set
<
std
::
string
>
grads
;
...
...
@@ -94,7 +95,7 @@ class FuseAllReduceOpPass : public ir::Pass {
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
InsertFusedAllReduce
(
places
,
local_scopes
,
group_size
,
group_all_reduce_ops
,
nccl_ctxs
,
&
result
);
group_all_reduce_ops
,
multi_
nccl_ctxs
,
&
result
);
#else
InsertFusedAllReduce
(
places
,
local_scopes
,
group_size
,
group_all_reduce_ops
,
&
result
);
...
...
@@ -102,14 +103,14 @@ class FuseAllReduceOpPass : public ir::Pass {
}
}
void
InsertFusedAllReduce
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
Scope
*>
&
local_scop
es
,
const
size_t
num_of_all_reduce
,
const
std
::
vector
<
ir
::
Node
*>
&
all_reduce_ops
,
void
InsertFusedAllReduce
(
const
std
::
vector
<
platform
::
Place
>
&
plac
es
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
size_t
num_of_all_reduce
,
const
std
::
vector
<
ir
::
Node
*>
&
all_reduce_ops
,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const
platform
::
NCCLContextMap
*
nccl_ctxs
,
const
platform
::
MultiNCCLContextMap
*
multi_
nccl_ctxs
,
#endif
ir
::
Graph
*
result
)
const
{
ir
::
Graph
*
result
)
const
{
std
::
vector
<
details
::
VarHandleBase
*>
inputs
;
std
::
vector
<
details
::
VarHandleBase
*>
outputs
;
for
(
auto
&
op
:
all_reduce_ops
)
{
...
...
@@ -135,7 +136,7 @@ class FuseAllReduceOpPass : public ir::Pass {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
CreateFusedAllReduceOp
(
inputs
,
outputs
,
num_of_all_reduce
,
places
,
local_scopes
,
nccl_ctxs
,
result
);
local_scopes
,
multi_
nccl_ctxs
,
result
);
#else
CreateFusedAllReduceOp
(
inputs
,
outputs
,
num_of_all_reduce
,
places
,
local_scopes
,
result
);
...
...
@@ -150,13 +151,13 @@ class FuseAllReduceOpPass : public ir::Pass {
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const
platform
::
NCCLContextMap
*
nccl_ctxs
,
const
platform
::
MultiNCCLContextMap
*
multi_
nccl_ctxs
,
#endif
ir
::
Graph
*
result
)
const
{
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto
*
op_handle
=
new
details
::
FusedAllReduceOpHandle
(
result
->
CreateEmptyNode
(
"fused_all_reduce"
,
ir
::
Node
::
Type
::
kOperation
),
local_scopes
,
places
,
num_of_all_reduce
,
nccl_ctxs
);
local_scopes
,
places
,
num_of_all_reduce
,
multi_
nccl_ctxs
);
#else
auto
*
op_handle
=
new
details
::
FusedAllReduceOpHandle
(
result
->
CreateEmptyNode
(
"fused_all_reduce"
,
ir
::
Node
::
Type
::
kOperation
),
...
...
@@ -172,7 +173,7 @@ class FuseAllReduceOpPass : public ir::Pass {
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if
(
!
nccl_ctxs
)
{
if
(
!
multi_
nccl_ctxs
)
{
SetCommunicationContext
(
places
,
op_handle
);
}
#else
...
...
paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
浏览文件 @
65bbf950
...
...
@@ -157,7 +157,11 @@ void MultiDevSSAGraphBuilderBase::Init() const {
local_scopes_
=
Get
<
const
std
::
vector
<
Scope
*>>
(
details
::
kLocalScopes
);
strategy_
=
Get
<
const
details
::
BuildStrategy
>
(
kStrategy
);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
nccl_ctxs_
=
&
Get
<
platform
::
NCCLContextMap
>
(
details
::
kNCCLCtxs
);
multi_nccl_ctxs_
=
&
Get
<
platform
::
MultiNCCLContextMap
>
(
details
::
kNCCLCtxs
);
nccl_ctxs_
=
nullptr
;
if
(
multi_nccl_ctxs_
)
{
nccl_ctxs_
=
multi_nccl_ctxs_
->
DefaultFlatCtx
();
}
#endif
PADDLE_ENFORCE_EQ
(
places_
.
size
(),
local_scopes_
.
size
());
}
...
...
@@ -460,20 +464,20 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
new
details
::
SparseAllReduceOpHandle
(
result
->
CreateEmptyNode
(
"allreduce"
,
ir
::
Node
::
Type
::
kOperation
),
scopes
,
places
,
nccl_ctxs_
,
is_encoded
,
scopes
,
places
,
multi_
nccl_ctxs_
,
is_encoded
,
static_cast
<
int
>
(
strategy_
.
trainers_endpoints_
.
size
())
*
places_
.
size
()));
}
else
{
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
new
details
::
AllReduceOpHandle
(
result
->
CreateEmptyNode
(
"allreduce"
,
ir
::
Node
::
Type
::
kOperation
),
scopes
,
places
,
nccl_ctxs_
));
scopes
,
places
,
multi_
nccl_ctxs_
));
}
#elif defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
new
details
::
AllReduceOpHandle
(
result
->
CreateEmptyNode
(
"allreduce"
,
ir
::
Node
::
Type
::
kOperation
),
scopes
,
places
,
nccl_ctxs_
));
scopes
,
places
,
multi_
nccl_ctxs_
));
#else
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
new
details
::
AllReduceOpHandle
(
...
...
paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
浏览文件 @
65bbf950
...
...
@@ -96,7 +96,8 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
size_t
device_id
)
const
;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
mutable
platform
::
NCCLContextMap
*
nccl_ctxs_
;
mutable
platform
::
NCCLContextMap
*
nccl_ctxs_
{
nullptr
};
mutable
platform
::
MultiNCCLContextMap
*
multi_nccl_ctxs_
{
nullptr
};
#endif
mutable
std
::
string
loss_var_name_
;
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
65bbf950
...
...
@@ -94,6 +94,89 @@ class ParallelExecutorPrivate {
}
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void
InitNCCLCtxs
(
framework
::
Scope
*
scope
,
const
BuildStrategy
&
bst
)
{
VLOG
(
1
)
<<
"nccl comm num:"
<<
bst
.
nccl_comm_num_
<<
", nranks:"
<<
nranks_
<<
", num_trainers:"
<<
bst
.
num_trainers_
<<
", trainer_id:"
<<
bst
.
trainer_id_
;
if
(
bst
.
use_hierarchical_allreduce_
)
{
VLOG
(
1
)
<<
", use_hierarchical_allreduce:"
<<
bst
.
use_hierarchical_allreduce_
<<
", inter_trainers_num:"
<<
bst
.
hierarchical_allreduce_inter_nranks_
<<
", exter_trainers_num:"
<<
bst
.
hierarchical_allreduce_exter_nranks_
;
}
std
::
vector
<
ncclUniqueId
*>
flat_nccl_ids
;
if
(
nranks_
==
1
)
{
// FIXME(gongwb): need not to create ncclid when nranks==1
nccl_ctxs_
.
InitFlatCtxs
(
places_
,
flat_nccl_ids
,
bst
.
num_trainers_
,
bst
.
trainer_id_
);
return
;
}
if
(
bst
.
enable_parallel_graph_
)
{
VLOG
(
1
)
<<
"use only one ncclid in pg model"
;
ncclUniqueId
*
nccl_id
=
nullptr
;
std
::
string
var_name
=
platform
::
GetFlatNCCLVarName
(
0
);
auto
nccl_id_var
=
scope
->
FindVar
(
var_name
);
if
(
nccl_id_var
)
{
nccl_id
=
nccl_id_var
->
GetMutable
<
ncclUniqueId
>
();
}
else
{
nccl_id
=
new
ncclUniqueId
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclGetUniqueId
(
nccl_id
));
}
flat_nccl_ids
.
push_back
(
nccl_id
);
nccl_ctxs_
.
InitFlatCtxs
(
places_
,
flat_nccl_ids
,
bst
.
num_trainers_
,
bst
.
trainer_id_
);
VLOG
(
1
)
<<
"init bst nccl context complete!"
;
return
;
}
// num_trainers ==1 && places > 1
if
(
bst
.
num_trainers_
==
1
)
{
nccl_ctxs_
.
InitFlatCtxs
(
places_
,
flat_nccl_ids
,
bst
.
num_trainers_
,
bst
.
trainer_id_
);
return
;
}
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
bst
.
nccl_comm_num_
);
i
++
)
{
std
::
string
var_name
=
platform
::
GetFlatNCCLVarName
(
i
);
auto
nccl_id_var
=
scope
->
FindVar
(
var_name
);
PADDLE_ENFORCE
(
nccl_id_var
,
"can't find %s nccl_id_var"
,
var_name
);
auto
nccl_id
=
nccl_id_var
->
GetMutable
<
ncclUniqueId
>
();
flat_nccl_ids
.
push_back
(
nccl_id
);
}
nccl_ctxs_
.
InitFlatCtxs
(
places_
,
flat_nccl_ids
,
bst
.
num_trainers_
,
bst
.
trainer_id_
);
if
(
bst
.
use_hierarchical_allreduce_
)
{
std
::
string
var_name
=
platform
::
GetHierarchicalInterNCCLVarName
();
auto
nccl_id_var
=
scope
->
FindVar
(
var_name
);
auto
inter_nccl_id
=
nccl_id_var
->
GetMutable
<
ncclUniqueId
>
();
std
::
vector
<
ncclUniqueId
*>
exter_nccl_ids
;
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
bst
.
nccl_comm_num_
);
i
++
)
{
std
::
string
var_name
=
platform
::
GetHierarchicalExterNCCLVarName
(
i
);
auto
nccl_id_var
=
scope
->
FindVar
(
var_name
);
PADDLE_ENFORCE
(
nccl_id_var
,
"can't find %s nccl_id_var"
,
var_name
);
auto
nccl_id
=
nccl_id_var
->
GetMutable
<
ncclUniqueId
>
();
exter_nccl_ids
.
push_back
(
nccl_id
);
}
nccl_ctxs_
.
InitHierarchicalCtxs
(
places_
,
inter_nccl_id
,
exter_nccl_ids
,
bst
.
num_trainers_
,
bst
.
trainer_id_
,
bst
.
hierarchical_allreduce_inter_nranks_
,
bst
.
hierarchical_allreduce_exter_nranks_
);
}
}
#endif
BuildStrategy
build_strategy_
;
std
::
vector
<
platform
::
Place
>
places_
;
std
::
vector
<
Scope
*>
local_scopes_
;
...
...
@@ -101,7 +184,7 @@ class ParallelExecutorPrivate {
std
::
unique_ptr
<
details
::
SSAGraphExecutor
>
executor_
;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std
::
unique_ptr
<
platform
::
NCCLContextMap
>
nccl_ctxs_
;
platform
::
MultiNCCLContextMap
nccl_ctxs_
;
#endif
bool
own_local_scope_
;
bool
use_cuda_
;
...
...
@@ -254,24 +337,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
if
(
member_
->
use_cuda_
)
{
// Bcast Parameters to all GPUs
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
ncclUniqueId
*
nccl_id
=
nullptr
;
// gen_nccl_id operator can broadcast the ncclUniqueId for nccl2 collective
// distributed training
auto
*
nccl_id_var
=
scope
->
FindVar
(
NCCL_ID_VARNAME
);
if
(
nccl_id_var
!=
nullptr
)
{
nccl_id
=
nccl_id_var
->
GetMutable
<
ncclUniqueId
>
();
}
if
(
build_strategy
.
enable_parallel_graph_
&&
member_
->
nranks_
>
1UL
)
{
if
(
nccl_id
==
nullptr
)
{
local_nccl_id_
.
reset
(
new
ncclUniqueId
());
platform
::
dynload
::
ncclGetUniqueId
(
local_nccl_id_
.
get
());
nccl_id
=
local_nccl_id_
.
get
();
}
}
member_
->
nccl_ctxs_
.
reset
(
new
platform
::
NCCLContextMap
(
member_
->
places_
,
nccl_id
,
build_strategy
.
num_trainers_
,
build_strategy
.
trainer_id_
));
member_
->
InitNCCLCtxs
(
scope
,
build_strategy
);
// Initialize device context's nccl comm, will be used by normal
// Operators like sync_batch_norm, and collective ops.
...
...
@@ -280,22 +346,14 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
// NOTE: NCCL group-calls and non-group-calls can not use the same
// NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use
// same communicators.
std
::
unique_ptr
<
platform
::
NCCLContextMap
>
dev_nccl_ctxs
;
if
(
nccl_id
==
nullptr
)
{
dev_nccl_ctxs
.
reset
(
new
platform
::
NCCLContextMap
(
member_
->
places_
));
}
for
(
size_t
dev_id
=
0
;
dev_id
<
member_
->
places_
.
size
();
++
dev_id
)
{
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
pool
.
Get
(
member_
->
places_
[
dev_id
]));
if
(
nccl_id
!=
nullptr
)
{
auto
&
nccl_ctx
=
member_
->
nccl_ctxs_
->
at
(
member_
->
places_
[
dev_id
]);
dev_ctx
->
set_nccl_comm
(
nccl_ctx
.
comm
());
}
else
{
auto
&
nccl_ctx
=
dev_nccl_ctxs
->
at
(
member_
->
places_
[
dev_id
]);
dev_ctx
->
set_nccl_comm
(
nccl_ctx
.
comm
());
}
auto
&
nccl_ctx
=
member_
->
nccl_ctxs_
.
DefaultFlatCtx
()
->
at
(
member_
->
places_
[
dev_id
]);
dev_ctx
->
set_nccl_comm
(
nccl_ctx
.
comm
());
}
#else
PADDLE_THROW
(
"Not compiled with CUDA"
);
...
...
@@ -327,18 +385,18 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
VLOG
(
3
)
<<
"use local async mode"
;
graph
=
build_strategy
.
Apply
(
graph
,
{
member_
->
places_
[
0
]},
loss_var_name
,
{
member_
->
local_scopes_
[
0
]},
1
,
member_
->
use_cuda_
,
member_
->
nccl_ctxs_
.
get
()
);
member_
->
use_cuda_
,
&
member_
->
nccl_ctxs_
);
for
(
size_t
i
=
1
;
i
<
member_
->
places_
.
size
();
++
i
)
{
graphs
[
i
]
=
build_strategy
.
Apply
(
graphs
[
i
],
{
member_
->
places_
[
i
]},
loss_var_name
,
{
member_
->
local_scopes_
[
i
]},
1
,
member_
->
use_cuda_
,
member_
->
nccl_ctxs_
.
get
()
);
member_
->
use_cuda_
,
&
member_
->
nccl_ctxs_
);
async_graphs
[
i
]
=
graphs
[
i
];
}
}
else
{
graph
=
build_strategy
.
Apply
(
graph
,
member_
->
places_
,
loss_var_name
,
member_
->
local_scopes_
,
member_
->
nranks_
,
member_
->
use_cuda_
,
member_
->
nccl_ctxs_
.
get
()
);
member_
->
use_cuda_
,
&
member_
->
nccl_ctxs_
);
}
#else
if
(
build_strategy
.
async_mode_
)
{
...
...
@@ -471,13 +529,14 @@ void ParallelExecutor::BCastParamsToDevices(
PADDLE_ENFORCE_EQ
(
member_
->
places_
.
size
(),
buffers
.
size
(),
"variables' buffer size to bcast NOT equal to places"
);
{
auto
*
nccl_ctxs
=
member_
->
nccl_ctxs_
.
DefaultFlatCtx
();
platform
::
NCCLGroupGuard
guard
;
for
(
size_t
i
=
0
;
i
<
member_
->
places_
.
size
();
++
i
)
{
auto
&
nccl_ctx
=
member_
->
nccl_ctxs_
->
at
(
member_
->
places_
[
i
]);
auto
&
nccl_ctx
=
nccl_ctxs
->
at
(
member_
->
places_
[
i
]);
platform
::
dynload
::
ncclBcast
(
buffers
[
i
],
numel
,
data_type
,
0
,
nccl_ctx
.
comm_
,
nccl_ctx
.
stream
());
}
member_
->
nccl_ctxs_
->
WaitAll
();
nccl_ctxs
->
WaitAll
();
}
#else
PADDLE_THROW
(
"Not compiled with CUDA"
);
...
...
@@ -512,6 +571,7 @@ void ParallelExecutor::BCastParamsToDevices(
void
ParallelExecutor
::
Run
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
,
const
std
::
string
&
fetched_var_name
)
{
VLOG
(
3
)
<<
"enter ParallelExecutor Run"
;
#ifdef WITH_GPERFTOOLS
if
(
gProfileStarted
)
{
ProfilerFlush
();
...
...
@@ -522,6 +582,8 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
if
(
member_
->
HasGarbageCollectors
())
{
member_
->
ResetRuntimeReferenceCount
(
fetch_tensors
,
fetched_var_name
);
}
VLOG
(
3
)
<<
"ParallelExecutor begin to run member_->executor_->Run"
;
auto
fetch_data
=
member_
->
executor_
->
Run
(
fetch_tensors
);
*
member_
->
global_scope_
->
Var
(
fetched_var_name
)
->
GetMutable
<
FeedFetchList
>
()
=
fetch_data
;
...
...
paddle/fluid/framework/parallel_executor.h
浏览文件 @
65bbf950
...
...
@@ -87,9 +87,6 @@ class ParallelExecutor {
ParallelExecutorPrivate
*
member_
;
std
::
vector
<
std
::
unique_ptr
<
ir
::
Graph
>>
async_graphs_
;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std
::
unique_ptr
<
ncclUniqueId
>
local_nccl_id_
;
#endif
};
}
// namespace framework
...
...
paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
浏览文件 @
65bbf950
...
...
@@ -41,31 +41,129 @@ class GenNCCLIdOp : public framework::OperatorBase {
// put nccl id in CPUPlace
auto
&
dev_ctx
=
*
pool
.
Get
(
platform
::
CPUPlace
());
int
trainer_id
=
Attr
<
int
>
(
"trainer_id"
);
std
::
vector
<
std
::
string
>
trainers
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"trainers"
);
PADDLE_ENFORCE
(
trainer_id
>=
0
&&
trainer_id
<
static_cast
<
int
>
(
trainers
.
size
()),
"trainer_id:%d must be in trainers.size range"
,
trainer_id
);
std
::
string
endpoint
=
trainers
[
trainer_id
];
framework
::
Scope
&
local_scope
=
scope
.
NewScope
();
int
nccl_comm_num
=
Attr
<
int
>
(
"nccl_comm_num"
);
int
use_hierarchical_allreduce
=
Attr
<
bool
>
(
"use_hierarchical_allreduce"
);
int
inter_nranks
=
Attr
<
int
>
(
"hierarchical_allreduce_inter_nranks"
);
int
inter_trainer_id
=
-
1
;
int
exter_trainer_id
=
-
1
;
if
(
use_hierarchical_allreduce
)
{
PADDLE_ENFORCE
(
trainers
.
size
()
>
1
,
"trainers.size():%llu < 1"
,
trainers
.
size
());
PADDLE_ENFORCE
(
inter_nranks
>
1
,
"inter_nranks:%d < 1"
,
inter_nranks
);
PADDLE_ENFORCE
((
trainers
.
size
()
%
inter_nranks
==
0
),
"trainers.size():%llu mod inter_nranks:%d != 0"
,
trainers
.
size
(),
inter_nranks
);
inter_trainer_id
=
trainer_id
%
inter_nranks
;
if
(
trainer_id
%
inter_nranks
==
0
)
{
exter_trainer_id
=
trainer_id
/
inter_nranks
;
}
}
if
(
trainer_id
!=
0
)
{
GetIdByServer
(
endpoint
,
&
local_scope
,
dev_ctx
,
nccl_comm_num
,
use_hierarchical_allreduce
,
trainer_id
,
inter_trainer_id
,
exter_trainer_id
);
}
std
::
ostringstream
ss
;
for
(
size_t
i
=
0
;
i
<
trainers
.
size
();
i
++
)
{
ss
<<
trainers
[
i
]
<<
","
;
}
VLOG
(
1
)
<<
"trainer_id:"
<<
trainer_id
<<
", use_hierarchical_allreduce:"
<<
use_hierarchical_allreduce
<<
", inter_nranks:"
<<
inter_nranks
<<
", inter_trainer_id:"
<<
inter_trainer_id
<<
", exter_trainer_id:"
<<
exter_trainer_id
<<
", trainers:"
<<
ss
.
str
();
// init flat
if
(
trainer_id
==
0
)
{
GenerateAndSend
(
&
local_scope
,
dev_ctx
);
}
else
{
GetIdByServer
(
&
local_scope
,
dev_ctx
);
std
::
vector
<
std
::
string
>
flat_endpoints
;
flat_endpoints
.
insert
(
flat_endpoints
.
begin
(),
trainers
.
begin
()
+
1
,
trainers
.
end
());
// flat nccl_id
for
(
int
i
=
0
;
i
<
nccl_comm_num
;
i
++
)
{
std
::
string
var_name
=
platform
::
GetFlatNCCLVarName
(
i
);
GenerateAndSend
(
&
local_scope
,
dev_ctx
,
var_name
,
flat_endpoints
);
}
}
if
(
!
use_hierarchical_allreduce
)
{
return
;
}
PADDLE_ENFORCE
(
trainers
.
size
()
%
inter_nranks
==
0
,
"enpoints.size:%llu mod inter_nranks:%d should ==0"
,
trainers
.
size
(),
inter_nranks
);
PADDLE_ENFORCE
(
inter_nranks
>
1
,
"inter_nranks:%d must > 1"
,
inter_nranks
);
// hierarchical inter ncclid
if
(
inter_trainer_id
==
0
)
{
std
::
ostringstream
ss
;
ss
<<
endpoint
;
std
::
vector
<
std
::
string
>
inter_endpoints
;
for
(
int
i
=
trainer_id
+
1
;
i
<
trainer_id
+
inter_nranks
&&
i
<
static_cast
<
int
>
(
trainers
.
size
());
i
++
)
{
ss
<<
","
;
inter_endpoints
.
push_back
(
trainers
[
i
]);
ss
<<
trainers
[
i
];
}
VLOG
(
1
)
<<
"Hierarchical inter ring endpoints:"
<<
ss
.
str
();
std
::
string
nccl_var_name
=
platform
::
GetHierarchicalInterNCCLVarName
();
GenerateAndSend
(
&
local_scope
,
dev_ctx
,
nccl_var_name
,
inter_endpoints
);
}
// hierarchical exter ncclid
if
(
exter_trainer_id
==
0
)
{
std
::
ostringstream
ss
;
std
::
vector
<
std
::
string
>
exter_endpoints
;
ss
<<
endpoint
;
for
(
size_t
i
=
inter_nranks
;
i
<
trainers
.
size
();
i
+=
inter_nranks
)
{
ss
<<
","
;
exter_endpoints
.
push_back
(
trainers
[
i
]);
ss
<<
trainers
[
i
];
}
VLOG
(
1
)
<<
"Hierarchical exter ring endpoints:"
<<
ss
.
str
();
for
(
int
i
=
0
;
i
<
nccl_comm_num
;
i
++
)
{
std
::
string
nccl_var_name
=
platform
::
GetHierarchicalExterNCCLVarName
(
i
);
GenerateAndSend
(
&
local_scope
,
dev_ctx
,
nccl_var_name
,
exter_endpoints
);
}
}
}
private:
void
GenerateAndSend
(
framework
::
Scope
*
scope
,
const
platform
::
DeviceContext
&
dev_ctx
)
const
{
auto
var
=
scope
->
FindVar
(
NCCL_ID_VARNAME
);
PADDLE_ENFORCE_NOT_NULL
(
var
);
const
platform
::
DeviceContext
&
dev_ctx
,
const
std
::
string
&
nccl_id_name
,
const
std
::
vector
<
std
::
string
>&
endpoint_list
)
const
{
auto
var
=
scope
->
FindVar
(
nccl_id_name
);
PADDLE_ENFORCE_NOT_NULL
(
var
,
"can't find nccl_id_var_name:%s"
,
nccl_id_name
);
auto
id
=
var
->
GetMutable
<
ncclUniqueId
>
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclGetUniqueId
(
id
));
std
::
vector
<
std
::
string
>
endpoint_list
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"endpoint_list"
);
distributed
::
RPCClient
*
client
=
distributed
::
RPCClient
::
GetInstance
<
RPCCLIENT_T
>
(
0
);
for
(
auto
&
ep
:
endpoint_list
)
{
VLOG
(
3
)
<<
"sending nccl
id
to "
<<
ep
;
client
->
AsyncSendVar
(
ep
,
dev_ctx
,
*
scope
,
NCCL_ID_VARNAME
);
VLOG
(
3
)
<<
"sending nccl
_id_var:"
<<
nccl_id_name
<<
"
to "
<<
ep
;
client
->
AsyncSendVar
(
ep
,
dev_ctx
,
*
scope
,
nccl_id_name
);
}
client
->
Wait
();
for
(
auto
&
ep
:
endpoint_list
)
{
...
...
@@ -75,9 +173,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
VLOG
(
3
)
<<
"sending completed..."
;
}
void
GetIdByServer
(
framework
::
Scope
*
scope
,
const
platform
::
DeviceContext
&
dev_ctx
)
const
{
std
::
string
endpoint
=
Attr
<
std
::
string
>
(
"endpoint"
);
void
GetIdByServer
(
const
std
::
string
&
endpoint
,
framework
::
Scope
*
scope
,
const
platform
::
DeviceContext
&
dev_ctx
,
int
nccl_comm_num
,
bool
use_hierarchical_allreduce
,
int
trainer_id
,
int
inter_trainer_id
,
int
exter_trainer_id
)
const
{
// std::string endpoint = Attr<std::string>("endpoint");
// NOTE: Can not use unique_ptr here because the default
// deleter will call GRPC Server's base class's dtor and
// that will cause a wired crash.
...
...
@@ -98,10 +198,42 @@ class GenNCCLIdOp : public framework::OperatorBase {
std
::
thread
server_thread
(
std
::
bind
(
&
distributed
::
RPCServer
::
StartServer
,
rpc_service
.
get
()));
rpc_service
->
SetCond
(
distributed
::
kRequestSend
);
VLOG
(
3
)
<<
"start getting nccl id from trainer 0..."
;
rpc_service
->
WaitBarrier
(
distributed
::
kRequestSend
);
VLOG
(
3
)
<<
"got nccl id and stop server..."
;
for
(
int
i
=
0
;
i
<
nccl_comm_num
;
i
++
)
{
rpc_service
->
SetCond
(
distributed
::
kRequestSend
);
VLOG
(
3
)
<<
"trainer_id:"
<<
trainer_id
<<
" start getting nccl id from trainer 0, nccl_comm_no:"
<<
i
;
rpc_service
->
WaitBarrier
(
distributed
::
kRequestSend
);
rpc_service
->
ResetBarrierCounter
();
}
if
(
use_hierarchical_allreduce
)
{
if
(
inter_trainer_id
>
0
)
{
rpc_service
->
SetCond
(
distributed
::
kRequestSend
);
VLOG
(
3
)
<<
"trainer_id:"
<<
trainer_id
<<
", inter_trainer_id:"
<<
inter_trainer_id
<<
" start getting nccl id from inter_trainer 0"
;
rpc_service
->
WaitBarrier
(
distributed
::
kRequestSend
);
rpc_service
->
ResetBarrierCounter
();
}
if
(
exter_trainer_id
>
0
)
{
for
(
int
i
=
0
;
i
<
nccl_comm_num
;
i
++
)
{
rpc_service
->
SetCond
(
distributed
::
kRequestSend
);
VLOG
(
3
)
<<
"trainer_id:"
<<
trainer_id
<<
", exter_trainer_id:"
<<
exter_trainer_id
<<
" start getting nccl id from exter_trainer 0, nccl_comm_no:"
<<
i
;
rpc_service
->
WaitBarrier
(
distributed
::
kRequestSend
);
rpc_service
->
ResetBarrierCounter
();
}
}
}
VLOG
(
3
)
<<
"traier_id:"
<<
trainer_id
<<
", inter_trainer_id:"
<<
inter_trainer_id
<<
", exter_trainer_id:"
<<
exter_trainer_id
<<
" got nccl id and stop server..."
;
rpc_service
->
ShutDown
();
VLOG
(
3
)
<<
"rpc server stopped"
;
server_thread
.
join
();
...
...
@@ -118,18 +250,26 @@ GenNCCLId operator
For trainer 0: generate a new UniqueId and send it to all the other trainers.
For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server.
)DOC"
);
AddAttr
<
std
::
string
>
(
"endpoint"
,
"(string), e.g. 127.0.0.1:6175 "
"current listen endpoint"
);
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"
endpoint_list
"
,
"['trainer
1_ip:port', 'trainer2
_ip:port', ...] "
"list of
trainer endpoints start from trainer 1
"
)
"
trainers
"
,
"['trainer
0_ip:port', 'trainer1
_ip:port', ...] "
"list of
all trainer endpoints
"
)
.
SetDefault
({});
AddAttr
<
int
>
(
"trainer_id"
,
"(int default 0) "
"The index of the trainer in distributed training."
)
.
SetDefault
(
0
);
"(int) "
"The index of the trainer in distributed training."
);
AddAttr
<
int
>
(
"nccl_comm_num"
,
"(int default 1) "
"The number of nccl communicator num."
)
.
SetDefault
(
1
);
AddAttr
<
bool
>
(
"use_hierarchical_allreduce"
,
"(bool default false) "
"Wheter to use hierarchical allreduce."
)
.
SetDefault
(
false
);
AddAttr
<
int
>
(
"hierarchical_allreduce_inter_nranks"
,
"(int default 1) "
"Wheter to use hierarchical allreduce."
)
.
SetDefault
(
-
1
);
}
};
...
...
paddle/fluid/platform/nccl_helper.h
浏览文件 @
65bbf950
...
...
@@ -124,8 +124,8 @@ struct NCCLContextMap {
}
else
{
rank
=
trainer_id
;
}
VLOG
(
3
)
<<
"init nccl rank: "
<<
rank
<<
" nranks:
"
<<
nranks
<<
"
gpu id: "
<<
gpu_id
;
VLOG
(
1
)
<<
"init nccl rank:"
<<
rank
<<
", nranks:
"
<<
nranks
<<
"
, gpu_id:"
<<
gpu_id
<<
", dev_id:"
<<
order_
[
i
]
;
PADDLE_ENFORCE
(
cudaSetDevice
(
gpu_id
));
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclCommInitRank
(
comms
.
get
()
+
i
,
nranks
,
*
nccl_id
,
rank
));
...
...
@@ -160,6 +160,134 @@ struct NCCLContextMap {
}
};
inline
std
::
string
GetFlatNCCLVarName
(
size_t
pos
)
{
if
(
pos
==
0
)
{
return
NCCL_ID_VARNAME
;
}
return
string
::
Sprintf
(
"%s_%d"
,
NCCL_ID_VARNAME
,
static_cast
<
int
>
(
pos
));
}
inline
std
::
string
GetHierarchicalExterNCCLVarName
(
size_t
pos
)
{
return
string
::
Sprintf
(
"Hierarchical_exter_%s_%d"
,
NCCL_ID_VARNAME
,
static_cast
<
int
>
(
pos
));
}
inline
std
::
string
GetHierarchicalInterNCCLVarName
()
{
return
string
::
Sprintf
(
"Hierarchical_inter_%s"
,
NCCL_ID_VARNAME
);
}
class
MultiNCCLContextMap
{
public:
MultiNCCLContextMap
()
{}
virtual
~
MultiNCCLContextMap
()
{}
NCCLContextMap
*
DefaultFlatCtx
()
const
{
if
(
flat_ctxs_
.
size
()
==
0
)
{
return
nullptr
;
}
return
flat_ctxs_
[
0
].
get
();
}
std
::
vector
<
std
::
unique_ptr
<
NCCLContextMap
>>
*
GetFlatCtxs
()
{
return
&
flat_ctxs_
;
}
NCCLContextMap
*
GetFlatCtx
(
size_t
run_order
)
const
{
return
flat_ctxs_
[
run_order
%
flat_ctxs_
.
size
()].
get
();
}
NCCLContextMap
*
GetRunEnvNCCLCtx
(
size_t
run_order
,
bool
use_hierarchical_allreduce
)
const
{
if
(
!
use_hierarchical_allreduce
)
{
return
GetFlatCtx
(
run_order
);
}
return
GetHierarchicalInterCtx
(
run_order
);
}
void
InitFlatCtxs
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
ncclUniqueId
*>
&
nccl_ids
,
size_t
trainers_num
,
size_t
trainer_id
)
{
if
(
nccl_ids
.
size
()
==
0
)
{
auto
ptr
=
new
platform
::
NCCLContextMap
(
places
);
VLOG
(
1
)
<<
"init local trainer"
;
flat_ctxs_
.
emplace_back
(
ptr
);
return
;
}
for
(
size_t
i
=
0
;
i
<
nccl_ids
.
size
();
i
++
)
{
auto
ptr
=
new
platform
::
NCCLContextMap
(
places
,
nccl_ids
[
i
],
trainers_num
,
trainer_id
);
VLOG
(
1
)
<<
"init trainer_id:"
<<
trainer_id
<<
", comm no:"
<<
i
;
flat_ctxs_
.
emplace_back
(
ptr
);
}
}
void
InitHierarchicalCtxs
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
ncclUniqueId
*
inter_nccl_id
,
const
std
::
vector
<
ncclUniqueId
*>
&
exter_nccl_id
,
size_t
trainers_num
,
size_t
trainer_id
,
size_t
inter_trainers_num
,
size_t
exter_trainers_num
)
{
PADDLE_ENFORCE
(
trainers_num
==
inter_trainers_num
*
exter_trainers_num
,
"trainers_num:%llu != inter_trainers_num:%llu * "
"exter_trainers_num:%llu"
,
trainers_num
,
inter_trainers_num
,
exter_trainers_num
);
PADDLE_ENFORCE
(
inter_trainers_num
>
1
,
"inter_trainers_num:%llu must > 1"
,
inter_trainers_num
);
int
inter_trainer_id
=
trainer_id
%
inter_trainers_num
;
VLOG
(
1
)
<<
"init inter_trainer_id:"
<<
inter_trainer_id
;
auto
local
=
new
NCCLContextMap
(
places
,
inter_nccl_id
,
inter_trainers_num
,
inter_trainer_id
);
h_inter_ctxs_
.
emplace_back
(
local
);
int
exter_trainer_id
=
-
1
;
if
(
trainer_id
%
inter_trainers_num
==
0
)
{
exter_trainer_id
=
trainer_id
/
inter_trainers_num
;
}
if
(
exter_trainer_id
>=
0
)
{
for
(
size_t
i
=
0
;
i
<
exter_nccl_id
.
size
();
i
++
)
{
auto
ex
=
new
NCCLContextMap
(
places
,
exter_nccl_id
[
i
],
exter_trainers_num
,
exter_trainer_id
);
VLOG
(
1
)
<<
"init exter_trainer_id:"
<<
exter_trainer_id
<<
", comm no:"
<<
i
;
h_exter_ctxs_
.
emplace_back
(
ex
);
}
}
}
bool
NeedExterAllReduce
()
const
{
return
h_exter_ctxs_
.
size
()
>
0
;
}
NCCLContextMap
*
GetHierarchicalInterCtx
(
size_t
run_order
)
const
{
return
h_inter_ctxs_
[
run_order
%
h_inter_ctxs_
.
size
()].
get
();
}
NCCLContextMap
*
GetHierarchicalExterCtx
(
size_t
run_order
)
const
{
return
h_exter_ctxs_
[
run_order
%
h_exter_ctxs_
.
size
()].
get
();
}
std
::
vector
<
std
::
unique_ptr
<
NCCLContextMap
>>
*
GetHierarchicalInterCtxs
()
{
return
&
h_inter_ctxs_
;
}
std
::
vector
<
std
::
unique_ptr
<
NCCLContextMap
>>
*
GetHierarchicalExterCtxs
()
{
return
&
h_exter_ctxs_
;
}
protected:
// Support multi nccl comm on default nccl ring while NCCLContextMap can't.
std
::
vector
<
std
::
unique_ptr
<
NCCLContextMap
>>
flat_ctxs_
;
// h_inter_ctxs_ and h_exter_ctxs_ are for 2d allreduce.
// And h_exter_ctxs_ can support multi comm too.
std
::
vector
<
std
::
unique_ptr
<
NCCLContextMap
>>
h_inter_ctxs_
;
std
::
vector
<
std
::
unique_ptr
<
NCCLContextMap
>>
h_exter_ctxs_
;
};
}
// namespace platform
}
// namespace paddle
#endif
paddle/fluid/pybind/pybind.cc
浏览文件 @
65bbf950
...
...
@@ -1483,6 +1483,34 @@ All parameter, weight, gradient are variables in Paddle.
[](
BuildStrategy
&
self
,
int
trainer_id
)
{
self
.
trainer_id_
=
trainer_id
;
})
.
def_property
(
"nccl_comm_num"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
nccl_comm_num_
;
},
[](
BuildStrategy
&
self
,
int
nccl_comm_num
)
{
self
.
nccl_comm_num_
=
nccl_comm_num
;
})
.
def_property
(
"use_hierarchical_allreduce_"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
use_hierarchical_allreduce_
;
},
[](
BuildStrategy
&
self
,
bool
use
)
{
self
.
use_hierarchical_allreduce_
=
use
;
})
.
def_property
(
"hierarchical_allreduce_inter_nranks_"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
hierarchical_allreduce_inter_nranks_
;
},
[](
BuildStrategy
&
self
,
int
nranks
)
{
self
.
hierarchical_allreduce_inter_nranks_
=
nranks
;
})
.
def_property
(
"hierarchical_allreduce_exter_nranks_"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
hierarchical_allreduce_exter_nranks_
;
},
[](
BuildStrategy
&
self
,
int
nranks
)
{
self
.
hierarchical_allreduce_exter_nranks_
=
nranks
;
})
.
def_property
(
"fuse_elewise_add_act_ops"
,
[](
const
BuildStrategy
&
self
)
{
...
...
python/paddle/fluid/compiler.py
浏览文件 @
65bbf950
...
...
@@ -98,6 +98,7 @@ class CompiledProgram(object):
def
__init__
(
self
,
program_or_graph
):
if
isinstance
(
program_or_graph
,
core
.
Graph
):
self
.
_graph
=
program_or_graph
# don't not create a new program here.
self
.
_program
=
None
elif
isinstance
(
program_or_graph
,
framework
.
Program
):
self
.
_graph
=
core
.
Graph
(
program_or_graph
.
desc
)
...
...
@@ -299,6 +300,7 @@ class CompiledProgram(object):
# TODO(wuyi): trainer endpoings should be passed in through
# build_strategy, not program.xxx.
# TODO(gongwb): let user to set them once.
if
self
.
_program
and
self
.
_build_strategy
.
num_trainers
>
1
and
\
self
.
_program
.
_trainers_endpoints
:
tps
=
self
.
_program
.
_trainers_endpoints
...
...
@@ -307,6 +309,12 @@ class CompiledProgram(object):
tps
),
"num_trainers == len(end_points)"
self
.
_build_strategy
.
trainers_endpoints
=
tps
if
self
.
_program
:
self
.
_build_strategy
.
nccl_comm_num
=
self
.
_program
.
_nccl_comm_num
self
.
_build_strategy
.
use_hierarchical_allreduce_
=
self
.
_program
.
_use_hierarchical_allreduce
self
.
_build_strategy
.
hierarchical_allreduce_inter_nranks_
=
self
.
_program
.
_hierarchical_allreduce_inter_nranks
self
.
_build_strategy
.
hierarchical_allreduce_exter_nranks_
=
self
.
_program
.
_hierarchical_allreduce_exter_nranks
if
self
.
_build_strategy
.
sync_batch_norm
:
self
.
_build_strategy
.
enable_sequential_execution
=
True
...
...
python/paddle/fluid/framework.py
浏览文件 @
65bbf950
...
...
@@ -2762,6 +2762,10 @@ class Program(object):
# use Deep gradient comrepssion or not
self
.
_enable_dgc
=
False
self
.
_nccl_comm_num
=
1
self
.
_use_hierarchical_allreduce
=
False
self
.
_hierarchical_allreduce_inter_nranks
=
0
self
.
_hierarchical_allreduce_exter_nranks
=
0
# @deprecated(the python memory optimize transpiler is deprecated)
# whether the program is optimized by memory_optimize_transpiler
...
...
python/paddle/fluid/tests/unittests/test_dist_base.py
浏览文件 @
65bbf950
...
...
@@ -51,11 +51,14 @@ class TestDistRunnerBase(object):
trainers
,
sync_mode
,
dc_asgd
=
False
,
current_endpoint
=
None
):
current_endpoint
=
None
,
nccl_comm_num
=
1
):
# NOTE: import fluid until runtime, or else forking processes will cause error.
config
=
fluid
.
DistributeTranspilerConfig
()
config
.
enable_dc_asgd
=
dc_asgd
config
.
sync_mode
=
sync_mode
if
nccl_comm_num
>
1
:
config
.
nccl_comm_num
=
nccl_comm_num
# config.runtime_split_send_recv = True
t
=
fluid
.
DistributeTranspiler
(
config
=
config
)
t
.
transpile
(
...
...
@@ -106,6 +109,7 @@ class TestDistRunnerBase(object):
# transpile for nccl2
config
=
fluid
.
DistributeTranspilerConfig
()
config
.
mode
=
"nccl2"
config
.
nccl_comm_num
=
args
.
nccl_comm_num
nccl2_t
=
fluid
.
DistributeTranspiler
(
config
=
config
)
nccl2_t
.
transpile
(
args
.
trainer_id
,
...
...
@@ -113,6 +117,7 @@ class TestDistRunnerBase(object):
startup_program
=
fluid
.
default_startup_program
(),
trainers
=
args
.
endpoints
,
current_endpoint
=
args
.
current_endpoint
)
trainer_prog
=
fluid
.
default_main_program
()
else
:
trainer_prog
=
fluid
.
default_main_program
()
...
...
@@ -268,6 +273,7 @@ def runtime_main(test_class):
choices
=
[
"pserver"
,
"nccl2"
,
"local"
,
"nccl2_reduce_layer"
])
parser
.
add_argument
(
'--trainer_id'
,
type
=
int
,
required
=
False
,
default
=
0
)
parser
.
add_argument
(
'--trainers'
,
type
=
int
,
required
=
False
,
default
=
1
)
parser
.
add_argument
(
'--nccl_comm_num'
,
type
=
int
,
required
=
False
,
default
=
1
)
parser
.
add_argument
(
'--current_endpoint'
,
type
=
str
,
required
=
False
,
default
=
""
)
parser
.
add_argument
(
'--sync_mode'
,
action
=
'store_true'
)
...
...
@@ -345,6 +351,7 @@ class TestDistBase(unittest.TestCase):
self
.
_lr
=
0.001
self
.
_use_dgc
=
False
self
.
_dygraph
=
False
self
.
_nccl_comm_num
=
1
self
.
_setup_config
()
self
.
_after_setup_config
()
...
...
@@ -590,6 +597,11 @@ class TestDistBase(unittest.TestCase):
if
self
.
_use_dgc
:
tr0_cmd
+=
" --use_dgc"
tr1_cmd
+=
" --use_dgc"
if
self
.
_nccl_comm_num
>
1
:
tr0_cmd
+=
" --nccl_comm_num {}"
.
format
(
self
.
_nccl_comm_num
)
tr1_cmd
+=
" --nccl_comm_num {}"
.
format
(
self
.
_nccl_comm_num
)
if
self
.
_mp_mode
:
env0
=
{
"FLAGS_selected_gpus"
:
"0"
}
env1
=
{
"FLAGS_selected_gpus"
:
"1"
}
...
...
python/paddle/fluid/tests/unittests/test_dist_mnist.py
浏览文件 @
65bbf950
...
...
@@ -39,6 +39,20 @@ class TestDistMnistNCCL2(TestDistBase):
self
.
check_with_place
(
"dist_mnist.py"
,
delta
=
1e-5
)
class
TestDistMnistNCCL2MultiNCCLComm
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
self
.
_use_reduce
=
False
self
.
_use_reader_alloc
=
False
self
.
_nccl2_mode
=
True
self
.
_nccl_comm_num
=
3
def
test_dist_train
(
self
):
import
paddle.fluid
as
fluid
if
fluid
.
core
.
is_compiled_with_cuda
():
self
.
check_with_place
(
"dist_mnist.py"
,
delta
=
1e-5
)
class
TestDistMnistNCCL2DGC
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
...
...
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
65bbf950
...
...
@@ -165,6 +165,15 @@ class DistributeTranspilerConfig(object):
runtime_split_send_recv
=
False
sync_mode
=
True
nccl_comm_num
=
1
#The picture here illustrates the principle:
#https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396
use_hierarchical_allreduce
=
False
#Nccl ranks in a node when use hierarchical allreduce, it's setted to gpu cards' number in most cases.
hierarchical_allreduce_inter_nranks
=
0
#Nccl ranks bewteen nodes when use hierarchical allreduce, it's setted to nodes number.
hierarchical_allreduce_exter_nranks
=
0
class
DistributeTranspiler
(
object
):
"""
...
...
@@ -261,14 +270,36 @@ class DistributeTranspiler(object):
nccl_id_var
=
startup_program
.
global_block
().
create_var
(
name
=
"NCCLID"
,
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
for
i
in
range
(
1
,
self
.
config
.
nccl_comm_num
):
startup_program
.
global_block
().
create_var
(
name
=
"NCCLID_{}"
.
format
(
i
),
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
if
self
.
config
.
use_hierarchical_allreduce
:
startup_program
.
global_block
().
create_var
(
name
=
"Hierarchical_inter_NCCLID"
,
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
for
i
in
range
(
0
,
self
.
config
.
nccl_comm_num
):
startup_program
.
global_block
().
create_var
(
name
=
"Hierarchical_exter_NCCLID_{}"
.
format
(
i
),
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
startup_program
.
global_block
().
append_op
(
type
=
"gen_nccl_id"
,
inputs
=
{},
outputs
=
{
"NCCLID"
:
nccl_id_var
},
attrs
=
{
"endpoint"
:
current_endpoint
,
"endpoint_list"
:
worker_endpoints
,
"trainer_id"
:
trainer_id
"trainers"
:
trainers
.
split
(
","
),
"trainer_id"
:
trainer_id
,
"nccl_comm_num"
:
self
.
config
.
nccl_comm_num
,
"use_hierarchical_allreduce"
:
self
.
config
.
use_hierarchical_allreduce
,
"hierarchical_allreduce_inter_nranks"
:
self
.
config
.
hierarchical_allreduce_inter_nranks
})
return
nccl_id_var
else
:
...
...
@@ -350,6 +381,12 @@ class DistributeTranspiler(object):
if
self
.
config
.
mode
==
"nccl2"
:
assert
(
isinstance
(
trainers
,
str
))
self
.
origin_program
.
_trainers_endpoints
=
trainers
.
split
(
","
)
self
.
origin_program
.
_nccl_comm_num
=
self
.
config
.
nccl_comm_num
self
.
origin_program
.
_use_hierarchical_allreduce
=
self
.
config
.
use_hierarchical_allreduce
self
.
origin_program
.
_hierarchical_allreduce_inter_nranks
=
\
int
(
self
.
config
.
hierarchical_allreduce_inter_nranks
)
self
.
origin_program
.
_hierarchical_allreduce_exter_nranks
=
\
int
(
self
.
config
.
hierarchical_allreduce_exter_nranks
)
self
.
_transpile_nccl2
(
trainer_id
,
trainers
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录