Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
0d561ef4
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
0d561ef4
编写于
5月 29, 2019
作者:
G
gongweibao
提交者:
GitHub
5月 29, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix 2dconn test=develop (#17681)
上级
ccf9e232
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
41 addition
and
26 deletion
+41
-26
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+10
-4
paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
+13
-8
paddle/fluid/platform/nccl_helper.h
paddle/fluid/platform/nccl_helper.h
+14
-10
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+4
-4
未找到文件。
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
0d561ef4
...
...
@@ -157,9 +157,14 @@ class ParallelExecutorPrivate {
bst
.
trainer_id_
);
if
(
bst
.
use_hierarchical_allreduce_
)
{
std
::
string
var_name
=
platform
::
GetHierarchicalInterNCCLVarName
();
auto
nccl_id_var
=
scope
->
FindVar
(
var_name
);
auto
inter_nccl_id
=
nccl_id_var
->
GetMutable
<
ncclUniqueId
>
();
std
::
vector
<
ncclUniqueId
*>
inter_nccl_ids
;
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
bst
.
nccl_comm_num_
);
i
++
)
{
std
::
string
var_name
=
platform
::
GetHierarchicalInterNCCLVarName
(
i
);
auto
nccl_id_var
=
scope
->
FindVar
(
var_name
);
PADDLE_ENFORCE
(
nccl_id_var
,
"can't find %s nccl_id_var"
,
var_name
);
auto
inter_nccl_id
=
nccl_id_var
->
GetMutable
<
ncclUniqueId
>
();
inter_nccl_ids
.
push_back
(
inter_nccl_id
);
}
std
::
vector
<
ncclUniqueId
*>
exter_nccl_ids
;
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
bst
.
nccl_comm_num_
);
i
++
)
{
...
...
@@ -169,7 +174,8 @@ class ParallelExecutorPrivate {
auto
nccl_id
=
nccl_id_var
->
GetMutable
<
ncclUniqueId
>
();
exter_nccl_ids
.
push_back
(
nccl_id
);
}
nccl_ctxs_
.
InitHierarchicalCtxs
(
places_
,
inter_nccl_id
,
exter_nccl_ids
,
nccl_ctxs_
.
InitHierarchicalCtxs
(
places_
,
inter_nccl_ids
,
exter_nccl_ids
,
bst
.
num_trainers_
,
bst
.
trainer_id_
,
bst
.
hierarchical_allreduce_inter_nranks_
,
bst
.
hierarchical_allreduce_exter_nranks_
);
...
...
paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
浏览文件 @
0d561ef4
...
...
@@ -124,8 +124,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
ss
<<
trainers
[
i
];
}
VLOG
(
1
)
<<
"Hierarchical inter ring endpoints:"
<<
ss
.
str
();
std
::
string
nccl_var_name
=
platform
::
GetHierarchicalInterNCCLVarName
();
GenerateAndSend
(
&
local_scope
,
dev_ctx
,
nccl_var_name
,
inter_endpoints
);
for
(
int
i
=
0
;
i
<
nccl_comm_num
;
i
++
)
{
std
::
string
nccl_var_name
=
platform
::
GetHierarchicalInterNCCLVarName
(
i
);
GenerateAndSend
(
&
local_scope
,
dev_ctx
,
nccl_var_name
,
inter_endpoints
);
}
}
// hierarchical exter ncclid
...
...
@@ -208,12 +211,14 @@ class GenNCCLIdOp : public framework::OperatorBase {
if
(
use_hierarchical_allreduce
)
{
if
(
inter_trainer_id
>
0
)
{
rpc_service
->
SetCond
(
distributed
::
kRequestSend
);
VLOG
(
3
)
<<
"trainer_id:"
<<
trainer_id
<<
", inter_trainer_id:"
<<
inter_trainer_id
<<
" start getting nccl id from inter_trainer 0"
;
rpc_service
->
WaitBarrier
(
distributed
::
kRequestSend
);
rpc_service
->
ResetBarrierCounter
();
for
(
int
i
=
0
;
i
<
nccl_comm_num
;
i
++
)
{
rpc_service
->
SetCond
(
distributed
::
kRequestSend
);
VLOG
(
3
)
<<
"trainer_id:"
<<
trainer_id
<<
", inter_trainer_id:"
<<
inter_trainer_id
<<
" start getting nccl id from inter_trainer:"
<<
i
;
rpc_service
->
WaitBarrier
(
distributed
::
kRequestSend
);
rpc_service
->
ResetBarrierCounter
();
}
}
if
(
exter_trainer_id
>
0
)
{
...
...
paddle/fluid/platform/nccl_helper.h
浏览文件 @
0d561ef4
...
...
@@ -171,8 +171,9 @@ inline std::string GetHierarchicalExterNCCLVarName(size_t pos) {
return
string
::
Sprintf
(
"Hierarchical_exter_%s_%d"
,
NCCL_ID_VARNAME
,
static_cast
<
int
>
(
pos
));
}
inline
std
::
string
GetHierarchicalInterNCCLVarName
()
{
return
string
::
Sprintf
(
"Hierarchical_inter_%s"
,
NCCL_ID_VARNAME
);
inline
std
::
string
GetHierarchicalInterNCCLVarName
(
size_t
pos
)
{
return
string
::
Sprintf
(
"Hierarchical_inter_%s_%d"
,
NCCL_ID_VARNAME
,
static_cast
<
int
>
(
pos
));
}
class
MultiNCCLContextMap
{
...
...
@@ -224,8 +225,8 @@ class MultiNCCLContextMap {
}
void
InitHierarchicalCtxs
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
ncclUniqueId
*
inter_nccl_id
,
const
std
::
vector
<
ncclUniqueId
*>
&
exter_nccl_id
,
const
std
::
vector
<
ncclUniqueId
*>
&
inter_nccl_ids
,
const
std
::
vector
<
ncclUniqueId
*>
&
exter_nccl_id
s
,
size_t
trainers_num
,
size_t
trainer_id
,
size_t
inter_trainers_num
,
size_t
exter_trainers_num
)
{
...
...
@@ -238,11 +239,14 @@ class MultiNCCLContextMap {
inter_trainers_num
);
int
inter_trainer_id
=
trainer_id
%
inter_trainers_num
;
VLOG
(
1
)
<<
"init inter_trainer_id:"
<<
inter_trainer_id
;
auto
local
=
new
NCCLContextMap
(
places
,
inter_nccl_id
,
inter_trainers_num
,
inter_trainer_id
);
for
(
size_t
i
=
0
;
i
<
inter_nccl_ids
.
size
();
i
++
)
{
VLOG
(
1
)
<<
"init inter_trainer_id:"
<<
inter_trainer_id
<<
", comm no:"
<<
i
;
auto
local
=
new
NCCLContextMap
(
places
,
inter_nccl_ids
[
i
],
inter_trainers_num
,
inter_trainer_id
);
h_inter_ctxs_
.
emplace_back
(
local
);
h_inter_ctxs_
.
emplace_back
(
local
);
}
int
exter_trainer_id
=
-
1
;
if
(
trainer_id
%
inter_trainers_num
==
0
)
{
...
...
@@ -250,8 +254,8 @@ class MultiNCCLContextMap {
}
if
(
exter_trainer_id
>=
0
)
{
for
(
size_t
i
=
0
;
i
<
exter_nccl_id
.
size
();
i
++
)
{
auto
ex
=
new
NCCLContextMap
(
places
,
exter_nccl_id
[
i
],
for
(
size_t
i
=
0
;
i
<
exter_nccl_id
s
.
size
();
i
++
)
{
auto
ex
=
new
NCCLContextMap
(
places
,
exter_nccl_id
s
[
i
],
exter_trainers_num
,
exter_trainer_id
);
VLOG
(
1
)
<<
"init exter_trainer_id:"
<<
exter_trainer_id
<<
", comm no:"
<<
i
;
...
...
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
0d561ef4
...
...
@@ -278,11 +278,11 @@ class DistributeTranspiler(object):
type
=
core
.
VarDesc
.
VarType
.
RAW
)
if
self
.
config
.
use_hierarchical_allreduce
:
startup_program
.
global_block
().
create_var
(
name
=
"Hierarchical_inter_NCCLID"
,
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
for
i
in
range
(
0
,
self
.
config
.
nccl_comm_num
):
startup_program
.
global_block
().
create_var
(
name
=
"Hierarchical_inter_NCCLID_{}"
.
format
(
i
),
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
startup_program
.
global_block
().
create_var
(
name
=
"Hierarchical_exter_NCCLID_{}"
.
format
(
i
),
persistable
=
True
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录