Commit f1ef3f22

Authored on Jun 27, 2018 by Wu Yi; committed by typhoonzero on Jun 27, 2018

Merge pull request #11728 from typhoonzero/fix_paraexe_bcast

Fix dist train broadcasting bug

Parent: fac1d477
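In brief: `ParallelExecutor::BCastParamsToGPUs` always used device 0 as the NCCL broadcast root. That is correct only for the initializing broadcast (no SSA graph builder exists yet and every parameter lives on device 0); during distributed training, a parameter that lives on another device must be broadcast from that device. The commit makes `SSAGraphBuilder::GetVarDeviceID` pure virtual so every builder (including the checker and printer wrappers) reports where a variable lives, and picks the broadcast root per variable. On the transpiler side, `__append_optimize_op__` now receives `lr_ops` so learning-rate ops are not duplicated into the per-optimizer pserver blocks. The root-selection rule, as a minimal self-contained sketch (illustrative names, not Paddle code):

#include <cassert>

// Sketch of the root-selection rule this commit introduces (illustrative,
// not Paddle code). Returns the device to broadcast from, or -1 to skip.
int BcastRoot(bool initializing, int var_dev_id) {
  if (initializing) return 0;          // first bcast: all vars live on device 0
  return var_dev_id >= 0 ? var_dev_id  // dist train: bcast from owning device
                         : -1;         // unknown owner: skip this variable
}

int main() {
  assert(BcastRoot(/*initializing=*/true, -1) == 0);
  assert(BcastRoot(/*initializing=*/false, 2) == 2);
  assert(BcastRoot(/*initializing=*/false, -1) == -1);
  return 0;
}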
Showing 6 changed files with 37 additions and 13 deletions (+37 -13).
paddle/fluid/framework/details/multi_devices_graph_builder.cc  +3  -0
paddle/fluid/framework/details/ssa_graph_builder.h             +1  -1
paddle/fluid/framework/details/ssa_graph_checker.h             +6  -0
paddle/fluid/framework/details/ssa_graph_printer.h             +5  -0
paddle/fluid/framework/parallel_executor.cc                    +17 -7
python/paddle/fluid/transpiler/distribute_transpiler.py        +5  -5
paddle/fluid/framework/details/multi_devices_graph_builder.cc

@@ -483,6 +483,9 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
       }
     }
   } else if (op.Type() == "concat") {
     op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
+    for (auto &varname : op.OutputArgumentNames()) {
+      var_name_on_devices_.emplace(varname, op_dev_id);
+    }
   } else {
     PADDLE_ENFORCE(
         "the distribute training related op should be in [split_byref, "
paddle/fluid/framework/details/ssa_graph_builder.h

@@ -30,7 +30,7 @@ class SSAGraphBuilder {
   SSAGraphBuilder() {}
   virtual ~SSAGraphBuilder() {}

   virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
-  virtual int GetVarDeviceID(const std::string &var_name) const { return -1; }
+  virtual int GetVarDeviceID(const std::string &var_name) const = 0;

   DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
paddle/fluid/framework/details/ssa_graph_checker.h

@@ -16,6 +16,8 @@
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"

+#include <string>
+
 namespace paddle {
 namespace framework {
 namespace details {

@@ -33,6 +35,10 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
     return graph;
   }

+  int GetVarDeviceID(const std::string &var_name) const override {
+    return builder_->GetVarDeviceID(var_name);
+  }
+
   bool IsValidGraph(const SSAGraph *graph) const;

  private:
paddle/fluid/framework/details/ssa_graph_printer.h

@@ -15,6 +15,7 @@
 #pragma once
 #include <iosfwd>
+#include <string>
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"

 namespace paddle {

@@ -55,6 +56,10 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
     return graph;
   }

+  int GetVarDeviceID(const std::string &var_name) const override {
+    return builder_->GetVarDeviceID(var_name);
+  }
+
  private:
   std::unique_ptr<SSAGraphPrinter> printer_;
   std::unique_ptr<SSAGraphBuilder> builder_;
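Why the checker and printer headers change: `SSAGraghBuilderWithChecker` and `SSAGraghBuilderWithPrinter` are decorators around another `SSAGraphBuilder`. Once `GetVarDeviceID` is pure virtual they must forward it to the wrapped builder; otherwise `ParallelExecutor` would query the wrapper and get the old `-1` default. A minimal sketch of the forwarding pattern (illustrative names, not Paddle code):

#include <memory>
#include <string>
#include <utility>

// Illustrative stand-in for the SSAGraphBuilder interface.
struct Builder {
  virtual ~Builder() = default;
  virtual int GetVarDeviceID(const std::string &var_name) const = 0;
};

// A decorator that adds behavior (checking, printing, ...) must still
// forward pure-virtual queries to the builder it wraps.
struct CheckingBuilder : Builder {
  explicit CheckingBuilder(std::unique_ptr<Builder> b)
      : builder_(std::move(b)) {}
  int GetVarDeviceID(const std::string &var_name) const override {
    return builder_->GetVarDeviceID(var_name);  // delegate, don't default
  }

 private:
  std::unique_ptr<Builder> builder_;
};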
paddle/fluid/framework/parallel_executor.cc

@@ -133,17 +133,18 @@ ParallelExecutor::ParallelExecutor(
 void ParallelExecutor::BCastParamsToGPUs(
     const std::unordered_set<std::string> &vars) const {
-  // the the initialize bcast, all vars would be bcast from device(0), otherwise
+  // the the initializing bcast, all vars would be bcast from device(0),
+  // otherwise
   // bcast from the specified device.
-  bool initialize = builder_.get() == nullptr ? true : false;
+  bool initializing = builder_.get() == nullptr ? true : false;

   for (auto &var : vars) {
     int var_dev_id =
         builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var);
-    if (!initialize && var_dev_id == -1) continue;
+    if (!initializing && var_dev_id == -1) continue;

     framework::Variable *main_var = nullptr;
-    if (initialize) {
+    if (initializing) {
       main_var = member_->local_scopes_[0]->FindVar(var);
     } else {
       main_var = member_->local_scopes_[var_dev_id]->FindVar(var);

@@ -164,7 +165,8 @@ void ParallelExecutor::BCastParamsToGPUs(
       auto place = member_->places_[i];
       void *buffer;
-      if ((initialize && i == 0) || (!initialize && i == var_dev_id)) {
+      if ((initializing && i == 0) ||
+          (!initializing && static_cast<int>(i) == var_dev_id)) {
         buffer = const_cast<void *>(main_tensor.data<void>());
       } else {
         auto local_scope = member_->local_scopes_[i];

@@ -181,8 +183,16 @@ void ParallelExecutor::BCastParamsToGPUs(
       platform::NCCLGroupGuard guard;
       for (size_t i = 0; i < member_->places_.size(); ++i) {
         auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
-        platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
-                                     nccl_ctx.comm_, nccl_ctx.stream());
+        if (initializing) {
+          platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
+                                       nccl_ctx.comm_, nccl_ctx.stream());
+        } else {
+          if (var_dev_id >= 0) {
+            platform::dynload::ncclBcast(buffers[i], numel, data_type,
+                                         var_dev_id, nccl_ctx.comm_,
+                                         nccl_ctx.stream());
+          }
+        }
       }
       member_->nccl_ctxs_->WaitAll();
     }
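The last hunk is the heart of the fix: during the initializing broadcast every parameter is sent from device 0, while during distributed training each variable is sent from the device that owns it; variables with no known owner were already skipped by the earlier `continue`. Condensed into a standalone sketch (assumes NCCL 2.x and CUDA; comms, streams, and device buffers are set up elsewhere; names are illustrative, not Paddle code):

#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

// Broadcast one variable's per-device buffers, choosing the NCCL root the
// way the patched BCastParamsToGPUs does.
void BcastVar(const std::vector<void *> &buffers, size_t numel,
              ncclDataType_t dtype, const std::vector<ncclComm_t> &comms,
              const std::vector<cudaStream_t> &streams, bool initializing,
              int var_dev_id) {
  ncclGroupStart();  // the role of platform::NCCLGroupGuard in the patch
  for (size_t i = 0; i < comms.size(); ++i) {
    if (initializing) {
      // Initial bcast: all parameters live on device 0.
      ncclBcast(buffers[i], numel, dtype, /*root=*/0, comms[i], streams[i]);
    } else if (var_dev_id >= 0) {
      // Dist train: the owning device is the root for this variable.
      ncclBcast(buffers[i], numel, dtype, var_dev_id, comms[i], streams[i]);
    }
  }
  ncclGroupEnd();
}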
python/paddle/fluid/transpiler/distribute_transpiler.py

@@ -302,7 +302,6 @@ class DistributeTranspiler(object):
         """
         # remove optimize ops and add a send op to main_program
         delete_ops(self.origin_program.global_block(), self.optimize_ops)
-
         # FIXME(typhoonzero): serialize once will fix error occurs when clone.
         self.origin_program.__str__()
         return self.origin_program

@@ -383,11 +382,12 @@ class DistributeTranspiler(object):
             if self._is_adam_connected_op(op):
                 global_ops.append(op)

-        def __append_optimize_op__(op, block, grad_to_block_id, merged_var):
+        def __append_optimize_op__(op, block, grad_to_block_id, merged_var,
+                                   lr_ops):
             if self._is_optimizer_op(op):
                 self._append_pserver_ops(block, op, endpoint, grad_to_block_id,
                                          self.origin_program, merged_var)
-            else:
+            elif op not in lr_ops:
                 self._append_pserver_non_opt_ops(block, op)

         def __op_have_grad_input__(op):

@@ -447,7 +447,7 @@ class DistributeTranspiler(object):
             # optimizer is connected to itself
             if ufind.is_connected(op, opt_op) and op not in global_ops:
                 __append_optimize_op__(op, per_opt_block, grad_to_block_id,
-                                       merged_var)
+                                       merged_var, lr_ops)

         # append global ops
         if global_ops:

@@ -455,7 +455,7 @@ class DistributeTranspiler(object):
                 pserver_program.num_blocks - 1)
             for glb_op in global_ops:
                 __append_optimize_op__(glb_op, opt_state_block,
-                                       grad_to_block_id, None)
+                                       grad_to_block_id, None, lr_ops)

         # process distributed lookup_table
         prefetch_var_name_to_block_id = []