Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
ec2aed2a
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
ec2aed2a
编写于
9月 01, 2018
作者:
X
Xin Pan
提交者:
GitHub
9月 01, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #13102 from typhoonzero/merge_dist_deps_fixes
Cherrypick dist fixes
上级
2aad01fc
43926959
变更
17
展开全部
显示空白变更内容
内联
并排
Showing
17 changed file
with
2171 addition
and
391 deletion
+2171
-391
paddle/fluid/API.spec
paddle/fluid/API.spec
+6
-4
paddle/fluid/framework/details/multi_devices_graph_pass.cc
paddle/fluid/framework/details/multi_devices_graph_pass.cc
+42
-25
paddle/fluid/framework/ir/graph.cc
paddle/fluid/framework/ir/graph.cc
+0
-57
paddle/fluid/operators/fetch_barrier_op.cc
paddle/fluid/operators/fetch_barrier_op.cc
+2
-0
paddle/fluid/operators/send_barrier_op.cc
paddle/fluid/operators/send_barrier_op.cc
+4
-0
python/paddle/fluid/layers/io.py
python/paddle/fluid/layers/io.py
+9
-2
python/paddle/fluid/tests/unittests/dist_se_resnext.py
python/paddle/fluid/tests/unittests/dist_se_resnext.py
+19
-8
python/paddle/fluid/tests/unittests/dist_transformer.py
python/paddle/fluid/tests/unittests/dist_transformer.py
+1649
-207
python/paddle/fluid/tests/unittests/dist_word2vec.py
python/paddle/fluid/tests/unittests/dist_word2vec.py
+10
-6
python/paddle/fluid/tests/unittests/test_dist_base.py
python/paddle/fluid/tests/unittests/test_dist_base.py
+84
-40
python/paddle/fluid/tests/unittests/test_dist_mnist.py
python/paddle/fluid/tests/unittests/test_dist_mnist.py
+42
-1
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+11
-0
python/paddle/fluid/tests/unittests/test_dist_train.py
python/paddle/fluid/tests/unittests/test_dist_train.py
+1
-1
python/paddle/fluid/tests/unittests/test_dist_transformer.py
python/paddle/fluid/tests/unittests/test_dist_transformer.py
+45
-4
python/paddle/fluid/tests/unittests/test_dist_word2vec.py
python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+12
-1
python/paddle/fluid/transpiler/details/program_utils.py
python/paddle/fluid/transpiler/details/program_utils.py
+133
-0
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+102
-35
未找到文件。
paddle/fluid/API.spec
浏览文件 @
ec2aed2a
...
...
@@ -55,9 +55,10 @@ paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path
paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'
], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, Tru
e))
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'
, 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, Non
e))
paddle.fluid.InferenceTranspiler.__init__
paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
...
...
@@ -329,9 +330,10 @@ paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'
], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, Tru
e))
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'
, 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, Non
e))
paddle.fluid.transpiler.InferenceTranspiler.__init__
paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
...
...
paddle/fluid/framework/details/multi_devices_graph_pass.cc
浏览文件 @
ec2aed2a
...
...
@@ -736,7 +736,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
.
emplace
(
varname
,
op_dev_id
);
}
}
else
{
PADDLE_
ENFORCE
(
PADDLE_
THROW
(
"the distribute training related op should be in [split_byref, "
"concat]."
);
}
...
...
@@ -746,17 +746,26 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
node
->
Op
()
->
Type
());
CreateComputationalOp
(
result
,
node
,
op_dev_id
);
if
(
node
->
Op
()
->
Type
()
==
"concat"
)
{
ConnectOp
(
result
,
result
->
Get
<
GraphOps
>
(
kGraphOps
).
back
().
get
(),
"fetch_barrier"
);
}
void
SetOpInputsAllPlaces
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
int
num_places
)
{
auto
*
op_handle
=
result
->
Get
<
GraphOps
>
(
kGraphOps
).
back
().
get
();
for
(
ir
::
Node
*
input
:
node
->
inputs
)
{
VarHandle
*
var
=
nullptr
;
for
(
int
place_offset
=
0
;
place_offset
<
num_places
;
++
place_offset
)
{
auto
&
var_holders
=
result
->
Get
<
GraphVars
>
(
kGraphVars
)[
place_offset
];
auto
&
var_holder
=
var_holders
[
input
->
Name
()];
if
(
!
var_holder
.
empty
())
{
var
=
var_holder
.
rbegin
()
->
get
();
op_handle
->
AddInput
(
var
);
}
}
}
}
// Create RPC related op handles that connects its in ops and out ops.
void
MultiDevSSAGraphBuilder
::
CreateRPCOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
)
const
{
// FIXME(typhoonzero): Cleanup this deps for both sync mode and async mode
// put them into transpiler.
int
op_dev_id
=
-
1
;
if
(
node
->
Op
()
->
Type
()
==
"send"
)
{
// TODO(paddle-dev): getting the first var is not safe.
...
...
@@ -791,8 +800,6 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
}
auto
recv_param_grad
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
// FIXME(typhoonzero): assume each recv op output one param
// Use the same place as send.
if
(
recv_param_grad
.
size
()
==
2U
)
{
op_dev_id
=
GetVarDeviceID
(
*
result
,
recv_param_grad
[
1
]);
VLOG
(
10
)
<<
"recv param "
<<
recv_param_grad
[
0
]
...
...
@@ -806,34 +813,44 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
.
emplace
(
varname
,
op_dev_id
);
}
}
else
{
// send_barrier
and fetch_barrier op can be scheduled on device 0
// send_barrier
, fetch_barrier will run on place 0;
op_dev_id
=
0
;
}
PADDLE_ENFORCE
(
op_dev_id
!=
-
1
,
"can not find the right place for rpc op: %s"
,
node
->
Op
()
->
Type
());
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
new
RPCOpHandle
(
result
->
CreateOpNode
(
node
->
Op
()),
*
node
->
Op
(),
local_scopes_
[
op_dev_id
],
node
->
Op
()
->
Type
(),
places_
[
op_dev_id
]));
// TODO(panyx0718): This might not be needed anymore.
if
(
node
->
Op
()
->
Type
()
==
"send_barrier"
)
{
ConnectOp
(
result
,
result
->
Get
<
GraphOps
>
(
kGraphOps
).
back
().
get
(),
"send"
);
}
else
if
(
node
->
Op
()
->
Type
()
==
"recv"
)
{
ConnectOp
(
result
,
result
->
Get
<
GraphOps
>
(
kGraphOps
).
back
().
get
(),
"send_barrier"
);
}
else
if
(
node
->
Op
()
->
Type
()
==
"fetch_barrier"
)
{
ConnectOp
(
result
,
result
->
Get
<
GraphOps
>
(
kGraphOps
).
back
().
get
(),
"recv"
);
}
else
if
(
node
->
Op
()
->
Type
()
==
"send"
)
{
// do nothing
if
(
node
->
Op
()
->
Type
()
==
"send"
)
{
CreateOpHandleIOs
(
result
,
node
,
op_dev_id
);
}
else
{
PADDLE_THROW
(
"rpc op should be in ["
"send, send_barrier. recv, fetch_barrier]"
);
}
// send_barrier, recv, fetch_barrier's inputs are deps var, get them from
// all places
auto
p
=
places_
[
op_dev_id
];
auto
*
op_handle
=
result
->
Get
<
GraphOps
>
(
kGraphOps
).
back
().
get
();
op_handle
->
SetDeviceContext
(
p
,
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
));
CreateOpHandleIOs
(
result
,
node
,
op_dev_id
);
SetOpInputsAllPlaces
(
result
,
node
,
places_
.
size
());
for
(
ir
::
Node
*
output
:
node
->
outputs
)
{
int
outvar_dev_id
=
op_dev_id
;
if
(
node
->
Op
()
->
Type
()
==
"fetch_barrier"
)
{
outvar_dev_id
=
GetVarDeviceID
(
*
result
,
output
->
Name
());
PADDLE_ENFORCE_NE
(
outvar_dev_id
,
-
1
);
}
p
=
places_
[
outvar_dev_id
];
ir
::
Node
*
new_node
=
nullptr
;
if
(
output
->
Var
())
{
new_node
=
result
->
CreateVarNode
(
output
->
Var
());
}
else
{
new_node
=
result
->
CreateEmptyNode
(
output
->
Name
(),
ir
::
Node
::
Type
::
kVariable
);
}
CreateOpOutput
(
result
,
op_handle
,
new_node
,
p
,
outvar_dev_id
);
}
}
}
bool
MultiDevSSAGraphBuilder
::
IsScaleLossOp
(
ir
::
Node
*
node
)
const
{
...
...
paddle/fluid/framework/ir/graph.cc
浏览文件 @
ec2aed2a
...
...
@@ -132,63 +132,6 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
}
}
std
::
vector
<
ir
::
Node
*>
send_ops
;
ir
::
Node
*
send_bar
=
nullptr
;
std
::
vector
<
ir
::
Node
*>
recv_ops
;
ir
::
Node
*
fetch_bar
=
nullptr
;
for
(
ir
::
Node
*
node
:
Nodes
())
{
if
(
node
->
Name
()
==
"send"
)
{
send_ops
.
push_back
(
node
);
}
else
if
(
node
->
Name
()
==
"send_barrier"
)
{
PADDLE_ENFORCE
(
!
send_bar
,
"only has one send barrier"
);
send_bar
=
node
;
}
else
if
(
node
->
Name
()
==
"recv"
)
{
recv_ops
.
push_back
(
node
);
}
else
if
(
node
->
Name
()
==
"fetch_barrier"
)
{
PADDLE_ENFORCE
(
!
fetch_bar
,
"only has one fetch barrier"
);
fetch_bar
=
node
;
}
}
if
(
send_bar
)
{
for
(
ir
::
Node
*
send
:
send_ops
)
{
ir
::
Node
*
dep_var
=
CreateControlDepVar
();
send
->
outputs
.
push_back
(
dep_var
);
dep_var
->
inputs
.
push_back
(
send
);
send_bar
->
inputs
.
push_back
(
dep_var
);
dep_var
->
outputs
.
push_back
(
send_bar
);
}
for
(
ir
::
Node
*
recv
:
recv_ops
)
{
ir
::
Node
*
dep_var
=
CreateControlDepVar
();
recv
->
inputs
.
push_back
(
dep_var
);
dep_var
->
outputs
.
push_back
(
recv
);
send_bar
->
outputs
.
push_back
(
dep_var
);
dep_var
->
inputs
.
push_back
(
send_bar
);
}
}
if
(
fetch_bar
)
{
for
(
ir
::
Node
*
recv
:
recv_ops
)
{
ir
::
Node
*
dep_var
=
CreateControlDepVar
();
recv
->
outputs
.
push_back
(
dep_var
);
dep_var
->
inputs
.
push_back
(
recv
);
fetch_bar
->
inputs
.
push_back
(
dep_var
);
dep_var
->
outputs
.
push_back
(
fetch_bar
);
}
}
std
::
vector
<
std
::
string
>
send_vars
=
FindDistTrainSendVars
(
send_ops
);
std
::
vector
<
std
::
string
>
recv_vars
=
FindDistTrainRecvVars
(
recv_ops
);
for
(
ir
::
Node
*
node
:
Nodes
())
{
if
(
IsDistTrainOp
(
node
,
send_vars
,
recv_vars
))
{
if
(
fetch_bar
&&
node
->
Name
()
==
"concat"
)
{
ir
::
Node
*
dep_var
=
CreateControlDepVar
();
fetch_bar
->
outputs
.
push_back
(
dep_var
);
dep_var
->
inputs
.
push_back
(
fetch_bar
);
node
->
inputs
.
push_back
(
dep_var
);
dep_var
->
outputs
.
push_back
(
node
);
}
}
}
/**
* We should handle write after read(WAR) and write after write(WAW) here.
* Because some of the operators of the program can be executed parallelly.
...
...
paddle/fluid/operators/fetch_barrier_op.cc
浏览文件 @
ec2aed2a
...
...
@@ -52,6 +52,8 @@ class FetchBarrierOp : public framework::OperatorBase {
class
FetchBarrierOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
{
AddOutput
(
"Out"
,
"(Any) Dummy outputs, used for control dependency"
)
.
AsDuplicable
();
AddComment
(
R"DOC(
SendBarrier operator
...
...
paddle/fluid/operators/send_barrier_op.cc
浏览文件 @
ec2aed2a
...
...
@@ -56,6 +56,10 @@ class SendBarrierOp : public framework::OperatorBase {
class
SendBarrierOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
{
AddInput
(
"X"
,
"(Any) Dummy inputs, used for control dependency"
)
.
AsDuplicable
();
AddOutput
(
"Out"
,
"(Any) Dummy outputs, used for control dependency"
)
.
AsDuplicable
();
AddComment
(
R"DOC(
SendBarrier operator
...
...
python/paddle/fluid/layers/io.py
浏览文件 @
ec2aed2a
...
...
@@ -246,7 +246,11 @@ def Send(endpoints, send_vars, dummy_output=None, sync=True):
rpc_op_role_name
:
core
.
op_proto_and_checker_maker
.
OpRole
.
RPC
})
if
sync
:
helper
.
append_op
(
type
=
"send_barrier"
,
attrs
=
{
"endpoints"
:
endpoints
})
helper
.
append_op
(
type
=
"send_barrier"
,
inputs
=
{
"X"
:
dummy_output
},
outputs
=
{
"Out"
:
[]},
attrs
=
{
"endpoints"
:
endpoints
})
def
Recv
(
endpoints
,
get_vars
,
dummy_input
=
None
,
sync
=
True
):
...
...
@@ -282,7 +286,10 @@ def Recv(endpoints, get_vars, dummy_input=None, sync=True):
attrs
=
{
"endpoints"
:
endpoints
,
"epmap"
:
epmap
})
if
sync
:
helper
.
append_op
(
type
=
"fetch_barrier"
,
attrs
=
{
"endpoints"
:
endpoints
})
helper
.
append_op
(
type
=
"fetch_barrier"
,
outputs
=
{
"Out"
:
get_vars
},
attrs
=
{
"endpoints"
:
endpoints
})
return
get_vars
...
...
python/paddle/fluid/tests/unittests/dist_se_resnext.py
浏览文件 @
ec2aed2a
...
...
@@ -130,7 +130,12 @@ class SE_ResNeXt():
input
=
conv
,
pool_size
=
7
,
pool_type
=
'avg'
,
global_pooling
=
True
)
drop
=
fluid
.
layers
.
dropout
(
x
=
pool
,
dropout_prob
=
0.2
)
stdv
=
1.0
/
math
.
sqrt
(
drop
.
shape
[
1
]
*
1.0
)
out
=
fluid
.
layers
.
fc
(
input
=
drop
,
size
=
class_dim
,
act
=
'softmax'
)
out
=
fluid
.
layers
.
fc
(
input
=
drop
,
size
=
class_dim
,
act
=
'softmax'
,
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.05
)))
return
out
def
shortcut
(
self
,
input
,
ch_out
,
stride
):
...
...
@@ -180,7 +185,7 @@ class SE_ResNeXt():
act
=
None
,
# avoid pserver CPU init differs from GPU
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
()),
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.05
)),
bias_attr
=
False
)
return
fluid
.
layers
.
batch_norm
(
input
=
conv
,
act
=
act
)
...
...
@@ -188,12 +193,18 @@ class SE_ResNeXt():
pool
=
fluid
.
layers
.
pool2d
(
input
=
input
,
pool_size
=
0
,
pool_type
=
'avg'
,
global_pooling
=
True
)
stdv
=
1.0
/
math
.
sqrt
(
pool
.
shape
[
1
]
*
1.0
)
squeeze
=
fluid
.
layers
.
fc
(
input
=
pool
,
squeeze
=
fluid
.
layers
.
fc
(
input
=
pool
,
size
=
num_channels
//
reduction_ratio
,
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.05
)),
act
=
'relu'
)
stdv
=
1.0
/
math
.
sqrt
(
squeeze
.
shape
[
1
]
*
1.0
)
excitation
=
fluid
.
layers
.
fc
(
input
=
squeeze
,
excitation
=
fluid
.
layers
.
fc
(
input
=
squeeze
,
size
=
num_channels
,
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.05
)),
act
=
'sigmoid'
)
scale
=
fluid
.
layers
.
elementwise_mul
(
x
=
input
,
y
=
excitation
,
axis
=
0
)
return
scale
...
...
python/paddle/fluid/tests/unittests/dist_transformer.py
浏览文件 @
ec2aed2a
此差异已折叠。
点击以展开。
python/paddle/fluid/tests/unittests/dist_word2vec.py
浏览文件 @
ec2aed2a
...
...
@@ -49,28 +49,32 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
dtype
=
'float32'
,
is_sparse
=
IS_SPARSE
,
param_attr
=
fluid
.
ParamAttr
(
name
=
'shared_w'
,
initializer
=
fluid
.
initializer
.
Constant
()))
name
=
'shared_w'
,
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
embed_second
=
fluid
.
layers
.
embedding
(
input
=
words
[
1
],
size
=
[
dict_size
,
EMBED_SIZE
],
dtype
=
'float32'
,
is_sparse
=
IS_SPARSE
,
param_attr
=
fluid
.
ParamAttr
(
name
=
'shared_w'
,
initializer
=
fluid
.
initializer
.
Constant
()))
name
=
'shared_w'
,
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
embed_third
=
fluid
.
layers
.
embedding
(
input
=
words
[
2
],
size
=
[
dict_size
,
EMBED_SIZE
],
dtype
=
'float32'
,
is_sparse
=
IS_SPARSE
,
param_attr
=
fluid
.
ParamAttr
(
name
=
'shared_w'
,
initializer
=
fluid
.
initializer
.
Constant
()))
name
=
'shared_w'
,
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
embed_forth
=
fluid
.
layers
.
embedding
(
input
=
words
[
3
],
size
=
[
dict_size
,
EMBED_SIZE
],
dtype
=
'float32'
,
is_sparse
=
IS_SPARSE
,
param_attr
=
fluid
.
ParamAttr
(
name
=
'shared_w'
,
initializer
=
fluid
.
initializer
.
Constant
()))
name
=
'shared_w'
,
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
concat_embed
=
fluid
.
layers
.
concat
(
input
=
[
embed_first
,
embed_second
,
embed_third
,
embed_forth
],
...
...
@@ -80,13 +84,13 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
size
=
HIDDEN_SIZE
,
act
=
'sigmoid'
,
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
()))
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
predict_word
=
fluid
.
layers
.
fc
(
input
=
hidden1
,
size
=
dict_size
,
act
=
'softmax'
,
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
()))
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
predict_word
,
label
=
words
[
4
])
avg_cost
=
fluid
.
layers
.
mean
(
cost
)
...
...
python/paddle/fluid/tests/unittests/test_dist_base.py
浏览文件 @
ec2aed2a
...
...
@@ -21,7 +21,7 @@ import sys
import
six
import
signal
import
subprocess
import
six
import
argparse
class
TestDistRunnerBase
(
object
):
...
...
@@ -30,7 +30,7 @@ class TestDistRunnerBase(object):
"get_model should be implemented by child classes."
)
def
get_transpiler
(
self
,
trainer_id
,
main_program
,
pserver_endpoints
,
trainers
):
trainers
,
sync_mode
):
# NOTE: import fluid until runtime, or else forking processes will cause error.
import
paddle
import
paddle.fluid
as
fluid
...
...
@@ -39,33 +39,35 @@ class TestDistRunnerBase(object):
trainer_id
=
trainer_id
,
program
=
main_program
,
pservers
=
pserver_endpoints
,
trainers
=
trainers
)
trainers
=
trainers
,
sync_mode
=
sync_mode
)
return
t
def
run_pserver
(
self
,
pserver_endpoints
,
trainers
,
current_endpoint
,
trainer_id
):
def
run_pserver
(
self
,
args
):
import
paddle
import
paddle.fluid
as
fluid
self
.
get_model
(
batch_size
=
2
)
t
=
self
.
get_transpiler
(
trainer_id
,
fluid
.
default_main_program
(),
pserver_endpoints
,
trainers
)
pserver_prog
=
t
.
get_pserver_program
(
current_endpoint
)
startup_prog
=
t
.
get_startup_program
(
current_endpoint
,
pserver_prog
)
t
=
self
.
get_transpiler
(
args
.
trainer_id
,
fluid
.
default_main_program
(),
args
.
endpoints
,
args
.
trainers
,
args
.
sync_mode
)
pserver_prog
=
t
.
get_pserver_program
(
args
.
current_endpoint
)
startup_prog
=
t
.
get_startup_program
(
args
.
current_endpoint
,
pserver_prog
)
place
=
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
startup_prog
)
exe
.
run
(
pserver_prog
)
def
run_trainer
(
self
,
place
,
endpoints
,
trainer_id
,
trainers
,
is_dist
=
True
):
def
run_trainer
(
self
,
place
,
args
):
import
paddle
import
paddle.fluid
as
fluid
test_program
,
avg_cost
,
train_reader
,
test_reader
,
batch_acc
,
predict
=
\
self
.
get_model
(
batch_size
=
2
)
if
is_dist
:
t
=
self
.
get_transpiler
(
trainer_id
,
fluid
.
default_main_program
(),
endpoints
,
trainers
)
if
args
.
is_dist
:
t
=
self
.
get_transpiler
(
args
.
trainer_id
,
fluid
.
default_main_program
(),
args
.
endpoints
,
args
.
trainers
,
args
.
sync_mode
)
trainer_prog
=
t
.
get_trainer_program
()
else
:
trainer_prog
=
fluid
.
default_main_program
()
...
...
@@ -76,8 +78,18 @@ class TestDistRunnerBase(object):
strategy
=
fluid
.
ExecutionStrategy
()
strategy
.
num_threads
=
1
strategy
.
allow_op_delay
=
False
build_stra
=
fluid
.
BuildStrategy
()
if
args
.
use_reduce
:
build_stra
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
else
:
build_stra
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
exe
=
fluid
.
ParallelExecutor
(
True
,
loss_name
=
avg_cost
.
name
,
exec_strategy
=
strategy
)
True
,
loss_name
=
avg_cost
.
name
,
exec_strategy
=
strategy
,
build_strategy
=
build_stra
)
feed_var_list
=
[
var
for
var
in
trainer_prog
.
global_block
().
vars
.
values
()
...
...
@@ -106,45 +118,64 @@ def runtime_main(test_class):
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
if
len
(
sys
.
argv
)
!=
7
:
print
(
"Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
)
role
=
sys
.
argv
[
1
]
endpoints
=
sys
.
argv
[
2
]
trainer_id
=
int
(
sys
.
argv
[
3
])
current_endpoint
=
sys
.
argv
[
4
]
trainers
=
int
(
sys
.
argv
[
5
])
is_dist
=
True
if
sys
.
argv
[
6
]
==
"TRUE"
else
False
parser
=
argparse
.
ArgumentParser
(
description
=
'Run dist test.'
)
parser
.
add_argument
(
'--role'
,
type
=
str
,
required
=
True
,
choices
=
[
'pserver'
,
'trainer'
])
parser
.
add_argument
(
'--endpoints'
,
type
=
str
,
required
=
False
,
default
=
""
)
parser
.
add_argument
(
'--is_dist'
,
action
=
'store_true'
)
parser
.
add_argument
(
'--trainer_id'
,
type
=
int
,
required
=
False
,
default
=
0
)
parser
.
add_argument
(
'--trainers'
,
type
=
int
,
required
=
False
,
default
=
1
)
parser
.
add_argument
(
'--current_endpoint'
,
type
=
str
,
required
=
False
,
default
=
""
)
parser
.
add_argument
(
'--sync_mode'
,
action
=
'store_true'
)
parser
.
add_argument
(
'--mem_opt'
,
action
=
'store_true'
)
parser
.
add_argument
(
'--use_reduce'
,
action
=
'store_true'
)
args
=
parser
.
parse_args
()
model
=
test_class
()
if
role
==
"pserver"
:
model
.
run_pserver
(
endpoints
,
trainers
,
current_endpoint
,
trainer_id
)
if
args
.
role
==
"pserver"
:
model
.
run_pserver
(
args
)
else
:
p
=
fluid
.
CUDAPlace
(
0
)
if
core
.
is_compiled_with_cuda
(
)
else
fluid
.
CPUPlace
()
model
.
run_trainer
(
p
,
endpoints
,
trainer_id
,
trainers
,
is_dist
)
model
.
run_trainer
(
p
,
args
)
import
paddle.compat
as
cpt
class
TestDistBase
(
unittest
.
TestCase
):
def
_setup_config
(
self
):
raise
NotImplementedError
(
"tests should have _setup_config implemented"
)
def
setUp
(
self
):
self
.
_trainers
=
2
self
.
_pservers
=
2
self
.
_ps_endpoints
=
"127.0.0.1:9123,127.0.0.1:9124"
self
.
_python_interp
=
"python"
self
.
_sync_mode
=
True
self
.
_mem_opt
=
False
self
.
_use_reduce
=
False
self
.
_setup_config
()
def
start_pserver
(
self
,
model_file
,
check_error_log
):
ps0_ep
,
ps1_ep
=
self
.
_ps_endpoints
.
split
(
","
)
ps0_cmd
=
"%s %s pserver %s 0 %s %d TRUE"
%
\
ps_cmd
=
"%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
ps0_cmd
=
ps_cmd
%
\
(
self
.
_python_interp
,
model_file
,
self
.
_ps_endpoints
,
ps0_ep
,
self
.
_trainers
)
ps1_cmd
=
"%s %s pserver %s 0 %s %d TRUE"
%
\
ps1_cmd
=
ps_cmd
%
\
(
self
.
_python_interp
,
model_file
,
self
.
_ps_endpoints
,
ps1_ep
,
self
.
_trainers
)
if
self
.
_sync_mode
:
ps0_cmd
+=
" --sync_mode"
ps1_cmd
+=
" --sync_mode"
if
self
.
_mem_opt
:
ps0_cmd
+=
" --mem_opt"
ps1_cmd
+=
" --mem_opt"
ps0_pipe
=
subprocess
.
PIPE
ps1_pipe
=
subprocess
.
PIPE
if
check_error_log
:
...
...
@@ -195,9 +226,7 @@ class TestDistBase(unittest.TestCase):
# Run local to get a base line
env_local
=
{
"CUDA_VISIBLE_DEVICES"
:
"0"
}
env_local
.
update
(
required_envs
)
local_cmd
=
"%s %s trainer %s 0 %s %d FLASE"
%
\
(
self
.
_python_interp
,
model_file
,
"127.0.0.1:1234"
,
"127.0.0.1:1234"
,
1
)
local_cmd
=
"%s %s --role trainer"
%
(
self
.
_python_interp
,
model_file
)
if
not
check_error_log
:
local_proc
=
subprocess
.
Popen
(
local_cmd
.
split
(
" "
),
...
...
@@ -226,12 +255,23 @@ class TestDistBase(unittest.TestCase):
self
.
_wait_ps_ready
(
ps1
.
pid
)
ps0_ep
,
ps1_ep
=
self
.
_ps_endpoints
.
split
(
","
)
tr0_cmd
=
"%s %s trainer %s 0 %s %d TRUE"
%
\
(
self
.
_python_interp
,
model_file
,
self
.
_ps_endpoints
,
ps0_ep
,
self
.
_trainers
)
tr1_cmd
=
"%s %s trainer %s 1 %s %d TRUE"
%
\
(
self
.
_python_interp
,
model_file
,
self
.
_ps_endpoints
,
ps1_ep
,
self
.
_trainers
)
tr_cmd
=
"%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
tr0_cmd
=
tr_cmd
%
\
(
self
.
_python_interp
,
model_file
,
self
.
_ps_endpoints
,
0
,
ps0_ep
,
self
.
_trainers
)
tr1_cmd
=
tr_cmd
%
\
(
self
.
_python_interp
,
model_file
,
self
.
_ps_endpoints
,
1
,
ps1_ep
,
self
.
_trainers
)
if
self
.
_sync_mode
:
tr0_cmd
+=
" --sync_mode"
tr1_cmd
+=
" --sync_mode"
if
self
.
_mem_opt
:
tr0_cmd
+=
" --mem_opt"
tr1_cmd
+=
" --mem_opt"
if
self
.
_use_reduce
:
tr0_cmd
+=
" --use_reduce"
tr1_cmd
+=
" --use_reduce"
env0
=
{
"CUDA_VISIBLE_DEVICES"
:
"0"
}
env1
=
{
"CUDA_VISIBLE_DEVICES"
:
"1"
}
...
...
@@ -282,6 +322,10 @@ class TestDistBase(unittest.TestCase):
# FIXME: use terminate() instead of sigkill.
os
.
kill
(
ps0
.
pid
,
signal
.
SIGKILL
)
os
.
kill
(
ps1
.
pid
,
signal
.
SIGKILL
)
ps0
.
terminate
()
ps1
.
terminate
()
ps0
.
wait
()
ps1
.
wait
()
FNULL
.
close
()
self
.
assertAlmostEqual
(
local_first_loss
,
dist_first_loss
,
delta
=
delta
)
...
...
python/paddle/fluid/tests/unittests/test_dist_mnist.py
浏览文件 @
ec2aed2a
...
...
@@ -17,10 +17,51 @@ import unittest
from
test_dist_base
import
TestDistBase
class
TestDistSeResneXt2x2
(
TestDistBase
):
class
TestDistMnist2x2
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
self
.
_use_reduce
=
False
def
test_se_resnext
(
self
):
self
.
check_with_place
(
"dist_mnist.py"
,
delta
=
1e-7
)
class
TestDistMnist2x2WithMemopt
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
self
.
_mem_opt
=
True
def
test_se_resnext
(
self
):
self
.
check_with_place
(
"dist_mnist.py"
,
delta
=
1e-7
)
class
TestDistMnistAsync
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
False
self
.
_use_reduce
=
False
def
test_se_resnext
(
self
):
self
.
check_with_place
(
"dist_mnist.py"
,
delta
=
200
)
# FIXME(typhoonzero): enable these tests once we have 4
# 4 GPUs on CI machine, and the base class should be updated.
#
# class TestDistMnist2x2ReduceMode(TestDistBase):
# def _setup_config(self):
# self._sync_mode = True
# self._use_reduce = True
# def test_se_resnext(self):
# self.check_with_place("dist_mnist.py", delta=1e-7)
# class TestDistMnistAsyncReduceMode(TestDistBase):
# def _setup_config(self):
# self._sync_mode = False
# self._use_reduce = True
# def test_se_resnext(self):
# self.check_with_place("dist_mnist.py", delta=200)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
浏览文件 @
ec2aed2a
...
...
@@ -18,9 +18,20 @@ from test_dist_base import TestDistBase
class
TestDistSeResneXt2x2
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
def
test_se_resnext
(
self
):
self
.
check_with_place
(
"dist_se_resnext.py"
,
delta
=
1e-7
)
class
TestDistSeResneXt2x2Async
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
False
def
test_se_resnext
(
self
):
self
.
check_with_place
(
"dist_se_resnext.py"
,
delta
=
100
)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_dist_train.py
浏览文件 @
ec2aed2a
...
...
@@ -100,7 +100,7 @@ class TestSendOp(unittest.TestCase):
main
.
global_block
().
append_op
(
type
=
"fetch_barrier"
,
inputs
=
{},
outputs
=
{},
outputs
=
{
"Out"
:
[]
},
attrs
=
{
"endpoints"
:
[
"127.0.0.1:{0}"
.
format
(
port
)],
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
...
...
python/paddle/fluid/tests/unittests/test_dist_transformer.py
浏览文件 @
ec2aed2a
...
...
@@ -15,14 +15,55 @@
from
__future__
import
print_function
import
unittest
import
paddle
from
test_dist_base
import
TestDistBase
class
TestDistTransformer2x2
(
TestDistBase
):
def
download_files
():
url_prefix
=
'http://paddle-unittest-data.cdn.bcebos.com/dist_transformer/'
vocab_url
=
url_prefix
+
'vocab.bpe.32000'
vocab_md5
=
'a86d345ca6e27f6591d0dccb1b9be853'
paddle
.
dataset
.
common
.
download
(
vocab_url
,
'test_dist_transformer'
,
vocab_md5
)
local_train_url
=
url_prefix
+
'train.tok.clean.bpe.32000.en-de'
local_train_md5
=
'033eb02b9449e6dd823f050782ac8914'
paddle
.
dataset
.
common
.
download
(
local_train_url
,
'test_dist_transformer'
,
local_train_md5
)
train0_url
=
url_prefix
+
'train.tok.clean.bpe.32000.en-de.train_0'
train0_md5
=
'ddce7f602f352a0405267285379a38b1'
paddle
.
dataset
.
common
.
download
(
train0_url
,
'test_dist_transformer'
,
train0_md5
)
train1_url
=
url_prefix
+
'train.tok.clean.bpe.32000.en-de.train_1'
train1_md5
=
'8757798200180285b1a619cd7f408747'
paddle
.
dataset
.
common
.
download
(
train1_url
,
'test_dist_transformer'
,
train1_md5
)
test_url
=
url_prefix
+
'newstest2013.tok.bpe.32000.en-de'
test_md5
=
'9dd74a266dbdb25314183899f269b4a2'
paddle
.
dataset
.
common
.
download
(
test_url
,
'test_dist_transformer'
,
test_md5
)
class
TestDistTransformer2x2Sync
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
def
test_transformer
(
self
):
download_files
()
#Note: loss on test dataset of the first 5 batch are:
# 10.518872, 10.518871, 10.518868, 10.518862, 10.518855
self
.
check_with_place
(
"dist_transformer.py"
,
delta
=
1e-7
)
class
TestDistTransformer2x2Async
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
False
def
test_transformer
(
self
):
# TODO(paddle-dev): check if the delta is OK.
# Usually start around ~8000 and converge to ~5000
self
.
check_with_place
(
"dist_transformer.py"
,
delta
=
400
)
download_files
()
self
.
check_with_place
(
"dist_transformer.py"
,
delta
=
1.0
)
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_dist_word2vec.py
浏览文件 @
ec2aed2a
...
...
@@ -18,8 +18,19 @@ from test_dist_base import TestDistBase
class
TestDistSeResneXt2x2
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
def
test_se_resnext
(
self
):
self
.
check_with_place
(
"dist_word2vec.py"
,
delta
=
1e-4
)
class
TestDistSeResneXt2x2Async
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
False
def
test_se_resnext
(
self
):
self
.
check_with_place
(
"dist_word2vec.py"
,
delta
=
1
e-7
)
self
.
check_with_place
(
"dist_word2vec.py"
,
delta
=
1
)
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/transpiler/details/program_utils.py
浏览文件 @
ec2aed2a
...
...
@@ -39,3 +39,136 @@ def find_op_by_output_arg(block, arg_name):
if
arg_name
in
op
.
output_arg_names
:
return
index
return
-
1
def
get_indent_space
(
indent
,
space_num
=
4
):
ret
=
""
for
i
in
range
(
0
,
indent
*
space_num
):
ret
+=
" "
return
ret
def
variable_to_code
(
var
):
"""
Get readable codes of fluid variable.
Args:
var: A fluid operator.
Returns:
string: The formatted string.
"""
if
var
.
type
==
core
.
VarDesc
.
VarType
.
SELECTED_ROWS
or
var
.
type
==
core
.
VarDesc
.
VarType
.
LOD_TENSOR
:
var_str
=
"{name} : fluid.{type}.shape{shape}.astype({dtype})"
.
\
format
(
i
=
"{"
,
e
=
"}"
,
name
=
var
.
name
,
type
=
var
.
type
,
shape
=
var
.
shape
,
dtype
=
var
.
dtype
)
else
:
var_str
=
"{name} : fluid.{type})"
.
\
format
(
i
=
"{"
,
e
=
"}"
,
name
=
var
.
name
,
type
=
var
.
type
)
if
type
(
var
)
==
paddle
.
fluid
.
framework
.
Parameter
:
if
var
.
trainable
:
var_str
=
"trainable parameter "
+
var_str
else
:
var_str
=
"parameter "
+
var_str
else
:
var_str
=
"var "
+
var_str
if
var
.
persistable
:
var_str
=
"persist "
+
var_str
return
var_str
def
op_to_code
(
op
):
"""
Get readable codes of fluid operator.
Args:
op: A fluid operator.
Returns:
string: The foramtted string.
"""
outputs_str
=
"{"
for
i
in
range
(
0
,
len
(
op
.
output_names
)):
outputs_str
+=
"{name}="
.
format
(
name
=
op
.
output_names
[
i
])
o
=
op
.
output
(
op
.
output_names
[
i
])
outputs_str
+=
"{value}"
.
format
(
value
=
o
)
if
i
!=
len
(
op
.
output_names
)
-
1
:
outputs_str
+=
", "
outputs_str
+=
"}"
inputs_str
=
"{"
for
i
in
range
(
0
,
len
(
op
.
input_names
)):
inputs_str
+=
"{name}="
.
format
(
name
=
op
.
input_names
[
i
])
o
=
op
.
input
(
op
.
input_names
[
i
])
inputs_str
+=
"{value}"
.
format
(
value
=
o
)
if
i
!=
len
(
op
.
input_names
)
-
1
:
inputs_str
+=
", "
inputs_str
+=
"}"
attrs_str
=
""
for
i
in
range
(
0
,
len
(
op
.
attr_names
)):
name
=
op
.
attr_names
[
i
]
attr_type
=
op
.
desc
.
attr_type
(
name
)
if
attr_type
==
core
.
AttrType
.
BLOCK
:
a
=
"{name} = block[{value}]"
.
format
(
name
=
name
,
type
=
attr_type
,
value
=
op
.
block_attr_id
(
name
))
attrs_str
+=
a
continue
if
attr_type
==
core
.
AttrType
.
BLOCKS
:
a
=
"{name} = blocks{value}"
.
format
(
name
=
name
,
type
=
attr_type
,
value
=
op
.
blocks_attr_ids
(
name
))
attrs_str
+=
a
continue
a
=
"{name} = {value}"
.
format
(
name
=
name
,
type
=
attr_type
,
value
=
op
.
desc
.
attr
(
name
))
attrs_str
+=
a
if
i
!=
len
(
op
.
attr_names
)
-
1
:
attrs_str
+=
", "
if
outputs_str
!=
"{}"
:
op_str
=
"{outputs} = {op_type}(inputs={inputs}, {attrs})"
.
\
format
(
outputs
=
outputs_str
,
op_type
=
op
.
type
,
inputs
=
inputs_str
,
attrs
=
attrs_str
)
else
:
op_str
=
"{op_type}(inputs={inputs}, {attrs})"
.
\
format
(
op_type
=
op
.
type
,
inputs
=
inputs_str
,
attrs
=
attrs_str
)
return
op_str
def
block_to_code
(
block
,
block_idx
):
indent
=
0
print
(
"{0}{1} // block {2}"
.
format
(
get_indent_space
(
indent
),
'{'
,
block_idx
))
indent
+=
1
# sort all vars
all_vars
=
sorted
(
block
.
vars
.
iteritems
(),
key
=
lambda
x
:
x
[
0
])
for
var
in
all_vars
:
print
(
"{}{}"
.
format
(
get_indent_space
(
indent
),
variable_to_code
(
var
[
1
])))
if
len
(
all_vars
)
>
0
:
print
(
""
)
for
op
in
block
.
ops
:
print
(
"{}{}"
.
format
(
get_indent_space
(
indent
),
op_to_code
(
op
)))
indent
-=
1
print
(
"{0}{1}"
.
format
(
get_indent_space
(
indent
),
'}'
))
def
program_to_code
(
prog
):
"""
Print readable codes of fluid program.
Args:
prog : A fluid program.
An example result like bellow:
https://github.com/PaddlePaddle/Paddle/pull/12673
"""
block_idx
=
0
for
block
in
prog
.
blocks
:
block_to_code
(
block
,
block_idx
)
block_idx
+=
1
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
ec2aed2a
...
...
@@ -31,9 +31,10 @@ Steps to transpile pserver:
"""
import
math
import
random
import
sys
import
numpy
as
np
import
collections
import
random
from
.ps_dispatcher
import
RoundRobin
,
HashName
,
PSDispatcher
from
..
import
core
,
framework
...
...
@@ -181,7 +182,8 @@ class DistributeTranspiler(object):
program
=
None
,
pservers
=
"127.0.0.1:6174"
,
trainers
=
1
,
sync_mode
=
True
):
sync_mode
=
True
,
startup_program
=
None
):
"""
Run the transpiler.
...
...
@@ -194,13 +196,17 @@ class DistributeTranspiler(object):
list.
trainers (int): number of trainers in the distributed job.
sync_mode (bool): Do sync training or not, default is True.
startup_program (Program|None): startup_program to transpile,
default is fluid.default_main_program().
"""
if
program
is
None
:
program
=
default_main_program
()
if
startup_program
is
None
:
startup_program
=
default_startup_program
()
self
.
origin_program
=
program
self
.
origin_startup_program
=
default_startup_program
().
clone
()
self
.
startup_program
=
startup_program
self
.
origin_startup_program
=
self
.
startup_program
.
clone
()
self
.
startup_program
=
default_startup_program
()
self
.
trainer_num
=
trainers
self
.
sync_mode
=
sync_mode
self
.
trainer_id
=
trainer_id
...
...
@@ -260,6 +266,10 @@ class DistributeTranspiler(object):
name
=
framework
.
generate_control_dev_var_name
())
grad_name_to_send_dummy_out
[
grad_varname
]
=
dummy_output
# get send op_role_var, if not splited, the grad should have .trainer suffix
# if splited, grad should be the original grad var name (split_by_ref and send
# will be on the same place). ParallelExecutor
# will use op_role_var to get expected device place to run this op.
program
.
global_block
().
_insert_op
(
index
=
index
+
1
,
type
=
"send"
,
...
...
@@ -268,18 +278,23 @@ class DistributeTranspiler(object):
attrs
=
{
"epmap"
:
eplist
,
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
,
OP_ROLE_VAR_ATTR_NAME
:
[
self
.
grad_name_to_param_name
[
grad_varname
],
grad_varname
],
OP_ROLE_VAR_ATTR_NAME
:
[
self
.
grad_name_to_param_name
[
grad_varname
],
splited_grad_varname
],
"sync_mode"
:
not
self
.
sync_mode
,
})
for
_
,
var
in
enumerate
(
splited_vars
):
send_vars
.
append
(
var
)
if
self
.
sync_mode
:
send_barrier_out
=
program
.
global_block
().
create_var
(
name
=
framework
.
generate_control_dev_var_name
())
input_deps
=
grad_name_to_send_dummy_out
.
values
()
program
.
global_block
().
append_op
(
type
=
"send_barrier"
,
inputs
=
{},
outputs
=
{},
inputs
=
{
"X"
:
input_deps
},
outputs
=
{
"Out"
:
send_barrier_out
},
attrs
=
{
"endpoints"
:
pserver_endpoints
,
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
...
...
@@ -297,32 +312,46 @@ class DistributeTranspiler(object):
self
.
param_grad_ep_mapping
[
ep
][
"grads"
].
append
(
send_vars
[
i
])
# step4: Concat the parameters splits together after recv.
all_recv_outputs
=
[]
for
param_varname
,
splited_var
in
six
.
iteritems
(
self
.
param_var_mapping
):
eps
=
[]
for
var
in
splited_var
:
index
=
[
v
.
name
for
v
in
recv_vars
].
index
(
var
.
name
)
eps
.
append
(
eplist
[
index
])
grad_send_dummy_out
=
grad_name_to_send_dummy_out
[
if
self
.
sync_mode
:
recv_dep_in
=
send_barrier_out
else
:
# connect deps to send op in async mode
recv_dep_in
=
grad_name_to_send_dummy_out
[
self
.
param_name_to_grad_name
[
param_varname
]]
all_recv_outputs
.
extend
(
splited_var
)
# get recv op_role_var, if not splited, the grad should have .trainer suffix
# if splited, grad should be the original grad var name. ParallelExecutor
# will use op_role_var to get expected device place to run this op.
orig_grad_name
=
self
.
param_name_to_grad_name
[
param_varname
]
recv_op_role_var_name
=
orig_grad_name
splited_trainer_grad
=
self
.
grad_var_mapping
[
orig_grad_name
]
if
len
(
splited_trainer_grad
)
==
1
:
recv_op_role_var_name
=
splited_trainer_grad
[
0
].
name
program
.
global_block
().
append_op
(
type
=
"recv"
,
inputs
=
{
"X"
:
[
grad_send_dummy_out
]},
inputs
=
{
"X"
:
[
recv_dep_in
]},
outputs
=
{
"Out"
:
splited_var
},
attrs
=
{
"epmap"
:
eps
,
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
,
OP_ROLE_VAR_ATTR_NAME
:
[
param_varname
,
self
.
param_name_to_grad_name
[
param_varname
]
],
OP_ROLE_VAR_ATTR_NAME
:
[
param_varname
,
recv_op_role_var_name
],
"sync_mode"
:
not
self
.
sync_mode
})
if
self
.
sync_mode
:
# form a WAW dependency
program
.
global_block
().
append_op
(
type
=
"fetch_barrier"
,
inputs
=
{},
outputs
=
{},
outputs
=
{
"Out"
:
all_recv_outputs
},
attrs
=
{
"endpoints"
:
pserver_endpoints
,
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
...
...
@@ -359,20 +388,17 @@ class DistributeTranspiler(object):
return
self
.
origin_program
def
_get_trainer_startup_program
(
self
,
recv_vars
,
eplist
,
startup_program
=
None
):
def
_get_trainer_startup_program
(
self
,
recv_vars
,
eplist
):
"""
Get transpiled trainer side startup program.
Args:
startup_program(Program): Startup program.
recv_vars (list): Variable list to recv for current trainer_id
eplist (list): A list of strings indicating
Returns:
Program: trainer side startup program.
"""
if
startup_program
is
None
:
startup_program
=
self
.
startup_program
# FIXME(gongwb): delete not need ops.
...
...
@@ -406,10 +432,12 @@ class DistributeTranspiler(object):
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
})
fetch_barrier_out
=
startup_program
.
global_block
().
create_var
(
name
=
framework
.
generate_control_dev_var_name
())
startup_program
.
global_block
().
append_op
(
type
=
"fetch_barrier"
,
inputs
=
{},
outputs
=
{},
outputs
=
{
"Out"
:
fetch_barrier_out
},
attrs
=
{
"endpoints"
:
self
.
pserver_endpoints
,
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
...
...
@@ -419,7 +447,18 @@ class DistributeTranspiler(object):
#add concat ops to merge splited parameters received from parameter servers.
if
len
(
splited_var
)
<=
1
:
continue
# NOTE: if enable memory optimization, origin vars maybe removed.
if
startup_program
.
global_block
().
vars
.
has_key
(
varname
):
orig_param
=
startup_program
.
global_block
().
vars
[
varname
]
else
:
origin_param_var
=
self
.
origin_program
.
global_block
().
vars
[
varname
]
orig_param
=
startup_program
.
global_block
().
create_var
(
name
=
varname
,
persistable
=
origin_param_var
.
persistable
,
type
=
origin_param_var
.
type
,
dtype
=
origin_param_var
.
dtype
,
shape
=
origin_param_var
.
shape
)
startup_program
.
global_block
().
append_op
(
type
=
"concat"
,
inputs
=
{
"X"
:
splited_var
},
...
...
@@ -442,7 +481,9 @@ class DistributeTranspiler(object):
# NOTE: assume blocks of the same variable is not distributed
# on the same pserver, only change param/grad varnames for
# trainers to fetch.
sys
.
stderr
.
write
(
"get_pserver_program() is deprecated, call
\
get_pserver_programs() to get pserver main and startup
\
in a single call."
)
# step1
pserver_program
=
Program
()
pserver_program
.
random_seed
=
self
.
origin_program
.
random_seed
...
...
@@ -626,32 +667,58 @@ class DistributeTranspiler(object):
attrs
=
attrs
)
pserver_program
.
_sync_with_cpp
()
# save pserver program to generate pserver side startup relatively.
self
.
pserver_program
=
pserver_program
return
pserver_program
def
get_pserver_programs
(
self
,
endpoint
):
"""
Get pserver side main program and startup program for distributed training.
Args:
endpoint (str): current pserver endpoint.
Returns:
tuple: (main_program, startup_program), of type "Program"
"""
pserver_prog
=
self
.
get_pserver_program
(
endpoint
)
pserver_startup
=
self
.
get_startup_program
(
endpoint
)
return
pserver_prog
,
pserver_startup
def
get_startup_program
(
self
,
endpoint
,
pserver_program
,
pserver_program
=
None
,
startup_program
=
None
):
"""
**Deprecated**
Get startup program for current parameter server.
Modify operator input variables if there are variables that
were split to several blocks.
Args:
endpoint (str): current pserver endpoint.
pserver_program (Program): call get_pserver_program first and
pass the result here.
startup_program (Program): if pass None, will use
default_startup_program
pserver_program (Program): deprecated, call get_pserver_program first.
startup_program (Program): deprecated, should pass startup_program
when initalizing
Returns:
Program: parameter server side startup program.
"""
sys
.
stderr
.
write
(
"get_startup_program() is deprecated, call
\
get_pserver_programs() to get pserver main and startup
\
in a single call."
)
if
pserver_program
!=
None
:
sys
.
stderr
.
write
(
"passing pserver_program to get_startup_program()
\
is deprecated, you can use new API get_pserver_programs() to
\
get both pserver main program and startup program."
)
if
startup_program
!=
None
:
sys
.
stderr
.
write
(
"passing startup_program to get_startup_program()
\
is deprecated, use fluid.program_guard() or pass this argument
\
to transpile() call."
)
s_prog
=
Program
()
if
not
startup_program
:
orig_s_prog
=
default_startup_program
()
else
:
orig_s_prog
=
startup_program
orig_s_prog
=
self
.
startup_program
s_prog
.
random_seed
=
orig_s_prog
.
random_seed
params
=
self
.
param_grad_ep_mapping
[
endpoint
][
"params"
]
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录