机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Unverified commit d0684930
parameter dispather. (#12666)
Authored on Aug 15, 2018 by gongweibao; committed via GitHub on Aug 15, 2018
Parent: efc5392d
Showing 9 changed files with 162 additions and 31 deletions (+162 -31):

    paddle/fluid/framework/threadpool.cc                          +7   -0
    paddle/fluid/operators/distributed/variable_response.cc       +6   -1
    paddle/fluid/operators/listen_and_serv_op.cc                  +4   -1
    python/paddle/fluid/__init__.py                               +1   -1
    python/paddle/fluid/initializer.py                            +0   -1
    python/paddle/fluid/tests/unittests/CMakeLists.txt            +2   -2
    python/paddle/fluid/tests/unittests/test_dist_train.py        +17  -0
    python/paddle/fluid/tests/unittests/test_dist_transpiler.py   +37  -13
    python/paddle/fluid/transpiler/distribute_transpiler.py       +88  -12
paddle/fluid/framework/threadpool.cc

@@ -20,6 +20,9 @@
 DEFINE_int32(io_threadpool_size, 100,
              "number of threads used for doing IO, default 100");
 
+DEFINE_int32(dist_threadpool_size, 0,
+             "number of threads used for distributed executed.");
+
 namespace paddle {
 namespace framework {

@@ -35,6 +38,10 @@ void ThreadPool::Init() {
   if (threadpool_.get() == nullptr) {
     // TODO(Yancey1989): specify the max threads number
     int num_threads = std::thread::hardware_concurrency();
+    if (FLAGS_dist_threadpool_size > 0) {
+      num_threads = FLAGS_dist_threadpool_size;
+      VLOG(1) << "set dist_threadpool_size to " << num_threads;
+    }
     PADDLE_ENFORCE_GT(num_threads, 0);
     threadpool_.reset(new ThreadPool(num_threads));
   }
paddle/fluid/operators/distributed/variable_response.cc

@@ -190,12 +190,15 @@ bool VariableResponse::ProcSerializedField(
 #endif
   }
 
+  VLOG(7) << "ProcSerializedField:" << meta_.varname()
+          << ", type:" << meta_.type() << std::endl;
+
   framework::DDim dims = GetDims(meta_.dims());
   if (meta_.type() == sendrecv::LOD_TENSOR) {
     PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!");
     if (!CopyLodTensorData(input, *dev_ctx_, dims, num_bytes)) {
       return false;
     }
     return true;
   }

@@ -206,7 +209,9 @@ bool VariableResponse::ProcSerializedField(
     return true;
   }
 
-  return true;
+  PADDLE_ENFORCE("not supported var types:", meta_.varname(), meta_.type());
+
+  return false;
 }
 
 };  // namespace distributed
paddle/fluid/operators/listen_and_serv_op.cc

@@ -123,8 +123,11 @@ void ListenAndServOp::RunSyncLoop(
       optimize_prepared.begin(),
       std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
 
+  // Trainers will get all parameters from pserver in the
+  // startup program, so we will wait RequestGet first
+  rpc_service_->SetCond(distributed::kRequestGet);
+  rpc_service_->WaitBarrier(distributed::kRequestGet);
   rpc_service_->ResetBarrierCounter();
-
   while (true) {
     rpc_service_->Profiler().OneStep();
     // Get from multiple trainers, we don't care about the order in which
python/paddle/fluid/__init__.py

@@ -122,7 +122,7 @@ def __bootstrap__():
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
         'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
         'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        'cpu_deterministic'
+        "dist_threadpool_size", 'cpu_deterministic'
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
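Since dist_threadpool_size is now listed in read_env_flags, it should be settable through the corresponding FLAGS_dist_threadpool_size environment variable before paddle.fluid is imported, and ThreadPool::Init() above then picks it up. A minimal usage sketch, assuming the usual FLAGS_<name> environment mechanism of __bootstrap__ (the value 16 is only an illustration):

    # Minimal sketch, assuming the flag is read from the environment by
    # __bootstrap__ at import time. Leaving the flag at its default of 0
    # falls back to std::thread::hardware_concurrency() in ThreadPool::Init().
    import os
    os.environ['FLAGS_dist_threadpool_size'] = '16'  # illustrative value

    import paddle.fluid as fluid  # flag is consumed during import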
python/paddle/fluid/initializer.py

@@ -15,7 +15,6 @@
 from . import framework
 import numpy as np
 import contextlib
 from .framework import convert_np_dtype_to_dtype_
 from .core import VarDesc
 
 __all__ = [
python/paddle/fluid/tests/unittests/CMakeLists.txt

@@ -59,8 +59,8 @@ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=$
 if(WITH_DISTRIBUTE)
     py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
     set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
-    set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180)
-    set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180)
+    set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
+    set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
python/paddle/fluid/tests/unittests/test_dist_train.py

@@ -26,6 +26,12 @@ from paddle.fluid.layers.io import ListenAndServ
 from paddle.fluid.layers.io import Recv
 from paddle.fluid.layers.io import Send
+from paddle.fluid import core
+
+RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName(
+)
+RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
+
 
 class TestSendOp(unittest.TestCase):
     def test_send(self):

@@ -89,18 +95,29 @@ class TestSendOp(unittest.TestCase):
     def init_client(self, place, port):
         main = fluid.Program()
         with fluid.program_guard(main):
+            main.global_block().append_op(
+                type="fetch_barrier",
+                inputs={},
+                outputs={},
+                attrs={
+                    "endpoints": ["127.0.0.1:{0}".format(port)],
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
+
             x = layers.data(
                 shape=[32, 32],
                 dtype='float32',
                 name='X',
                 append_batch_size=False)
             fluid.initializer.Constant(value=2.3)(x, main.global_block())
+
             get_var = main.global_block().create_var(
                 name="scale_0.tmp_0",  # server side var
                 dtype="float32",
                 persistable=False,
                 shape=[32, 32])
+            fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
 
             Send("127.0.0.1:%d" % port, [x])
             o = Recv("127.0.0.1:%d" % port, [get_var])
python/paddle/fluid/tests/unittests/test_dist_transpiler.py

@@ -18,6 +18,7 @@ import unittest
 import paddle.fluid as fluid
 from paddle.fluid.transpiler.distribute_transpiler import delete_ops
 import traceback
+import collections
 
 
 class TranspilerTest(unittest.TestCase):

@@ -53,9 +54,18 @@ class TranspilerTest(unittest.TestCase):
         self.origin_prog = main.clone()
         return main
 
-    def get_trainer(self, config=None, sync_mode=True):
-        t = self._transpiler_instance(config, sync_mode)
-        return t.get_trainer_program()
+    def get_trainer(self, config=None):
+        src = fluid.default_startup_program().clone()
+
+        t = self._transpiler_instance(config)
+
+        trainer_main = t.get_trainer_program()
+        trainer_startup = fluid.default_startup_program()
+
+        assert (src.num_blocks == 1)
+        assert (trainer_startup.num_blocks == src.num_blocks)
+
+        return trainer_main, trainer_startup
 
     def get_pserver(self, ep, config=None, sync_mode=True):
         t = self._transpiler_instance(config, sync_mode)

@@ -91,7 +101,21 @@ class TestBasicModel(TranspilerTest):
         pserver, startup = self.get_pserver(self.pserver1_ep)
         pserver2, startup2 = self.get_pserver(self.pserver2_ep)
 
-        trainer = self.get_trainer()
+        trainer, trainer_startup = self.get_trainer()
+
+        # splited var blocks should be in startup program
+        self.assertTrue("fc_w.block0" in trainer_startup.global_block().vars)
+        self.assertTrue("fc_w.block1" in trainer_startup.global_block().vars)
+        self.assertTrue("fc_w" in trainer_startup.global_block().vars)
+        self.assertTrue("fc_b" in trainer_startup.global_block().vars)
+        self.assertTrue("fc_w@GRAD" not in trainer_startup.global_block().vars)
+        self.assertTrue("fc_b@GRAD" not in trainer_startup.global_block().vars)
+
+        src = [op.type for op in trainer_startup.global_block().ops]
+        dst = ['fill_constant', 'fill_constant', 'uniform_random', 'recv', 'recv', \
+            'fetch_barrier', 'concat']
+
+        self.assertEqual(src, dst)
 
         self.assertEqual([op.type for op in trainer.global_block().ops], [
             'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',

@@ -142,7 +166,7 @@ class TestBasicModelWithLargeBlockSize(TranspilerTest):
         pserver, startup = self.get_pserver(self.pserver1_ep, config)
         pserver2, startup2 = self.get_pserver(self.pserver2_ep, config)
 
-        trainer = self.get_trainer(config)
+        trainer, _ = self.get_trainer(config)
 
         self.assertEqual([op.type for op in trainer.global_block().ops], [
             'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',

@@ -226,7 +250,7 @@ class TestLRDecay(TranspilerTest):
     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
 
         self.assertEqual(len(pserver.blocks), 4)
         lr_decay_ops = [op.type for op in pserver.blocks[1].ops]

@@ -256,7 +280,7 @@ class TestLRDecayConditional(TranspilerTest):
     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
 
         serv_op = pserver.blocks[0].ops[0]
         sub_blocks = []

@@ -305,7 +329,7 @@ class TestL2Decay(TranspilerTest):
     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
 
         self.assertEqual(len(pserver.blocks), 3)
         self.assertEqual([op.type for op in pserver.blocks[1].ops],

@@ -340,7 +364,7 @@ class TestL2DecayWithPiecewise(TranspilerTest):
     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
 
         self.assertEqual(len(pserver.blocks), 9)
         self.assertEqual([op.type for op in pserver.blocks[1].ops], [

@@ -415,7 +439,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
         self.assertEqual([op.type for op in pserver1.blocks[2].ops],
                          ["sum", "adam", "scale", "scale"])
 
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',

@@ -453,7 +477,7 @@ class TestDistLookupTable(TestDistLookupTableBase):
         # 5 save table
         self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
 
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',

@@ -486,7 +510,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
         self.assertEqual([op.type for op in pserver1.blocks[2].ops],
                          ["adam", "scale", "scale"])
 
-        trainer = self.get_trainer(config)
+        trainer, _ = self.get_trainer(config)
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',

@@ -525,7 +549,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
         # 5 save table
         self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
 
-        trainer = self.get_trainer(config)
+        trainer, _ = self.get_trainer(config)
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
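The updated get_trainer() helper above returns both the trainer main program and the trainer startup program, and TestBasicModel asserts the exact op sequence the transpiler now emits into that startup program. As an illustrative sketch only (assuming a model and transpile call like the test fixture's), the same ops can be inspected directly:

    # Illustrative sketch: after DistributeTranspiler.transpile() has run for a
    # trainer (as the TranspilerTest fixture does), the default startup program
    # holds the newly inserted parameter-fetch ops.
    import paddle.fluid as fluid

    startup = fluid.default_startup_program()
    print([op.type for op in startup.global_block().ops])
    # For the two-pserver fc model in TestBasicModel this is expected to be:
    # ['fill_constant', 'fill_constant', 'uniform_random', 'recv', 'recv',
    #  'fetch_barrier', 'concat']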
python/paddle/fluid/transpiler/distribute_transpiler.py

@@ -195,6 +195,9 @@ class DistributeTranspiler(object):
         if program is None:
             program = default_main_program()
         self.origin_program = program
+        self.origin_startup_program = default_startup_program().clone()
+        self.startup_program = default_startup_program()
+
         self.trainer_num = trainers
         self.sync_mode = sync_mode
         self.trainer_id = trainer_id

@@ -205,10 +208,10 @@ class DistributeTranspiler(object):
         ps_dispatcher = self.config.split_method(self.pserver_endpoints)
         self.has_distributed_lookup_table = self._has_distributed_lookup_table()
 
-        # split and create vars, then put splited vars in dicts for later use.
+        # step 1: split and create vars, then put splited vars in dicts for later use.
         self._init_splited_vars()
 
-        # step 3.1: insert send op to send gradient vars to parameter servers
+        # step 2: insert send op to send gradient vars to parameter servers
         ps_dispatcher.reset()
         send_vars = []

@@ -265,7 +268,7 @@ class DistributeTranspiler(object):
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                 })
 
-        # step 3.2: insert recv op to receive parameters from parameter server
+        # step 3: insert recv op to receive parameters from parameter server
         recv_vars = []
         for _, var in enumerate(send_vars):
             recv_vars.append(self.grad_param_mapping[var])

@@ -312,6 +315,8 @@ class DistributeTranspiler(object):
                 outputs={"Out": [orig_param]},
                 attrs={"axis": 0})
 
+        self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist)
+
         if self.has_distributed_lookup_table:
             self._replace_lookup_table_op_with_prefetch(program,
                                                         pserver_endpoints)

@@ -328,8 +333,78 @@ class DistributeTranspiler(object):
         # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
         delete_ops(self.origin_program.global_block(), self.optimize_ops)
         self.origin_program.__str__()
 
         return self.origin_program
 
+    def _get_trainer_startup_program(self,
+                                     recv_vars,
+                                     eplist,
+                                     startup_program=None):
+        """
+        Get transpiled trainer side startup program.
+
+        Args:
+            startup_program(Program): Startup program.
+
+        Returns:
+            Program: trainer side startup program.
+        """
+        if startup_program is None:
+            startup_program = self.startup_program
+
+        # FIXME(gongwb): delete not need ops.
+        # note that: some parameter is not trainable and those ops can't be deleted.
+
+        for varname, splited_var in self.param_var_mapping.iteritems():
+            # Get the eplist of recv vars
+            eps = []
+            for var in splited_var:
+                index = [v.name for v in recv_vars].index(var.name)
+                eps.append(eplist[index])
+
+            for var in splited_var:
+                if startup_program.global_block().has_var(var.name):
+                    continue
+
+                startup_program.global_block().create_var(
+                    name=var.name,
+                    persistable=False,
+                    type=var.type,
+                    dtype=var.dtype,
+                    shape=var.shape,
+                    lod_level=var.lod_level)
+
+            op = startup_program.global_block().append_op(
+                type="recv",
+                inputs={},
+                outputs={"Out": splited_var},
+                attrs={
+                    "epmap": eps,
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
+
+        startup_program.global_block().append_op(
+            type="fetch_barrier",
+            inputs={},
+            outputs={},
+            attrs={
+                "endpoints": self.pserver_endpoints,
+                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+            })
+
+        for varname, splited_var in self.param_var_mapping.iteritems():
+            #add concat ops to merge splited parameters received from parameter servers.
+            if len(splited_var) <= 1:
+                continue
+            orig_param = startup_program.global_block().vars[varname]
+            startup_program.global_block().append_op(
+                type="concat",
+                inputs={"X": splited_var},
+                outputs={"Out": [orig_param]},
+                attrs={"axis": 0})
+
+        return startup_program
+
     def get_pserver_program(self, endpoint):
         """
         Get parameter server side program.

@@ -576,14 +651,16 @@ class DistributeTranspiler(object):
             new_outputs = dict()
             # do not append startup op if var is not on this pserver
             op_on_pserver = False
-            for key in op.output_names:
-                newname, _ = _get_splited_name_and_shape(op.output(key)[0])
-                if newname:
-                    op_on_pserver = True
-                    new_outputs[key] = created_var_map[newname]
-                elif op.output(key)[0] in pserver_vars:
-                    op_on_pserver = True
-                    new_outputs[key] = pserver_vars[op.output(key)[0]]
+            # TODO(gongwb): remove this line.
+            if op.type not in ["recv", "fetch_barrier", "concat"]:
+                for key in op.output_names:
+                    newname, _ = _get_splited_name_and_shape(op.output(key)[0])
+                    if newname:
+                        op_on_pserver = True
+                        new_outputs[key] = created_var_map[newname]
+                    elif op.output(key)[0] in pserver_vars:
+                        op_on_pserver = True
+                        new_outputs[key] = pserver_vars[op.output(key)[0]]
 
             if op_on_pserver:
                 # most startup program ops have no inputs

@@ -1022,7 +1099,6 @@ class DistributeTranspiler(object):
                 var_mapping[varname] = \
                     [program.global_block().var(orig_var.name)]
                 continue
-
             var_mapping[varname] = []
             orig_shape = orig_var.shape
             orig_dim1_flatten = 1
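Taken together with the listen_and_serv_op.cc change, the intended flow appears to be that a trainer runs its transpiled startup program first, pulling the initialized (and possibly split) parameters from the parameter servers via the recv/fetch_barrier/concat ops, before entering its training loop; the pserver in turn waits on the RequestGet barrier before starting the sync send/recv cycle. A minimal trainer-side sketch under that assumption (endpoints, trainer_id, and trainers below are placeholder values):

    # Minimal sketch, assuming the standard fluid distributed setup of this era;
    # the endpoint strings and trainer counts are placeholders.
    import paddle.fluid as fluid

    t = fluid.DistributeTranspiler()
    t.transpile(trainer_id=0,
                program=fluid.default_main_program(),
                pservers="127.0.0.1:6174,127.0.0.1:6175",
                trainers=2)

    trainer_prog = t.get_trainer_program()
    # transpile() also rewrites the default startup program in place, adding
    # recv/fetch_barrier/concat ops via _get_trainer_startup_program().
    startup_prog = fluid.default_startup_program()

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup_prog)  # fetch initialized parameters from the pservers
    # ... then run trainer_prog in the usual training loop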