Commit 3c65cc1b
Authored Mar 19, 2019 by dongdaxiang

add document for role_maker and fleet parameter, data_generator

Parent: f6c9232a

4 changed files with 246 additions and 27 deletions (+246 -27)
paddle/fluid/framework/data_set.cc                                  +2   -2
python/paddle/fluid/incubate/data_generator/__init__.py             +6   -0
python/paddle/fluid/incubate/fleet/base/role_maker.py               +127 -22
python/paddle/fluid/incubate/fleet/parameter_server/__init__.py     +111 -3
paddle/fluid/framework/data_set.cc

@@ -47,7 +47,7 @@ template <typename T>
 void DatasetImpl<T>::SetThreadNum(int thread_num) {
   int file_cnt = filelist_.size();
   if (file_cnt != 0 && thread_num > file_cnt) {
-    VLOG(1) << "DataSet thread num = " << thread_num
+    VLOG(3) << "DataSet thread num = " << thread_num
             << ", file num = " << file_cnt
             << ". Changing DataSet thread num = " << file_cnt;
     thread_num = file_cnt;
@@ -178,7 +178,7 @@ void DatasetImpl<T>::DestroyReaders() {
     t.join();
   }
   std::vector<std::shared_ptr<paddle::framework::DataFeed>>().swap(readers_);
-  LOG(WARNING) << "readers size: " << readers_.size();
+  VLOG(3) << "readers size: " << readers_.size();
 }

 template <typename T>
python/paddle/fluid/incubate/data_generator/__init__.py

@@ -19,6 +19,12 @@ __all__ = ['MultiSlotDataGenerator']
 class DataGenerator(object):
+    """
+    DataGenerator is a general Base class for user to inherit
+    A user who wants to define his/her own python processing logic
+    with paddle.fluid.dataset should inherit this class.
+    """
+
     def __init__(self):
         self._proto_info = None
         self.batch_size_ = 32
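For orientation, a minimal sketch of what "inherit this class" looks like in practice: a user-defined generator that parses one text line per sample. The generate_sample override and run_from_stdin call follow the MultiSlotDataGenerator usage pattern in this package, but the slot names ("words", "label") and the line format are purely illustrative and not part of this commit.

# Hypothetical DataGenerator subclass; slot names and line format are illustrative.
import paddle.fluid.incubate.data_generator as dg


class WordLabelGenerator(dg.MultiSlotDataGenerator):
    def generate_sample(self, line):
        # Each raw line is assumed to hold space-separated word ids followed
        # by an integer label; adapt the parsing to your own data format.
        def reader():
            fields = line.strip().split(" ")
            word_ids = [int(x) for x in fields[:-1]]
            label = [int(fields[-1])]
            yield [("words", word_ids), ("label", label)]

        return reader


if __name__ == "__main__":
    # Reads raw samples from stdin and writes the MultiSlot text format
    # consumed by paddle.fluid.dataset.
    WordLabelGenerator().run_from_stdin()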
python/paddle/fluid/incubate/fleet/base/role_maker.py

@@ -11,36 +11,68 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import sys


 class RoleMakerBase(object):
+    """
+    RoleMakerBase is a base class for assigning a role to current process
+    in distributed training.
+    A paddle developer can implement RoleMakerBase to design a role maker
+    for worker or pserver assignment.
+    """

     def __init__(self):
         self.role_maker_name_ = ""
         self.trainer_endpoints_ = []
         self.pserver_endpoints_ = []
         self.role_is_generated_ = False

     def is_worker(self):
+        """
+        return is_worker() of current process
+        """
         raise NotImplementedError("Please implement this method in child class")

     def is_server(self):
+        """
+        return is_server() of current process
+        """
         raise NotImplementedError("Please implement this method in child class")

     def get_local_ip(self):
+        """
+        return get local ip
+        """
         import socket
         self.ip_ = socket.gethostbyname(socket.gethostname())
         return self.ip_

     def get_trainer_endpoints(self):
+        """
+        return trainer endpoints
+        """
         return self.trainer_endpoints_

     def get_pserver_endpoints(self):
+        """
+        return pserver endpoints
+        """
         return self.pserver_endpoints_

     def generate_role(self):
+        """
+        generate_role() should be called to identify current process's role
+        """
         raise NotImplementedError("Please implement this method in child class")


 class MPIRoleMaker(RoleMakerBase):
+    """
+    MPIRoleMaker is a MPI-API based role maker which is a counter-part of K8SRoleMaker
+    mpi4py will be used if a developer inherits MPIRoleMaker
+    """

     def __init__(self):
         from mpi4py import MPI
         self.comm_ = MPI.COMM_WORLD
@@ -48,26 +80,44 @@ class MPIRoleMaker(RoleMakerBase):
         self.ips_ = None

     def get_rank(self):
+        """
+        return rank
+        """
         self.rank_ = self.comm_.Get_rank()
         return self.rank_

     def get_size(self):
+        """
+        return size
+        """
         self.size_ = self.comm_.Get_size()
         return self.size_

     def all_gather(self, obj):
+        """
+        all_gather(obj) will call MPI's allgather function
+        """
         self.barrier_all()
         return self.comm_.allgather(obj)

     def barrier_all(self):
+        """
+        barrier_all() will call MPI's barrier_all function
+        """
         self.comm_.barrier()

     def get_ips(self):
+        """
+        collect current distributed job's ip list
+        """
         if self.ips_ == None:
             self.ips_ = self.comm_.allgather(self.get_local_ip())
         return self.ips_

     def finalize(self):
+        """
+        finalize the current MPI instance.
+        """
         self.comm_.finalize()
@@ -83,44 +133,99 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         self.node_type_ = None
         self.proc_per_node_ = 2

+    def _check_role_generation(self):
+        if not self.role_is_generated_:
+            sys.stderr.write("generate_role() should be called first")
+            sys.exit(-1)
+            return False
+        return True
+
     def is_first_worker(self):
-        return self.is_worker() and 0 == self.worker_index()
+        """
+        return whether current process is the first worker assigned by role maker
+        """
+        if self._check_role_generation():
+            return self.is_worker() and 0 == self.worker_index()
+        return False

     def is_worker(self):
-        return self.node_type_ == 1
+        """
+        return whether current process is worker assigned by role maker
+        """
+        if self._check_role_generation():
+            return self.node_type_ == 1
+        return False

     def is_server(self):
-        return self.node_type_ == 0
+        """
+        return whether current process is server assigned by role maker
+        """
+        if self._check_role_generation():
+            return self.node_type_ == 0
+        return False

     def worker_num(self):
-        if self.is_worker():
-            return self.get_size()
+        """
+        return the current number of worker
+        """
+        if self._check_role_generation():
+            if self.is_worker():
+                return self.get_size()
+        return 0

     def server_num(self):
-        if self.is_server():
-            return self.get_size()
+        """
+        return the current number of server
+        """
+        if self._check_role_generation():
+            if self.is_server():
+                return self.get_size()
+        return 0

     def worker_index(self):
-        return self.rank_ / self.proc_per_node_
+        """
+        return the index of worker
+        """
+        if self._check_role_generation():
+            return self.rank_ / self.proc_per_node_
+        return 0

     def server_index(self):
-        return self.rank_ / self.proc_per_node_
+        """
+        return the index of server
+        """
+        if self._check_role_generation():
+            return self.rank_ / self.proc_per_node_
+        return 0

     def barrier_worker(self):
-        if self.is_worker():
-            self.node_type_comm_.barrier()
+        """
+        barrier all workers in current distributed job
+        """
+        if self._check_role_generation():
+            if self.is_worker():
+                self.node_type_comm_.barrier()

     def barrier_server(self):
-        if self.is_server():
-            self.node_type_comm_.barrier()
+        """
+        barrier all servers in current distributed job
+        """
+        if self._check_role_generation():
+            if self.is_server():
+                self.node_type_comm_.barrier()

     def generate_role(self):
-        # TODO(guru4elephant): only allow to be called once
-        self.trainer_endpoints_ = self.get_ips()
-        self.pserver_endpoints_ = self.get_ips()
-        if 0 == self.get_rank() % self.proc_per_node_ % 2:
-            self.node_type_ = 0
-        else:
-            self.node_type_ = 1
-        self.node_type_comm_ = self.comm_.Split(self.node_type_)
+        """
+        generate currently process's role
+        """
+        if not self.role_is_generated_:
+            # TODO(guru4elephant): only allow to be called once
+            self.trainer_endpoints_ = self.get_ips()
+            self.pserver_endpoints_ = self.get_ips()
+            if 0 == self.get_rank() % self.proc_per_node_ % 2:
+                self.node_type_ = 0
+            else:
+                self.node_type_ = 1
+            self.node_type_comm_ = self.comm_.Split(self.node_type_)
+            self.role_is_generated_ = True
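To make the symmetric role assignment above easier to follow, here is a small self-contained sketch (plain Python, no mpi4py) of the rule generate_role() applies: with proc_per_node_ = 2, ranks for which rank % proc_per_node_ % 2 == 0 become pservers (node_type_ 0) and the rest become workers (node_type_ 1), and worker_index()/server_index() come from dividing the rank by proc_per_node_. The function below only mirrors that arithmetic; it is not part of the commit.

# Illustrative only: mirrors MPISymetricRoleMaker's assignment rule without MPI.
PROC_PER_NODE = 2  # same value as self.proc_per_node_ in the diff above


def assign_role(rank):
    """Return (node_type, index) for a given rank.

    node_type 0 -> pserver, node_type 1 -> worker, following
    `0 == rank % proc_per_node_ % 2` in generate_role().
    """
    node_type = 0 if rank % PROC_PER_NODE % 2 == 0 else 1
    index = rank // PROC_PER_NODE  # what worker_index()/server_index() compute
    return node_type, index


if __name__ == "__main__":
    for rank in range(6):
        node_type, index = assign_role(rank)
        role = "pserver" if node_type == 0 else "worker"
        print("rank %d -> %s #%d" % (rank, role, index))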
python/paddle/fluid/incubate/fleet/parameter_server/__init__.py

@@ -22,7 +22,44 @@ import paddle.fluid as fluid
 class Fleet(object):
+    """
+    Fleet in Python. Fleet is used in distributed training. It is designed as a singlton instance
+    in c++. A Fleet() object will be initialized automatically when a user import this package as
+    fleet. The General interface Fleet supports are:
+    init(): which should be called only once in user's python scripts. init() will initialize
+            FleetWrapper in CPP, it will also initialize a RoleMaker which is used for identifying
+            current node's role, e.g. worker, server, etc.
+    stop(): will be called after a user finishes his/her training task. Fleet instance will be
+            destroyed when stop() is called.
+    init_pserver(): will be called by user. When a user knows current process is_worker(), he/she
+                    should call init_pserver() to initialize global information about parameter server
+    init_worker(): will be called by user. When a user knows current process is_server(), he/she
+                   should call init_worker() to initialize global information about worker and connect
+                   worker with pserver.
+    get_worker_num(): return the number of current task's worker node
+    get_server_num(): return the number of current task's pserver node
+    is_worker(): return whether current process is a worker
+    is_server(): return thether current process is a server
+    init_pserver_model(): initialize model parameters in pserver, called from a worker node
+    save_pserver_model(): save model parameters in pserver, called from a server node
+
+    Example:
+
+        .. code-block:: python
+
+           import paddle.fluid.incubate.fleet.parameter_server as fleet
+           from my_model import bow_net
+           model = bow_net()
+           fleet.init()
+           sgd_optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.0001)
+           sgd_optimizer = fleet.DistributedOptimizer(sgd_optimizer)
+           sgd_optimizer.minimize(model.loss)
+           exe = paddle.fluid.Executor(paddle.fluid.CPUPlace())
+           if fleet.is_worker():
+               exe.run(paddle.fluid.default_startup_program())
+               fleet.init_worker()  # init worker should be called before training
+               # do other things like training
+           elif fleet.is_server():
+               fleet.init_pserver()
+           fleet.stop()
+    """

     def __init__(self):
@@ -35,6 +72,11 @@ class Fleet(object):
         # TODO(guru4elephant)
         # this is a temporary solution
         # we will support more configurable RoleMaker for users in the future
+        """
+        init(): which should be called only once in user's python scripts. init() will initialize
+        FleetWrapper in CPP, it will also initialize a RoleMaker which is used for identifying
+        current node's role, e.g. worker, server, etc.
+        """
         if not self.is_initialized_:
             self.role_maker_ = MPISymetricRoleMaker()
             self.role_maker_.generate_role()
@@ -42,6 +84,10 @@ class Fleet(object):
             self.is_initialized_ = True

     def stop(self):
+        """
+        stop(): will be called after a user finishes his/her training task. Fleet instance will be
+        destroyed when stop() is called.
+        """
         self.role_maker_.barrier_worker()
         if self.role_maker_.is_first_worker():
             self._fleet_ptr.stop_server()
@@ -50,6 +96,10 @@ class Fleet(object):
         self.role_maker_.finalize()

     def init_pserver(self):
+        """
+        init_pserver(): will be called by user. When a user knows current process is_worker(), he/she
+        should call init_pserver() to initialize global information about parameter server
+        """
         if self._opt_info:
             if "fleet_desc" in self._opt_info:
                 self._dist_desc_str = text_format.MessageToString(
@@ -73,6 +123,11 @@ class Fleet(object):
                 sys.exit(-1)

     def init_worker(self):
+        """
+        init_worker(): will be called by user. When a user knows current process is_server(), he/she
+        should call init_worker() to initialize global information about worker and connect
+        worker with pserver.
+        """
         if self._opt_info:
             if "fleet_desc" in self._opt_info:
                 self._dist_desc_str = text_format.MessageToString(
@@ -93,30 +148,61 @@ class Fleet(object):
                 sys.exit(-1)

     def get_worker_num(self):
+        """
+        return the number of current job's worker num
+        """
         return self.role_maker_.worker_num()

     def get_server_num(self):
+        """
+        return the number of current job's server num
+        """
         return self.role_maker_.server_num()

     def is_worker(self):
+        """
+        return whether current node is a worker
+        """
         return self.role_maker_.is_worker()

     def is_server(self):
+        """
+        return whether current node is pserver
+        """
         return self.role_maker_.is_server()

     def init_pserver_model(self):
+        """
+        init pserver model called from pserver
+        """
         if self.role_maker_.is_first_worker():
             self._fleet_ptr.init_model()
         self.role_maker_.barrier_worker()

     def save_pserver_model(self, save_path):
+        """
+        save pserver model called from a worker
+        """
         self._fleet_ptr.save_model(save_path)

     def _set_opt_info(self, opt_info):
+        """
+        this function saves the result from DistributedOptimizer.minimize()
+        """
         self._opt_info = opt_info


 class DistributedOptimizer(object):
+    """
+    DistributedOptimizer is a wrapper for paddle.fluid.optimizer
+    A user should pass a paddle.fluid.optimizer to DistributedOptimizer
+    minimize() function is implemented.
+    DistributedOptimizer is the starting point for a user who wants to
+    run distributed training. The optimized information will be stored in
+    Fleet() instance who holds the global information about current distributed
+    training.
+    """

     def __init__(self, optimizer, dist_config={}):
         super(DistributedOptimizer, self).__init__()
         self._optimizer = optimizer
@@ -136,16 +222,38 @@ class DistributedOptimizer(object):
                  parameter_list=None,
                  no_grad_set=None,
                  callbacks=None):
-        pass
+        """
+        Currently, backward function can not be called through DistributedOptimizer
+        """
+        raise NotImplementedError()

     def apply_gradients(self, params_grads):
-        pass
+        """
+        Currently, apply_gradients function can not be called through DistributedOptimizer
+        """
+        raise NotImplementedError()

     def minimize(self,
                  loss,
                  startup_program=None,
                  parameter_list=None,
                  no_grad_set=None):
+        """
+        minimize a program through loss, loss can be a list in DistributedOptimizer
+        Args:
+            loss (Variable|Variable List): loss variable or loss variable list to run optimization.
+            startup_program (Program): startup_program for initializing parameters
+                in `parameter_list`.
+            parameter_list (list): list of Variables to update.
+            no_grad_set (set|None): set of Variables should be ignored.
+        Returns:
+            tuple: (optimize_ops, params_grads) which are, list of operators appended;
+            and list of (param, grad) Variables pair for optimization.
+        Note that in parameter server mode, a worker will not get anything about optimize_os
+        Because optmizer algorithms run on pserver side. We will make this usable in pserver
+        process, but currently the optimization part is written into Fleet(). A user does not
+        need to care about how to startup a pserver node.
+        """
         optimize_ops, param_grads, opt_info = \
             self._distributed_optimizer.minimize(
                 loss,
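As a footnote on the wrapper design that the DistributedOptimizer docstring describes, the sketch below shows the delegation pattern in isolation: the wrapper holds an ordinary optimizer, forwards minimize(), refuses backward() and apply_gradients(), and hands the extra opt_info back to Fleet via _set_opt_info(). All class names here are stand-ins; only the shape of the interaction reflects the diff above.

# Stand-in classes; only the delegation shape mirrors DistributedOptimizer.
class PlainOptimizer(object):
    """Plays the role of a paddle.fluid.optimizer instance."""

    def minimize(self, loss):
        # A real optimizer returns the appended optimize ops, the
        # (param, grad) pairs, and backend-specific info such as fleet_desc.
        return ["sgd_op"], [("w", "w@GRAD")], {"fleet_desc": "..."}


class WrappingOptimizer(object):
    """Holds an inner optimizer and forwards minimize(), like the diff above."""

    def __init__(self, optimizer, opt_info_sink=None):
        self._optimizer = optimizer
        self._opt_info_sink = opt_info_sink  # plays the role of Fleet()._set_opt_info

    def backward(self, loss):
        # Cannot be called through the wrapper, matching the diff.
        raise NotImplementedError()

    def apply_gradients(self, params_grads):
        raise NotImplementedError()

    def minimize(self, loss):
        optimize_ops, params_grads, opt_info = self._optimizer.minimize(loss)
        if self._opt_info_sink is not None:
            self._opt_info_sink(opt_info)  # Fleet keeps the result of minimize()
        return optimize_ops, params_grads


if __name__ == "__main__":
    collected = []
    opt = WrappingOptimizer(PlainOptimizer(), opt_info_sink=collected.append)
    print(opt.minimize("loss"))
    print(collected)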