Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
357311fd
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
357311fd
编写于
7月 02, 2019
作者:
G
guru4elephant
提交者:
GitHub
7月 02, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
make fleet support mpi job submit directly (#18441)
make fleet support mpi job submit directly.
上级
e0d8c6ac
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
25 additions
and
2 deletions
+25
-2
python/paddle/fluid/incubate/fleet/base/role_maker.py
python/paddle/fluid/incubate/fleet/base/role_maker.py
+2
-1
python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
.../fleet/parameter_server/distribute_transpiler/__init__.py
+23
-1
未找到文件。
python/paddle/fluid/incubate/fleet/base/role_maker.py
浏览文件 @
357311fd
...
@@ -198,7 +198,7 @@ class MPIRoleMaker(RoleMakerBase):
...
@@ -198,7 +198,7 @@ class MPIRoleMaker(RoleMakerBase):
"""
"""
finalize the current MPI instance.
finalize the current MPI instance.
"""
"""
pass
self
.
MPI
.
Finalize
()
def
_get_ips
(
self
):
def
_get_ips
(
self
):
"""
"""
...
@@ -356,6 +356,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
...
@@ -356,6 +356,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
print
(
"PaddleCloudRoleMaker() endpoints: %s"
%
self
.
endpoints
)
print
(
"PaddleCloudRoleMaker() endpoints: %s"
%
self
.
endpoints
)
self
.
endpoints
=
self
.
endpoints
.
split
(
","
)
self
.
endpoints
=
self
.
endpoints
.
split
(
","
)
self
.
_server_endpoints
=
self
.
endpoints
self
.
_server_endpoints
=
self
.
endpoints
self
.
_worker_endpoints
=
self
.
endpoints
if
self
.
role
.
upper
()
==
"PSERVER"
:
if
self
.
role
.
upper
()
==
"PSERVER"
:
self
.
_current_id
=
self
.
endpoints
.
index
(
self
.
current_endpoint
)
self
.
_current_id
=
self
.
endpoints
.
index
(
self
.
current_endpoint
)
self
.
_role
=
Role
.
SERVER
self
.
_role
=
Role
.
SERVER
...
...
python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
浏览文件 @
357311fd
...
@@ -26,6 +26,7 @@ from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerCo
...
@@ -26,6 +26,7 @@ from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerCo
from
paddle.fluid.incubate.fleet.base.fleet_base
import
DistributedOptimizer
from
paddle.fluid.incubate.fleet.base.fleet_base
import
DistributedOptimizer
from
paddle.fluid.incubate.fleet.base.fleet_base
import
Fleet
from
paddle.fluid.incubate.fleet.base.fleet_base
import
Fleet
from
paddle.fluid.incubate.fleet.base.fleet_base
import
Mode
from
paddle.fluid.incubate.fleet.base.fleet_base
import
Mode
from
paddle.fluid.incubate.fleet.base.role_maker
import
MPISymetricRoleMaker
class
DistributedTranspiler
(
Fleet
):
class
DistributedTranspiler
(
Fleet
):
...
@@ -52,6 +53,13 @@ class DistributedTranspiler(Fleet):
...
@@ -52,6 +53,13 @@ class DistributedTranspiler(Fleet):
Returns:
Returns:
None
None
"""
"""
# if MPISymetricRoleMaker is defined
# we suppose a user wants to submit job on mpi cluster
if
isinstance
(
self
.
_role_maker
,
MPISymetricRoleMaker
):
# check whether server has been initialized
from
paddle.fluid.transpiler.details.checkport
import
wait_server_ready
wait_server_ready
(
fleet
.
server_endpoints
(
to_string
=
False
))
if
not
self
.
_transpile_config
.
sync_mode
:
if
not
self
.
_transpile_config
.
sync_mode
:
self
.
_communicator
=
Communicator
(
self
.
main_program
)
self
.
_communicator
=
Communicator
(
self
.
main_program
)
...
@@ -114,6 +122,9 @@ class DistributedTranspiler(Fleet):
...
@@ -114,6 +122,9 @@ class DistributedTranspiler(Fleet):
self
.
_communicator
.
stop
()
self
.
_communicator
.
stop
()
self
.
_executor
.
close
()
self
.
_executor
.
close
()
if
isinstance
(
self
.
_role_maker
,
MPISymetricRoleMaker
):
self
.
_role_maker
.
_finalize
()
def
distributed_optimizer
(
self
,
optimizer
,
strategy
=
None
):
def
distributed_optimizer
(
self
,
optimizer
,
strategy
=
None
):
"""
"""
Optimizer for distributed training.
Optimizer for distributed training.
...
@@ -199,13 +210,24 @@ class DistributedTranspiler(Fleet):
...
@@ -199,13 +210,24 @@ class DistributedTranspiler(Fleet):
self
.
_transpile_config
=
config
self
.
_transpile_config
=
config
self
.
_transpiler
=
OriginTranspiler
(
config
)
self
.
_transpiler
=
OriginTranspiler
(
config
)
print
(
"server endpoints"
)
print
(
fleet
.
server_endpoints
(
to_string
=
True
))
print
(
"worker index: %d"
%
fleet
.
worker_index
())
print
(
"worker num: %d"
%
fleet
.
worker_num
())
if
self
.
is_worker
():
if
self
.
is_worker
():
self
.
_transpiler
.
transpile
(
self
.
_transpiler
.
transpile
(
trainer_id
=
fleet
.
worker_index
(),
trainer_id
=
fleet
.
worker_index
(),
pservers
=
fleet
.
server_endpoints
(
to_string
=
True
),
pservers
=
fleet
.
server_endpoints
(
to_string
=
True
),
trainers
=
fleet
.
worker_num
(),
trainers
=
fleet
.
worker_num
(),
sync_mode
=
config
.
sync_mode
)
sync_mode
=
config
.
sync_mode
)
self
.
main_program
=
self
.
_transpiler
.
get_trainer_program
()
wait_port
=
True
if
isinstance
(
self
.
_role_maker
,
MPISymetricRoleMaker
):
wait_port
=
False
self
.
main_program
=
self
.
_transpiler
.
get_trainer_program
(
wait_port
=
wait_port
)
self
.
startup_program
=
default_startup_program
()
self
.
startup_program
=
default_startup_program
()
else
:
else
:
self
.
_transpiler
.
transpile
(
self
.
_transpiler
.
transpile
(
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录