Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
5132f512
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
5132f512
编写于
9月 29, 2020
作者:
L
lilong12
提交者:
GitHub
9月 29, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
terminate http server used by gloo for fleet after init (#27698)
上级
8a4f85fe
变更
1
显示空白变更内容
内联
并排
Showing
1 changed file
with
25 addition
and
10 deletion
+25
-10
python/paddle/distributed/fleet/base/role_maker.py
python/paddle/distributed/fleet/base/role_maker.py
+25
-10
未找到文件。
python/paddle/distributed/fleet/base/role_maker.py
浏览文件 @
5132f512
...
...
@@ -78,10 +78,10 @@ class Gloo(object):
self
.
_worker_num
=
worker_num
self
.
_server_num
=
server_num
self
.
_need_init_all
=
need_init_all
self
.
_start_http_server
=
kwargs
.
get
(
"start_http_server"
,
False
)
self
.
_iface
=
""
self
.
_prefix
=
kwargs
.
get
(
"store.prefix"
,
""
)
http_server
=
None
if
self
.
_rendezvous
==
Gloo
.
RENDEZVOUS
.
HDFS
:
dfs_name
=
kwargs
.
get
(
"dfs.name"
,
""
)
dfs_ugi
=
kwargs
.
get
(
"dfs.ugi"
,
""
)
...
...
@@ -101,17 +101,18 @@ class Gloo(object):
elif
self
.
_rendezvous
==
Gloo
.
RENDEZVOUS
.
HTTP
:
ip
=
kwargs
.
get
(
"http.host"
,
""
)
port
=
kwargs
.
get
(
"http.port"
,
""
)
start_http_server
=
kwargs
.
get
(
"start_http_server"
,
False
)
http_server_d
=
kwargs
.
get
(
"http_server_d"
)
if
not
ip
or
not
port
:
raise
ValueError
(
self
.
_err_type
)
self
.
_init_http
(
ip
,
port
,
self
.
_prefix
,
self
.
_start_http_server
)
ep
=
":"
.
join
([
ip
,
port
])
wait_server_ready
([
ep
])
http_server
=
self
.
_init_http
(
ip
,
port
,
self
.
_prefix
,
start_http_server
,
http_server_d
)
else
:
raise
ValueError
(
self
.
_err_type
)
self
.
_is_initialized
=
True
self
.
_http_server
=
http_server
def
_init_fs
(
self
,
fs_path
,
prefix
):
def
init
(
rank
,
nodes
,
role
):
...
...
@@ -167,7 +168,7 @@ class Gloo(object):
gloo
=
init
(
rank
,
nodes
,
"ALL"
)
self
.
_nodes_comm
=
gloo
def
_init_http
(
self
,
ip
,
port
,
prefix
,
start_http_server
):
def
_init_http
(
self
,
ip
,
port
,
prefix
,
start_http_server
,
http_server_d
):
def
__start_kv_server
(
http_server_d
,
size_d
):
from
paddle.distributed.fleet.utils.http_server
import
KVServer
http_server
=
KVServer
(
port
,
size_d
)
...
...
@@ -177,21 +178,22 @@ class Gloo(object):
time
.
sleep
(
wait_seconds
)
http_server
.
stop
()
def
init_kv_server
():
def
init_kv_server
(
http_server_d
):
size_d
=
{
"trainer"
:
self
.
_worker_num
,
"pserver"
:
self
.
_server_num
,
"all"
:
self
.
_worker_num
+
self
.
_server_num
}
_http_server_d
=
{
"running"
:
True
}
http_server_d
[
"running"
]
=
True
# child process for http server
_http_server
=
Process
(
target
=
__start_kv_server
,
args
=
(
_
http_server_d
,
size_d
))
target
=
__start_kv_server
,
args
=
(
http_server_d
,
size_d
))
_http_server
.
daemon
=
True
# set running status to True
# start child process
_http_server
.
start
()
return
_http_server
def
init
(
rank
,
nodes
,
role
):
gloo
=
fluid
.
core
.
Gloo
()
...
...
@@ -202,12 +204,15 @@ class Gloo(object):
gloo
.
set_timeout_seconds
(
self
.
_init_timeout_seconds
,
self
.
_run_timeout_seconds
)
gloo
.
set_http_store
(
ip
,
port
,
role
)
ep
=
":"
.
join
([
ip
,
str
(
port
)])
wait_server_ready
([
ep
])
gloo
.
init
()
return
gloo
port
=
int
(
port
)
if
start_http_server
:
init_kv_server
(
)
http_server
=
init_kv_server
(
http_server_d
)
if
self
.
_role
==
Role
.
WORKER
:
rank
,
nodes
=
self
.
_get_rank_nodes
(
Role
.
WORKER
)
...
...
@@ -222,6 +227,9 @@ class Gloo(object):
rank
,
nodes
=
self
.
_get_rank_nodes
(
Role
.
ALL
)
gloo
=
init
(
rank
,
nodes
,
"ALL"
)
self
.
_nodes_comm
=
gloo
if
start_http_server
:
http_server_d
[
"running"
]
=
False
http_server
.
join
()
def
_get_rank_nodes
(
self
,
role
):
nodes
=
0
...
...
@@ -804,6 +812,9 @@ class PaddleCloudRoleMaker(RoleMakerBase):
}
elif
rendezvous_type
==
Gloo
.
RENDEZVOUS
.
HTTP
:
start_http_server
=
False
manager
=
Manager
()
http_server_d
=
manager
.
dict
()
http_server_d
[
"running"
]
=
False
if
self
.
_is_collective
:
ep_rank_0
=
self
.
_worker_endpoints
[
0
]
if
self
.
_is_first_worker
():
...
...
@@ -818,6 +829,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
"http.port"
:
port
,
"store.prefix"
:
prefix
,
'start_http_server'
:
start_http_server
,
'http_server_d'
:
http_server_d
,
}
else
:
dfs_path
=
os
.
getenv
(
"PADDLE_GLOO_FS_PATH"
,
""
)
...
...
@@ -844,6 +856,9 @@ class PaddleCloudRoleMaker(RoleMakerBase):
need_init_all
=
need_init_all
,
kwargs
=
kwargs
)
if
rendezvous_type
==
Gloo
.
RENDEZVOUS
.
HTTP
:
http_server_d
[
'running'
]
=
False
def
_generate_role
(
self
):
"""
generate role for role maker
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录