Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
d8e1e50a
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
d8e1e50a
编写于
12月 08, 2020
作者:
L
lilong12
提交者:
GitHub
12月 08, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Cherry-pick] Fix bug in gloo that gloo initialization hangs (#29449)
* update, test=develop (#29331)
上级
49265879
变更
4
显示空白变更内容
内联
并排
Showing
4 changed files
with
21 additions
and
18 deletions
+21
-18
python/paddle/distributed/fleet/base/role_maker.py
python/paddle/distributed/fleet/base/role_maker.py
+6
-6
python/paddle/distributed/fleet/utils/http_server.py
python/paddle/distributed/fleet/utils/http_server.py
+4
-4
python/paddle/distributed/parallel.py
python/paddle/distributed/parallel.py
+7
-4
python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
.../paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
+4
-4
未找到文件。
python/paddle/distributed/fleet/base/role_maker.py
浏览文件 @
d8e1e50a
...
...
@@ -171,6 +171,7 @@ class Gloo(object):
def
_init_http
(
self
,
ip
,
port
,
prefix
,
start_http_server
,
http_server_d
):
def
__start_kv_server
(
http_server_d
,
size_d
):
print
(
"start http_server: {}, {}"
.
format
(
port
,
size_d
))
from
paddle.distributed.fleet.utils.http_server
import
KVServer
http_server
=
KVServer
(
port
,
size_d
)
http_server
.
start
()
...
...
@@ -181,11 +182,9 @@ class Gloo(object):
http_server
.
stop
()
def
init_kv_server
(
http_server_d
):
size_d
=
{
"trainer"
:
self
.
_worker_num
,
"pserver"
:
self
.
_server_num
,
"all"
:
self
.
_worker_num
+
self
.
_server_num
}
worker_key
=
prefix
+
'_'
+
'worker'
size_d
=
{
worker_key
:
self
.
_worker_num
,
}
print
(
"worker_key:{}, size: {}"
.
format
(
worker_key
,
size_d
))
http_server_d
[
"running"
]
=
True
# child process for http server
...
...
@@ -205,7 +204,7 @@ class Gloo(object):
gloo
.
set_iface
(
self
.
_iface
)
gloo
.
set_timeout_seconds
(
self
.
_init_timeout_seconds
,
self
.
_run_timeout_seconds
)
gloo
.
set_http_store
(
ip
,
port
,
role
)
gloo
.
set_http_store
(
ip
,
port
,
'worker'
)
ep
=
":"
.
join
([
ip
,
str
(
port
)])
wait_server_ready
([
ep
])
gloo
.
init
()
...
...
@@ -214,6 +213,7 @@ class Gloo(object):
port
=
int
(
port
)
if
start_http_server
:
print
(
"to start http_server"
)
http_server
=
init_kv_server
(
http_server_d
)
if
self
.
_role
==
Role
.
WORKER
:
...
...
python/paddle/distributed/fleet/utils/http_server.py
浏览文件 @
d8e1e50a
...
...
@@ -112,8 +112,8 @@ class KVHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
_
,
scope
,
key
=
paths
with
self
.
server
.
delete_kv_lock
:
if
self
.
server
.
delete_kv
.
get
(
scope
)
is
None
:
self
.
server
.
delete_kv
[
scope
]
=
[]
self
.
server
.
delete_kv
[
scope
].
append
(
key
)
self
.
server
.
delete_kv
[
scope
]
=
set
()
self
.
server
.
delete_kv
[
scope
].
add
(
key
)
self
.
send_status_code
(
200
)
_http_server_logger
.
info
(
log_str
)
...
...
@@ -151,7 +151,7 @@ class KVHTTPServer(HTTPServer, object):
"""
ret
=
0
with
self
.
delete_kv_lock
:
ret
=
self
.
delete_kv
.
get
(
key
,
0
)
ret
=
len
(
self
.
delete_kv
.
get
(
key
,
set
())
)
return
ret
...
...
@@ -164,7 +164,7 @@ class KVServer:
"""Init."""
self
.
http_server
=
KVHTTPServer
(
port
,
KVHandler
)
self
.
listen_thread
=
None
self
.
size
=
{}
self
.
size
=
size
def
start
(
self
):
"""
...
...
python/paddle/distributed/parallel.py
浏览文件 @
d8e1e50a
...
...
@@ -44,11 +44,11 @@ def _get_global_parallel_env():
return
_global_parallel_env
def
_start_kv_server
(
port
,
http_server_d
):
def
_start_kv_server
(
port
,
http_server_d
,
size
):
from
paddle.distributed.fleet.utils.http_server
import
KVServer
http_server
=
KVServer
(
int
(
port
))
http_server
=
KVServer
(
int
(
port
)
,
size
=
size
)
http_server
.
start
()
wait_seconds
=
5
wait_seconds
=
3
while
http_server_d
.
get
(
"running"
,
False
)
or
not
http_server
.
should_stop
():
time
.
sleep
(
wait_seconds
)
http_server
.
stop
()
...
...
@@ -149,8 +149,11 @@ def init_parallel_env():
http_server_d
=
manager
.
dict
()
http_server_d
[
"running"
]
=
False
if
parallel_env
.
rank
==
0
:
# The scope for worker used by http server is '_worker'
size
=
{
'_worker'
:
parallel_env
.
world_size
}
http_server
=
Process
(
target
=
_start_kv_server
,
args
=
(
int
(
ep_rank_0
[
1
]),
http_server_d
))
target
=
_start_kv_server
,
args
=
(
int
(
ep_rank_0
[
1
]),
http_server_d
,
size
))
http_server
.
daemon
=
True
http_server_d
[
"running"
]
=
True
http_server
.
start
()
...
...
python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
浏览文件 @
d8e1e50a
...
...
@@ -274,7 +274,7 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase):
print
(
"skip gloo UT on MacOS/Win"
)
return
os
.
environ
[
"TRAINING_ROLE"
]
=
"PSERVER"
os
.
environ
[
"TRAINING_ROLE"
]
=
"WORKER"
os
.
environ
[
"PADDLE_PSERVERS_IP_PORT_LIST"
]
=
"127.0.0.1:36001"
os
.
environ
[
"POD_IP"
]
=
"127.0.0.1"
os
.
environ
[
"PADDLE_PORT"
]
=
"36001"
...
...
@@ -284,7 +284,7 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase):
os
.
environ
[
"PADDLE_GLOO_RENDEZVOUS"
]
=
"3"
os
.
environ
[
"PADDLE_GLOO_HTTP_ENDPOINT"
]
=
"127.0.0.1:30019"
role
=
role_maker
.
PaddleCloudRoleMaker
()
role
=
role_maker
.
PaddleCloudRoleMaker
(
is_collecitve
=
True
)
role
.
_generate_role
()
import
time
time
.
sleep
(
3
)
...
...
@@ -532,7 +532,7 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase):
print
(
"skip gloo UT on MacOS/Win"
)
return
os
.
environ
[
"TRAINING_ROLE"
]
=
"PSERVER"
os
.
environ
[
"TRAINING_ROLE"
]
=
"WORKER"
os
.
environ
[
"PADDLE_PSERVERS_IP_PORT_LIST"
]
=
"127.0.0.1:36001"
os
.
environ
[
"POD_IP"
]
=
"127.0.0.1"
os
.
environ
[
"PADDLE_PORT"
]
=
"36001"
...
...
@@ -542,7 +542,7 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase):
os
.
environ
[
"PADDLE_GLOO_RENDEZVOUS"
]
=
"3"
os
.
environ
[
"PADDLE_GLOO_HTTP_ENDPOINT"
]
=
"127.0.0.1:30019"
role
=
role_maker
.
PaddleCloudRoleMaker
()
role
=
role_maker
.
PaddleCloudRoleMaker
(
is_collective
=
True
)
role
.
_generate_role
()
import
time
time
.
sleep
(
3
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录