Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
1734bc6f
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
1734bc6f
编写于
8月 23, 2022
作者:
L
LiYuRio
提交者:
GitHub
8月 23, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add store_barrier to prevent master exit (#44964)
上级
eb4531f1
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
68 addition
and
13 deletion
+68
-13
paddle/fluid/distributed/store/tcp_store.cc
paddle/fluid/distributed/store/tcp_store.cc
+7
-5
python/paddle/distributed/collective.py
python/paddle/distributed/collective.py
+61
-8
未找到文件。
paddle/fluid/distributed/store/tcp_store.cc
浏览文件 @
1734bc6f
...
...
@@ -194,8 +194,8 @@ void MasterDaemon::ProcessCommands(std::vector<struct pollfd>* p_fds) {
<<
" from addr info:"
<<
GetSockName
(
fds
[
i
].
fd
);
}
}
catch
(
const
std
::
exception
&
ex
)
{
fds
.
erase
(
fds
.
begin
()
+
i
);
tcputils
::
close_socket
(
fds
[
i
].
fd
);
fds
.
erase
(
fds
.
begin
()
+
i
);
#ifdef _WIN32
_sockets
.
erase
(
_sockets
.
begin
()
+
i
-
1
);
#else
...
...
@@ -405,12 +405,14 @@ std::vector<uint8_t> TCPStore::get(const std::string& key) {
void
TCPStore
::
wait
(
const
std
::
string
&
key
)
{
ReplyType
reply
;
VLOG
(
3
)
<<
"TCPStore wait."
;
do
{
_client
->
send_command_for_key
(
Command
::
WAIT
,
_key_prefix
+
key
);
_client
->
send_command_for_key
(
Command
::
WAIT
,
_key_prefix
+
key
);
reply
=
_client
->
receive_value
<
ReplyType
>
();
while
(
reply
!=
ReplyType
::
STOP_WAIT
)
{
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
milliseconds
(
500
));
_client
->
send_command_for_key
(
Command
::
WAIT
,
_key_prefix
+
key
);
reply
=
_client
->
receive_value
<
ReplyType
>
();
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
milliseconds
(
500
));
}
while
(
reply
!=
ReplyType
::
STOP_WAIT
);
}
}
TCPStore
::~
TCPStore
()
{
VLOG
(
3
)
<<
"TCPStore destructure"
;
}
...
...
python/paddle/distributed/collective.py
浏览文件 @
1734bc6f
...
...
@@ -16,7 +16,8 @@ import numpy as np
import
os
import
pickle
import
io
from
datetime
import
timedelta
import
datetime
import
time
from
..fluid.layer_helper
import
LayerHelper
from
..fluid.framework
import
Variable
from
..fluid.framework
import
in_dygraph_mode
...
...
@@ -134,6 +135,7 @@ def _get_global_env():
# group map : the map of all group, 0 for GlobalGroup
# Dict[int, Group]
_group_map
=
{}
_global_env_gid
=
0
# group map by name : the map of all groups from their names
# Dict[name, Group]
...
...
@@ -149,6 +151,8 @@ _default_group_name = "_default_pg"
_valid_backend_list
=
[
'nccl'
,
'gloo'
,
'hccl'
,
'heter'
,
'xccl'
]
_default_store
=
None
# the default tcp store
_default_backend
=
None
_default_timeout
=
datetime
.
timedelta
(
seconds
=
1800
)
_start_ring_id
=
0
def
_set_default_backend
(
backend
):
...
...
@@ -163,16 +167,16 @@ def _set_default_store(store):
def
_get_group_map
():
global
_group_map
if
not
_group_map
:
if
_global_env_gid
not
in
_group_map
:
genv
=
_get_global_env
()
_group_map
[
0
]
=
Group
(
genv
.
rank
,
genv
.
world_size
,
ranks
=
list
(
range
(
genv
.
world_size
)))
_group_map
[
_global_env_gid
]
=
Group
(
genv
.
rank
,
genv
.
world_size
,
ranks
=
list
(
range
(
genv
.
world_size
)))
return
_group_map
def
_get_global_group
():
return
_get_group_map
()[
0
]
return
_get_group_map
()[
_global_env_gid
]
def
_get_group_map_by_name
():
...
...
@@ -206,7 +210,13 @@ def _set_group_map_backend(group, backend):
def
_new_ring_id
():
return
len
(
_get_group_map
())
+
max
(
_get_global_env
().
nrings
,
9
)
# NOTE(liyurui): For compatible reason, auto parallel and eager mode relay on previous syntax.
if
in_dygraph_mode
():
global
_start_ring_id
_start_ring_id
+=
1
return
_start_ring_id
+
max
(
_get_global_env
().
nrings
,
9
)
else
:
return
len
(
_get_group_map
())
+
max
(
_get_global_env
().
nrings
,
9
)
def
_get_reduce_op
(
reduce_op
,
func_name
):
...
...
@@ -293,6 +303,7 @@ def _new_process_group_impl(backend,
cluster_id
-
1
]
global_rank
=
cluster_offset
+
rank
global_world_size
=
cluster_size_cumsum
[
-
1
]
global_rank
,
global_world_size
=
_get_global_config
(
backend
,
rank
)
pg
=
core
.
ProcessGroupHeter
(
store
,
rank
=
global_rank
,
world_size
=
global_world_size
,
...
...
@@ -368,7 +379,43 @@ def _set_custom_gid(gid):
_custom_gid
=
gid
def
new_group
(
ranks
=
None
,
backend
=
None
):
def
_barrier_by_tcp_store
(
group_name
,
store
,
timeout
):
global_rank
=
paddle
.
distributed
.
get_rank
()
global_world_size
=
paddle
.
distributed
.
get_world_size
()
if
global_world_size
<
2
:
return
barrier_prefix
=
"Barrier/"
+
group_name
+
"/"
is_master
=
(
global_rank
==
0
)
def
_check_keys_ready
(
wait_keys
):
start_time
=
time
.
time
()
while
len
(
wait_keys
)
>
0
:
time
.
sleep
(
0.1
)
elapse_time
=
time
.
time
()
-
start_time
if
datetime
.
timedelta
(
seconds
=
elapse_time
)
>
timeout
:
raise
RuntimeError
(
"Timeout while initializing process group {}."
"Keys {} are not ready sinck rank {} is waiting them."
"Two reason may cause this error:
\n
1. The create process group api should be called by all ranks.
\n
"
" 2. Try to increase the waiting time.
\n
"
.
format
(
group_name
,
wait_keys
,
global_rank
))
wait_keys
=
list
(
filter
(
lambda
key
:
int
(
store
.
get
(
key
))
!=
1
,
wait_keys
))
# all the workers set their exiting key and exit
# the master will wait for all workers' exiting key, ensure to exit in the end
if
is_master
:
wait_keys
=
[
barrier_prefix
+
str
(
rank
)
for
rank
in
range
(
1
,
global_world_size
)
]
_check_keys_ready
(
wait_keys
)
else
:
store
.
add
(
barrier_prefix
+
str
(
global_rank
),
1
)
def
new_group
(
ranks
=
None
,
backend
=
None
,
timeout
=
_default_timeout
):
"""
Creates a new distributed communication group.
...
...
@@ -376,6 +423,7 @@ def new_group(ranks=None, backend=None):
Args:
ranks (list): The global ranks of group members.
backend (str): The backend used to create group, only nccl is supported now.
timeout (datetime.timedelta, optional): The waiting timeout for store relevant options, default is 30 minutes.
Returns:
Group: The group instance.
...
...
@@ -433,6 +481,11 @@ def new_group(ranks=None, backend=None):
# TODO(shenliang03): This is a temporary solution to solve the problem of
# hang caused by tcp
paddle
.
distributed
.
barrier
(
group
=
group
)
# NOTE(liyurui): All processors should hang and wait using tcp store, in case master exit before sub-group is created.
if
backend
!=
'heter'
:
_barrier_by_tcp_store
(
group_name
,
_default_store
,
timeout
)
else
:
print
(
"Warning: store barrier is not supported for heter backend."
)
return
group
if
not
backend
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录