Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
8d99dd0c
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
8d99dd0c
编写于
11月 10, 2022
作者:
L
LiYuRio
提交者:
GitHub
11月 10, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
remove the hang checkness (#47806)
上级
594bd723
变更
2
显示空白变更内容
内联
并排
Showing
2 changed file
with
20 addition
and
43 deletion
+20
-43
paddle/fluid/distributed/store/tcp_store.cc
paddle/fluid/distributed/store/tcp_store.cc
+19
-39
paddle/fluid/distributed/store/tcp_store.h
paddle/fluid/distributed/store/tcp_store.h
+1
-4
未找到文件。
paddle/fluid/distributed/store/tcp_store.cc
浏览文件 @
8d99dd0c
...
...
@@ -95,19 +95,6 @@ void MasterDaemon::_do_get(SocketType socket) {
tcputils
::
send_vector
<
uint8_t
>
(
socket
,
value
);
}
void
MasterDaemon
::
_do_stop
(
SocketType
socket
)
{
VLOG
(
4
)
<<
"MasterDaemon::_do_stop "
<<
GetSockName
(
socket
);
if
(
!
_has_stop
)
{
_stop_time
=
std
::
chrono
::
system_clock
::
now
();
}
_has_stop
=
true
;
ReplyType
value
=
ReplyType
::
STOP_WAIT
;
tcputils
::
send_value
<
ReplyType
>
(
socket
,
value
);
if
(
--
_nranks
==
0
)
{
_stop
=
true
;
}
}
#ifndef _WIN32
void
MasterDaemon
::
InitControlFd
()
{
PADDLE_ENFORCE_NE
(
...
...
@@ -135,9 +122,13 @@ void MasterDaemon::StopByControlFd() {
}
}
#else
void
MasterDaemon
::
InitControlFd
()
{}
void
MasterDaemon
::
CloseControlFd
()
{}
void
MasterDaemon
::
StopByControlFd
()
{}
void
MasterDaemon
::
InitControlFd
()
{
ghStopEvent_
=
CreateEvent
(
NULL
,
TRUE
,
FALSE
,
NULL
);
PADDLE_ENFORCE
(
ghStopEvent_
,
platform
::
errors
::
Fatal
(
"failed to cread control pipe"
));
}
void
MasterDaemon
::
CloseControlFd
()
{
CloseHandle
(
ghStopEvent_
);
}
void
MasterDaemon
::
StopByControlFd
()
{
SetEvent
(
ghStopEvent_
);
}
#endif
void
MasterDaemon
::
_do_wait
(
SocketType
socket
)
{
...
...
@@ -186,9 +177,6 @@ void MasterDaemon::ProcessCommands(std::vector<struct pollfd>* p_fds) {
case
Command
::
WAIT
:
_do_wait
(
fds
[
i
].
fd
);
break
;
case
Command
::
STOP
:
_do_stop
(
fds
[
i
].
fd
);
break
;
default:
LOG
(
WARNING
)
<<
"Unknown command: "
<<
static_cast
<
int
>
(
command
)
<<
" from addr info:"
<<
GetSockName
(
fds
[
i
].
fd
);
...
...
@@ -208,7 +196,6 @@ void MasterDaemon::ProcessCommands(std::vector<struct pollfd>* p_fds) {
}
void
MasterDaemon
::
run
()
{
VLOG
(
4
)
<<
"begin to run run _stop:"
<<
_stop
<<
" _has_stop:"
<<
_has_stop
;
std
::
vector
<
struct
pollfd
>
fds
;
#ifdef _WIN32
fds
.
push_back
({
_listen_socket
,
POLLIN
});
...
...
@@ -218,23 +205,8 @@ void MasterDaemon::run() {
{.
fd
=
_control_fd
[
0
],
.
events
=
POLLIN
|
POLLHUP
,
.
revents
=
0
});
#endif
while
(
!
_stop
)
{
auto
end_time
=
std
::
chrono
::
system_clock
::
now
();
if
(
_has_stop
)
{
std
::
chrono
::
duration
<
double
>
diff
=
end_time
-
_stop_time
;
int
elapsed_seconds
=
static_cast
<
int
>
(
diff
.
count
());
PADDLE_ENFORCE_LT
(
elapsed_seconds
,
_timeout
,
platform
::
errors
::
Fatal
(
"%d seconds elapsed after the first worker "
"stopped, so we think there may be something wrong and will "
"stop the master worker. You can use "
"'export FLAGS_stop_check_timeout=3600'"
" to change the timeout value in seconds. The default one is 900"
,
elapsed_seconds
));
}
bool
finished
=
false
;
while
(
!
finished
)
{
for
(
size_t
i
=
0
;
i
<
fds
.
size
();
i
++
)
{
fds
[
i
].
revents
=
0
;
}
...
...
@@ -242,7 +214,15 @@ void MasterDaemon::run() {
VLOG
(
9
)
<<
"begin to poll fds_size:"
<<
paddle
::
string
::
Sprintf
(
"%d"
,
fds
.
size
());
#ifdef _WIN32
::
WSAPoll
(
fds
.
data
(),
fds
.
size
(),
INFTIME
);
int
res
=
::
WSAPoll
(
fds
.
data
(),
fds
.
size
(),
INFTIME
);
if
(
res
==
0
)
{
auto
rv
=
WaitForSingleObject
(
ghStopEvent_
,
0
);
if
(
rv
!=
WAIT_TIMEOUT
)
{
finished
=
true
;
break
;
}
continue
;
}
#else
::
poll
(
fds
.
data
(),
fds
.
size
(),
INFTIME
);
...
...
@@ -256,7 +236,7 @@ void MasterDaemon::run() {
}
VLOG
(
0
)
<<
"receive shutdown event and so quit from MasterDaemon run loop"
;
_stop
=
true
;
finished
=
true
;
break
;
}
#endif
...
...
paddle/fluid/distributed/store/tcp_store.h
浏览文件 @
8d99dd0c
...
...
@@ -60,21 +60,18 @@ class MasterDaemon {
void
_do_wait
(
SocketType
socket
);
void
_do_get
(
SocketType
socket
);
void
_do_set
(
SocketType
socket
);
void
_do_stop
(
SocketType
socket
);
SocketType
_listen_socket
;
std
::
vector
<
SocketType
>
_sockets
;
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
uint8_t
>>
_store
;
std
::
thread
_background_thread
{};
int
_nranks
=
-
1
;
int
_timeout
=
0
;
bool
_stop
=
false
;
// all workers stopped
std
::
chrono
::
time_point
<
std
::
chrono
::
system_clock
>
_stop_time
;
bool
_has_stop
=
false
;
// at least one worker stopped
void
InitControlFd
();
void
CloseControlFd
();
void
StopByControlFd
();
#ifdef _WIN32
HANDLE
ghStopEvent_
{};
#else
std
::
array
<
int
,
2
>
_control_fd
{{
-
1
,
-
1
}};
#endif
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录