Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
4ac9d64f
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
4ac9d64f
编写于
6月 30, 2022
作者:
K
kuizhiqing
提交者:
GitHub
6月 30, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix launch exit graceful (#43940)
上级
99a4ff8f
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
40 addition
and
16 deletion
+40
-16
python/paddle/distributed/launch/context/__init__.py
python/paddle/distributed/launch/context/__init__.py
+4
-0
python/paddle/distributed/launch/controllers/controller.py
python/paddle/distributed/launch/controllers/controller.py
+10
-8
python/paddle/distributed/launch/controllers/master.py
python/paddle/distributed/launch/controllers/master.py
+1
-1
python/paddle/distributed/launch/controllers/watcher.py
python/paddle/distributed/launch/controllers/watcher.py
+3
-1
python/paddle/distributed/launch/job/container.py
python/paddle/distributed/launch/job/container.py
+5
-1
python/paddle/distributed/launch/job/pod.py
python/paddle/distributed/launch/job/pod.py
+17
-5
未找到文件。
python/paddle/distributed/launch/context/__init__.py
浏览文件 @
4ac9d64f
...
...
@@ -76,6 +76,10 @@ class Context(object):
def
get_envs
(
self
):
return
self
.
envs
.
copy
()
def
set_envs
(
self
,
env
=
{}):
env
=
{
k
:
v
for
k
,
v
in
env
.
items
()
if
isinstance
(
v
,
str
)}
self
.
envs
.
update
(
env
)
def
_enable_plugin
(
self
):
for
pl
in
plugins
.
enabled_plugins
:
pl
(
self
)
...
...
python/paddle/distributed/launch/controllers/controller.py
浏览文件 @
4ac9d64f
...
...
@@ -49,6 +49,8 @@ class ControllerBase(object):
jid
=
self
.
ctx
.
args
.
job_id
)
self
.
pod
=
Pod
()
self
.
ctx
.
set_envs
({
"POD_NAME"
:
self
.
pod
.
name
})
self
.
join_server
=
None
def
deploy_pod
(
self
):
...
...
@@ -104,17 +106,18 @@ class ControllerBase(object):
self
.
ctx
.
logger
.
info
(
"Pod {}"
.
format
(
status
))
self
.
ctx
.
logger
.
error
(
"Container failed !!!
\n
{}"
.
format
(
fc
[
0
]))
fc
[
0
].
tail
()
self
.
pod
.
stop
()
if
self
.
ctx
.
args
.
elastic_level
<=
0
:
self
.
pod
.
stop
(
timeout
=
3
)
return
True
else
:
self
.
pod
.
stop
(
timeout
=
30
)
return
False
# peer failure
if
self
.
ctx
.
status
.
is_restarting
(
)
and
self
.
master
.
get_status
()
!=
self
.
ctx
.
status
.
COMPLETED
:
self
.
pod
.
stop
()
self
.
pod
.
stop
(
timeout
=
30
)
return
False
def
stop
(
self
,
sigint
=
None
):
...
...
@@ -123,7 +126,7 @@ class ControllerBase(object):
self
.
watcher
.
stop
()
self
.
master
.
stop
()
self
.
pod
.
stop
(
sigint
)
self
.
pod
.
stop
(
timeout
=
30
)
def
finalize
(
self
):
self
.
pod
.
join
()
...
...
@@ -133,17 +136,16 @@ class ControllerBase(object):
sys
.
exit
(
self
.
pod
.
exit_code
)
def
signal_handler
(
self
,
sigint
,
frame
):
self
.
ctx
.
logger
.
info
(
"Terminating with signal {}"
.
format
(
sigint
))
if
hasattr
(
self
,
'sigint'
):
self
.
ctx
.
logger
.
info
(
"Force quit in 10 seconds..."
)
time
.
sleep
(
11
)
self
.
pod
.
stop
(
timeout
=
10
)
sys
.
exit
(
sigint
)
self
.
ctx
.
logger
.
info
(
"Terminating with signal {}"
.
format
(
sigint
))
self
.
sigint
=
sigint
self
.
ctx
.
status
.
done
()
self
.
stop
(
sigint
)
time
.
sleep
(
1
)
self
.
stop
(
sigint
=
sigint
)
self
.
ctx
.
logger
.
info
(
"Exit with signal {}"
.
format
(
sigint
))
sys
.
exit
(
sigint
)
...
...
python/paddle/distributed/launch/controllers/master.py
浏览文件 @
4ac9d64f
...
...
@@ -316,5 +316,5 @@ class ETCDMaster(Master):
def
stop
(
self
):
if
hasattr
(
self
,
'beat_thread'
):
self
.
ctx
.
status
.
done
()
#
TODO(kuizhiqing) thread should exit
#
daemon thread
#self.beat_thread.join()
python/paddle/distributed/launch/controllers/watcher.py
浏览文件 @
4ac9d64f
...
...
@@ -93,4 +93,6 @@ class Watcher(object):
def
stop
(
self
):
if
hasattr
(
self
,
"proc"
):
self
.
proc
.
join
()
# daemon without join
# self.proc.join()
pass
python/paddle/distributed/launch/job/container.py
浏览文件 @
4ac9d64f
...
...
@@ -131,7 +131,11 @@ class Container(object):
return
self
.
_proc
.
terminate
(
force
)
def
wait
(
self
,
timeout
=
None
):
self
.
_proc
.
wait
(
timeout
)
try
:
self
.
_proc
.
wait
(
timeout
)
return
True
except
Exception
:
return
False
@
property
def
exit_code
(
self
):
...
...
python/paddle/distributed/launch/job/pod.py
浏览文件 @
4ac9d64f
...
...
@@ -116,14 +116,26 @@ class Pod(PodSepc):
self
.
_restart
+=
1
def
stop
(
self
,
sigint
=
0
):
def
stop
(
self
,
sigint
=
15
,
timeout
=
None
):
for
c
in
self
.
_containers
:
force
=
True
if
sigint
==
9
else
False
c
.
terminate
(
force
)
if
isinstance
(
sigint
,
int
)
and
timeout
is
None
:
c
.
send_signal
(
sigint
)
else
:
c
.
terminate
()
if
isinstance
(
timeout
,
int
):
if
not
self
.
join
(
timeout
):
for
c
in
self
.
_containers
:
c
.
terminate
(
force
=
True
)
return
False
else
:
return
True
def
join
(
self
):
def
join
(
self
,
timeout
=
None
):
for
c
in
self
.
_containers
:
c
.
wait
(
None
)
if
not
c
.
wait
(
timeout
):
return
False
return
True
@
property
def
status
(
self
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录