未验证 提交 4f1d827c 编写于 作者: S Stas Bekman 提交者: GitHub

[launcher] look ma, no more zombies (#714)

Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
上级 72b23ea3
...@@ -13,6 +13,8 @@ import subprocess ...@@ -13,6 +13,8 @@ import subprocess
import os import os
import json import json
import base64 import base64
import time
import signal
from collections import defaultdict from collections import defaultdict
from argparse import ArgumentParser, REMAINDER from argparse import ArgumentParser, REMAINDER
...@@ -122,11 +124,47 @@ def main(): ...@@ -122,11 +124,47 @@ def main():
args.training_script, args.training_script,
"--local_rank={}".format(local_rank) "--local_rank={}".format(local_rank)
] + args.training_script_args ] + args.training_script_args
sig_names = {2: "SIGINT", 15: "SIGTERM"}
last_return_code = None
def sigkill_handler(signum, frame):
    """Kill every launched subprocess, then terminate the launcher.

    Registered as the handler for SIGINT and SIGTERM, and also invoked
    directly by the polling loop when a child exits with a non-zero code.

    Args:
        signum: Number of the signal being handled; looked up in
            ``sig_names`` to report which signal was received.
        frame: Stack frame passed by the ``signal`` module; unused.

    Raises:
        subprocess.CalledProcessError: If ``last_return_code`` has been set
            (a child already failed), carrying that return code and ``cmd``.
        SystemExit: Otherwise, with exit status 1.
    """
    for process in processes:
        print(f"Killing subprocess {process.pid}")
        try:
            process.kill()
        except Exception as e:
            # Best effort: one child that cannot be killed must not stop us
            # from killing the rest, but the failure should not be silent.
            print(f"Failed to kill subprocess {process.pid}: {e}")
    if last_return_code is not None:
        # Surface the failing child's exit status to whoever ran the launcher.
        raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
    if signum in sig_names:
        print(f"Main process received {sig_names[signum]}, exiting")
    # NOTE(review): indentation was lost in this view; exiting unconditionally
    # matches the caller's "not coming back" expectation — confirm.
    sys.exit(1)
# pass SIGINT/SIGTERM to children if the parent is being terminated
signal.signal(signal.SIGINT, sigkill_handler)
signal.signal(signal.SIGTERM, sigkill_handler)
process = subprocess.Popen(cmd, env=current_env) process = subprocess.Popen(cmd, env=current_env)
processes.append(process) processes.append(process)
for process in processes: alive_processes = set(processes)
process.wait() while len(alive_processes):
finished_processes = []
for process in alive_processes:
if process.poll() is None:
# the process is still running
continue
else:
if process.returncode != 0:
last_return_code = process.returncode # for sigkill_handler
sigkill_handler(signal.SIGTERM, None) # not coming back
else:
# exited cleanly
finished_processes.append(process)
alive_processes = set(alive_processes) - set(finished_processes)
time.sleep(1)
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册