未验证 提交 84eb6757 编写于 作者: K kuizhiqing 提交者: GitHub

kill all procs on exiting (#34741)

上级 8a6aa596
......@@ -279,14 +279,20 @@ def launch_collective(args):
print("launch proc_id:{} idx:{}".format(proc.proc.pid, idx))
while True:
alive = watch_local_trainers(procs, cluster.trainers_nranks())
try:
alive = watch_local_trainers(procs, cluster.trainers_nranks())
if not alive:
logger.info("Local processes completed.")
logger.debug("POD info:{}".format(pod))
break
if not alive:
logger.info("Local processes completed.")
logger.debug("POD info:{}".format(pod))
break
time.sleep(3)
time.sleep(3)
except:
logger.warning("Terminating... exit")
terminate_local_procs(procs)
exit(1)
if os.path.exists(gloo_rendezvous_dir):
shutil.rmtree(gloo_rendezvous_dir)
......
......@@ -307,6 +307,17 @@ def get_cluster(node_ips, node_ip, trainer_endpoints, device_mode,
def terminate_local_procs(procs):
# try to terminate processes by group; this happens in the multi-process scenario in the user process
if os.name != 'nt':
for p in procs:
if p.proc.poll() is None:
os.killpg(os.getpgid(p.proc.pid), signal.SIGTERM)
if p.log_fn:
p.log_fn.close()
logger.info("terminate process group gid:{}".format(p.proc.pid))
time.sleep(1)
for p in procs:
if p.proc.poll() is None:
p.proc.terminate()
......@@ -583,19 +594,19 @@ def watch_local_trainers(procs, nranks):
except KeyboardInterrupt:
logger.warning("KeyboardInterrupt, exit")
terminate_local_procs(procs)
raise
return
except SystemExit:
logger.error(
"ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
format(nranks, error_rank))
terminate_local_procs(procs)
raise
return
except:
logger.error(
"ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
format(nranks, error_rank))
terminate_local_procs(procs)
raise
return
return alive
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册