未验证 提交 84eb6757 编写于 作者: K kuizhiqing 提交者: GitHub

kill all procs on exiting (#34741)

上级 8a6aa596
...@@ -279,14 +279,20 @@ def launch_collective(args): ...@@ -279,14 +279,20 @@ def launch_collective(args):
print("launch proc_id:{} idx:{}".format(proc.proc.pid, idx)) print("launch proc_id:{} idx:{}".format(proc.proc.pid, idx))
while True: while True:
alive = watch_local_trainers(procs, cluster.trainers_nranks()) try:
alive = watch_local_trainers(procs, cluster.trainers_nranks())
if not alive: if not alive:
logger.info("Local processes completed.") logger.info("Local processes completed.")
logger.debug("POD info:{}".format(pod)) logger.debug("POD info:{}".format(pod))
break break
time.sleep(3) time.sleep(3)
except:
logger.warning("Terminating... exit")
terminate_local_procs(procs)
exit(1)
if os.path.exists(gloo_rendezvous_dir): if os.path.exists(gloo_rendezvous_dir):
shutil.rmtree(gloo_rendezvous_dir) shutil.rmtree(gloo_rendezvous_dir)
......
...@@ -307,6 +307,17 @@ def get_cluster(node_ips, node_ip, trainer_endpoints, device_mode, ...@@ -307,6 +307,17 @@ def get_cluster(node_ips, node_ip, trainer_endpoints, device_mode,
def terminate_local_procs(procs): def terminate_local_procs(procs):
# try to terminate processes by group; this happens in the multiprocess scenario in the user process
if os.name != 'nt':
for p in procs:
if p.proc.poll() is None:
os.killpg(os.getpgid(p.proc.pid), signal.SIGTERM)
if p.log_fn:
p.log_fn.close()
logger.info("terminate process group gid:{}".format(p.proc.pid))
time.sleep(1)
for p in procs: for p in procs:
if p.proc.poll() is None: if p.proc.poll() is None:
p.proc.terminate() p.proc.terminate()
...@@ -583,19 +594,19 @@ def watch_local_trainers(procs, nranks): ...@@ -583,19 +594,19 @@ def watch_local_trainers(procs, nranks):
except KeyboardInterrupt: except KeyboardInterrupt:
logger.warning("KeyboardInterrupt, exit") logger.warning("KeyboardInterrupt, exit")
terminate_local_procs(procs) terminate_local_procs(procs)
raise return
except SystemExit: except SystemExit:
logger.error( logger.error(
"ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.". "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
format(nranks, error_rank)) format(nranks, error_rank))
terminate_local_procs(procs) terminate_local_procs(procs)
raise return
except: except:
logger.error( logger.error(
"ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.". "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
format(nranks, error_rank)) format(nranks, error_rank))
terminate_local_procs(procs) terminate_local_procs(procs)
raise return
return alive return alive
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册