From 84eb6757f8b0dab0d415897804e68fa543956893 Mon Sep 17 00:00:00 2001 From: kuizhiqing Date: Tue, 10 Aug 2021 10:57:23 +0800 Subject: [PATCH] kill all procs on exiting (#34741) --- python/paddle/distributed/fleet/launch.py | 18 ++++++++++++------ .../paddle/distributed/fleet/launch_utils.py | 17 ++++++++++++++--- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 7704a87e234..bc7942826e1 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -279,14 +279,20 @@ def launch_collective(args): print("launch proc_id:{} idx:{}".format(proc.proc.pid, idx)) while True: - alive = watch_local_trainers(procs, cluster.trainers_nranks()) + try: + alive = watch_local_trainers(procs, cluster.trainers_nranks()) - if not alive: - logger.info("Local processes completed.") - logger.debug("POD info:{}".format(pod)) - break + if not alive: + logger.info("Local processes completed.") + logger.debug("POD info:{}".format(pod)) + break - time.sleep(3) + time.sleep(3) + + except: + logger.warning("Terminating... 
exit") + terminate_local_procs(procs) + exit(1) if os.path.exists(gloo_rendezvous_dir): shutil.rmtree(gloo_rendezvous_dir) diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 6ead643df6c..e114670440c 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -307,6 +307,17 @@ def get_cluster(node_ips, node_ip, trainer_endpoints, device_mode, def terminate_local_procs(procs): + # try to terminate processes by group; this happens in the multiprocess scenario in the user process + if os.name != 'nt': + for p in procs: + if p.proc.poll() is None: + os.killpg(os.getpgid(p.proc.pid), signal.SIGTERM) + if p.log_fn: + p.log_fn.close() + logger.info("terminate process group gid:{}".format(p.proc.pid)) + + time.sleep(1) + for p in procs: if p.proc.poll() is None: p.proc.terminate() @@ -583,19 +594,19 @@ def watch_local_trainers(procs, nranks): except KeyboardInterrupt: logger.warning("KeyboardInterrupt, exit") terminate_local_procs(procs) - raise + return except SystemExit: logger.error( "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.". format(nranks, error_rank)) terminate_local_procs(procs) - raise + return except: logger.error( "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.". format(nranks, error_rank)) terminate_local_procs(procs) - raise + return return alive -- GitLab