diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 7704a87e234f6b5e3e84e53062233ba737286a63..bc7942826e1eaaeb89dd5854acf8abaa710148fa 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -279,14 +279,20 @@ def launch_collective(args): print("launch proc_id:{} idx:{}".format(proc.proc.pid, idx)) while True: - alive = watch_local_trainers(procs, cluster.trainers_nranks()) + try: + alive = watch_local_trainers(procs, cluster.trainers_nranks()) - if not alive: - logger.info("Local processes completed.") - logger.debug("POD info:{}".format(pod)) - break + if not alive: + logger.info("Local processes completed.") + logger.debug("POD info:{}".format(pod)) + break - time.sleep(3) + time.sleep(3) + + except: + logger.warning("Terminating... exit") + terminate_local_procs(procs) + exit(1) if os.path.exists(gloo_rendezvous_dir): shutil.rmtree(gloo_rendezvous_dir) diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 6ead643df6c1b8c57588313dbb917f8d754d9f51..e114670440c0656cb83a9b471ba3e82eca848334 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -307,6 +307,17 @@ def get_cluster(node_ips, node_ip, trainer_endpoints, device_mode, def terminate_local_procs(procs): + # try to terminate the process by group; this happens in a multiprocess scenario in the user process + if os.name != 'nt': + for p in procs: + if p.proc.poll() is None: + os.killpg(os.getpgid(p.proc.pid), signal.SIGTERM) + if p.log_fn: + p.log_fn.close() + logger.info("terminate process group gid:{}".format(p.proc.pid)) + + time.sleep(1) + for p in procs: if p.proc.poll() is None: p.proc.terminate() @@ -583,19 +594,19 @@ def watch_local_trainers(procs, nranks): except KeyboardInterrupt: logger.warning("KeyboardInterrupt, exit") terminate_local_procs(procs) - raise + return except SystemExit:
logger.error( "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.". format(nranks, error_rank)) terminate_local_procs(procs) - raise + return except: logger.error( "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.". format(nranks, error_rank)) terminate_local_procs(procs) - raise + return return alive