diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 3b55ec7ffce4df7e96034df067f667f2c807943f..511d501a2206e3ce106b7b76a5f3463b48353ff4 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -252,15 +252,12 @@ def get_cluster(node_ips, node_ip, paddle_ports, selected_gpus): def terminate_local_procs(procs): for p in procs: if p.proc.poll() is None: - # subprocess need to release resource(e.g. shared memory) - # use join to wait subprocess releasing - p.proc.join(timeout=1) + p.proc.terminate() p.log_fn.close() logger.debug("terminate process id:{}".format(p.proc.pid)) - # wait all process terminiated - # time.sleep(3) - + #wait all process terminiated + time.sleep(3) for step in range(0, 50): alive = False for p in procs: