From 905e2346acb72c9bd8c0d955473141bc5e02107e Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Tue, 22 Sep 2020 15:23:10 +0800 Subject: [PATCH] add endpoints log;test=develop (#27439) --- python/paddle/distributed/fleet/launch.py | 5 ++--- python/paddle/distributed/fleet/launch_utils.py | 8 ++++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 17fa0a0c7c..d63c9f9184 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -463,9 +463,8 @@ def launch(): cuda_device_num = 0 if len(has_ps_args) > 0 or cuda_device_num == 0: - logger.info( - "Run parameter-sever cpu mode. pserver arguments:{}, cuda count:{}". - format(has_ps_args, cuda_device_num)) + logger.info("Run parameter-sever cpu mode. pserver arguments:{}".format( + has_ps_args)) launch_ps(args) elif len(has_collective_args) > 0: logger.info("Run collective gpu mode. gpu arguments:{}, cuda count:{}". diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 17d3b96cf4..7540cd9f4c 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -435,9 +435,17 @@ def start_local_trainers(cluster, len(pod.trainers), pretty_print_envs(proc_env, ("Distributed Envs", "Value")))) + logger.info( + "details abouts PADDLE_TRAINER_ENDPOINTS can be found in {}/endpoints.log.". + format(log_dir)) fn = None if log_dir is not None: os.system("mkdir -p {}".format(log_dir)) + if os.path.exists("%s/endpoints.log" % log_dir): + os.system("rm -f {}/endpoints.log".format(log_dir)) + with open("%s/endpoints.log" % log_dir, "w") as f: + f.write("PADDLE_TRAINER_ENDPOINTS: \n") + f.write("\n".join(cluster.trainers_endpoints())) fn = open("%s/workerlog.%d" % (log_dir, idx), "a") proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn) else: -- GitLab