diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index 17fa0a0c7c34744383d210a0fa409933d96dea21..d63c9f9184c0eb9aafec73df09b225d598f3413f 100644
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -463,9 +463,8 @@ def launch():
         cuda_device_num = 0
 
     if len(has_ps_args) > 0 or cuda_device_num == 0:
-        logger.info(
-            "Run parameter-sever cpu mode. pserver arguments:{}, cuda count:{}".
-            format(has_ps_args, cuda_device_num))
+        logger.info("Run parameter-server cpu mode. pserver arguments:{}".format(
+            has_ps_args))
         launch_ps(args)
     elif len(has_collective_args) > 0:
         logger.info("Run collective gpu mode. gpu arguments:{}, cuda count:{}".
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index 17d3b96cf4466e560381c20fe265b39cac6697f0..7540cd9f4c1f352804550561c6f75b63104f9381 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -435,9 +435,17 @@ def start_local_trainers(cluster,
                             len(pod.trainers),
                             pretty_print_envs(proc_env, ("Distributed Envs", "Value"))))
+            logger.info(
+                "details about PADDLE_TRAINER_ENDPOINTS can be found in {}/endpoints.log.".
+                format(log_dir))
 
         fn = None
         if log_dir is not None:
             os.system("mkdir -p {}".format(log_dir))
+            if os.path.exists("%s/endpoints.log" % log_dir):
+                os.system("rm -f {}/endpoints.log".format(log_dir))
+            with open("%s/endpoints.log" % log_dir, "w") as f:
+                f.write("PADDLE_TRAINER_ENDPOINTS: \n")
+                f.write("\n".join(cluster.trainers_endpoints()))
             fn = open("%s/workerlog.%d" % (log_dir, idx), "a")
             proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
         else: