diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py
index e7b6dfa6d608622043b72c8986601e8473974c34..b685723763833321bba1c70bec32cca52268d728 100644
--- a/python/paddle/distributed/launch.py
+++ b/python/paddle/distributed/launch.py
@@ -173,7 +173,11 @@ def start_procs(args):
             "PADDLE_CURRENT_ENDPOINT":
             "%s:%d" % (current_node_ip, args.started_port + i),
             "PADDLE_TRAINERS_NUM": "%d" % nranks,
-            "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints
+            "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints,
+            # paddle broadcasts ncclUniqueId over a socket, and a
+            # proxy may make trainers unreachable, so set them to ""
+            "http_proxy": "",
+            "https_proxy": ""
         })

         cmd = [sys.executable, "-u", args.training_script
@@ -182,6 +186,7 @@ def start_procs(args):
         cmds.append(cmd)

         if args.log_dir is not None:
+            os.system("mkdir -p {}".format(args.log_dir))
             fn = open("%s/workerlog.%d" % (args.log_dir, i), "w")
             log_fns.append(fn)
diff --git a/python/paddle/fluid/tests/unittests/test_launch.sh b/python/paddle/fluid/tests/unittests/test_launch.sh
index 7b849d022d2e609991b634c1dcb83a93b9003979..01b620d01dfc7f42682e4e027509ec4e8b9f4b46 100644
--- a/python/paddle/fluid/tests/unittests/test_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_launch.sh
@@ -8,7 +8,7 @@ python -m paddle.distributed.launch multi_process.py

 cluster_node_ips="127.0.0.1"
 node_ip="127.0.0.1"
-distributed_args="--cluster_node_ips ${cluster_node_ips} --node_ip ${node_ip} --selected_gpus=0,1"
+distributed_args="--cluster_node_ips ${cluster_node_ips} --node_ip ${node_ip} --selected_gpus=0,1 --log_dir testlog"
 python -m paddle.distributed.launch ${distributed_args} multi_process.py
 str1="selected_gpus:0 worker_endpoints:['127.0.0.1:6170', '127.0.0.1:6171'] trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0"
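
To make the intent of the launch.py hunks concrete, below is a minimal standalone sketch, not the PaddlePaddle launcher itself; the helper name spawn_worker and its arguments are made up for illustration. It clears http_proxy/https_proxy in the child environment so the socket-based ncclUniqueId broadcast between trainers is not routed through a proxy, and it creates the log directory before opening a per-worker log file. It uses os.makedirs(..., exist_ok=True) in place of the patch's os.system("mkdir -p ...") call, which has the same effect without shelling out.

import os
import subprocess
import sys


def spawn_worker(rank, training_script, log_dir=None):
    # Hypothetical helper, not part of the patch: it only mirrors the
    # intent of the launch.py changes above.
    env = dict(os.environ)
    # Clear proxy settings so the sockets used to broadcast ncclUniqueId
    # between trainers are not intercepted by an HTTP(S) proxy.
    env["http_proxy"] = ""
    env["https_proxy"] = ""

    stdout = None
    if log_dir is not None:
        # Same effect as the patch's `mkdir -p`, without a shell call.
        os.makedirs(log_dir, exist_ok=True)
        stdout = open(os.path.join(log_dir, "workerlog.%d" % rank), "w")

    cmd = [sys.executable, "-u", training_script]
    return subprocess.Popen(cmd, env=env, stdout=stdout)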