Unset https_proxy and http_proxy in our launch.py (#17915)

f3e5a5cf · gongweibao · GitHub · 5df65e50 · f3e5a5cf · f3e5a5cf
隐藏空白更改
内联并排

Showing 2 changed files with 7 additions and 2 deletions

python/paddle/distributed/launch.py +6 -1

python/paddle/fluid/tests/unittests/test_launch.sh +1 -1

未找到文件。
--- a/python/paddle/distributed/launch.py
+++ b/python/paddle/distributed/launch.py
@@ -173,7 +173,11 @@ def start_procs(args):
            "PADDLE_CURRENT_ENDPOINT":
            "%s:%d" % (current_node_ip, args.started_port + i),
            "PADDLE_TRAINERS_NUM": "%d" % nranks,
-            "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints
+            "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints,
+            # paddle broadcast ncclUniqueId use socket, and
+            # proxy maybe make trainers unreachable, so set them to ""
+            "http_proxy": "",
+            "https_proxy": ""
        })

        cmd = [sys.executable, "-u", args.training_script
@@ -182,6 +186,7 @@ def start_procs(args):
        cmds.append(cmd)

        if args.log_dir is not None:
+            os.system("mkdir -p {}".format(args.log_dir))
            fn = open("%s/workerlog.%d" % (args.log_dir, i), "w")
            log_fns.append(fn)


--- a/python/paddle/fluid/tests/unittests/test_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_launch.sh
@@ -8,7 +8,7 @@ python -m paddle.distributed.launch multi_process.py
 cluster_node_ips="127.0.0.1"
 node_ip="127.0.0.1"

-distributed_args="--cluster_node_ips ${cluster_node_ips} --node_ip ${node_ip} --selected_gpus=0,1"
+distributed_args="--cluster_node_ips ${cluster_node_ips} --node_ip ${node_ip} --selected_gpus=0,1 --log_dir testlog"
 python -m paddle.distributed.launch ${distributed_args} multi_process.py

 str1="selected_gpus:0 worker_endpoints:['127.0.0.1:6170', '127.0.0.1:6171'] trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0"