From f5262865c0bf994325c710631785d15117b0fe7a Mon Sep 17 00:00:00 2001
From: danleifeng <52735331+danleifeng@users.noreply.github.com>
Date: Tue, 14 Jan 2020 19:25:05 +0800
Subject: [PATCH] change select_gpus into absolute values in launch.py (#22031)

---
 python/paddle/distributed/launch.py                | 17 ++++++++++++++++-
 .../paddle/fluid/tests/unittests/test_launch.sh    |  5 +++--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py
index 3e2f2e59f91..1e99da4615a 100644
--- a/python/paddle/distributed/launch.py
+++ b/python/paddle/distributed/launch.py
@@ -184,7 +184,22 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
         gpus_num = fluid.core.get_cuda_device_count()
         selected_gpus = [str(x) for x in range(0, gpus_num)]
     else:
-        selected_gpus = [x.strip() for x in args.selected_gpus.split(',')]
+        cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
+        if cuda_visible_devices is None or cuda_visible_devices == "":
+            selected_gpus = [x.strip() for x in args.selected_gpus.split(',')]
+        else:
+            # change selected_gpus into relative values
+            # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
+            # therefore selected_gpus=0,1,2,3
+            cuda_visible_devices_list = cuda_visible_devices.split(',')
+            for x in args.selected_gpus.split(','):
+                assert x in cuda_visible_devices_list, "Can't find "\
+                    "your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
+                    % (x, cuda_visible_devices)
+            selected_gpus = [
+                cuda_visible_devices_list.index(x.strip())
+                for x in args.selected_gpus.split(',')
+            ]
     selected_gpus_num = len(selected_gpus)
 
     trainers_endpoints = ""
diff --git a/python/paddle/fluid/tests/unittests/test_launch.sh b/python/paddle/fluid/tests/unittests/test_launch.sh
index d3b8d34e49c..f561624d3f2 100644
--- a/python/paddle/fluid/tests/unittests/test_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_launch.sh
@@ -13,7 +13,7 @@ export PADDLE_TRAINER_ID=0
 
 distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
-python -m paddle.distributed.launch ${distributed_args} multi_process.py
+CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} multi_process.py
 
 str1="selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"
 str2="selected_gpus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6171 trainer_id:1"
 
@@ -45,7 +45,8 @@ fi
 
 echo ""
 echo "paddle.distributed.launch async poll process test"
-if ! python -m paddle.distributed.launch ${distributed_args} multi_process.py abort; then
+nvidia-smi
+if ! CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} multi_process.py abort; then
     echo "train abort as planned"
 fi
-- 
GitLab
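
The core of the launch.py change is a remapping from the absolute GPU ids the user passes via --selected_gpus to indices relative to CUDA_VISIBLE_DEVICES, since CUDA renumbers the visible devices starting from 0. Below is a minimal, self-contained sketch of that remapping for illustration: the helper name map_selected_gpus is hypothetical and not part of the patch, and the sketch normalizes both branches to lists of strings, whereas the patch's masked branch yields integer indices.

import os


def map_selected_gpus(selected_gpus_arg):
    """Map selected GPU ids to positions within CUDA_VISIBLE_DEVICES."""
    requested = [x.strip() for x in selected_gpus_arg.split(',')]
    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
    if not cuda_visible_devices:
        # No device mask in effect: the ids are already usable as-is.
        return requested
    visible = [x.strip() for x in cuda_visible_devices.split(',')]
    for gpu in requested:
        assert gpu in visible, \
            "Can't find your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]." \
            % (gpu, cuda_visible_devices)
    # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7 and selected_gpus=4,5,6,7
    # map to the relative indices 0,1,2,3.
    return [str(visible.index(gpu)) for gpu in requested]


if __name__ == "__main__":
    os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
    print(map_selected_gpus("4,5,6,7"))  # ['0', '1', '2', '3']

This is also why the test script now prefixes each launch command with CUDA_VISIBLE_DEVICES=0,1: it pins a known device mask so that --selected_gpus=0,1 resolves deterministically against it.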