提交 f5262865 编写于 作者: D danleifeng 提交者: gongweibao

change select_gpus into absolute values in launch.py (#22031)

上级 2b619493
......@@ -184,7 +184,22 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
gpus_num = fluid.core.get_cuda_device_count()
selected_gpus = [str(x) for x in range(0, gpus_num)]
else:
selected_gpus = [x.strip() for x in args.selected_gpus.split(',')]
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
if cuda_visible_devices is None or cuda_visible_devices == "":
selected_gpus = [x.strip() for x in args.selected_gpus.split(',')]
else:
# Convert selected_gpus from absolute device IDs into indices relative to
# CUDA_VISIBLE_DEVICES (each ID's position in that list),
# e.g. CUDA_VISIBLE_DEVICES=4,5,6,7 and args.selected_gpus=4,5,6,7
# give selected_gpus=0,1,2,3.
cuda_visible_devices_list = cuda_visible_devices.split(',')
for x in args.selected_gpus.split(','):
assert x in cuda_visible_devices_list, "Can't find "\
"your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
% (x, cuda_visible_devices)
selected_gpus = [
cuda_visible_devices_list.index(x.strip())
for x in args.selected_gpus.split(',')
]
selected_gpus_num = len(selected_gpus)
trainers_endpoints = ""
......
......@@ -13,7 +13,7 @@ export PADDLE_TRAINER_ID=0
distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip}
--selected_gpus=0,1 --log_dir=testlog"
python -m paddle.distributed.launch ${distributed_args} multi_process.py
CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} multi_process.py
str1="selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6171 trainer_id:1"
......@@ -45,7 +45,8 @@ fi
echo ""
echo "paddle.distributed.launch async poll process test"
if ! python -m paddle.distributed.launch ${distributed_args} multi_process.py abort; then
nvidia-smi
if ! CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} multi_process.py abort; then
echo "train abort as planned"
fi
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册