提交 ca03f498(未验证) · 作者: chengduo · 提交者: GitHub

fix distributed launch.py (#17571)

test=develop
上级 6e11f977
@@ -75,11 +75,11 @@ def start_procs(gpus, entrypoint, entrypoint_args, log_dir):
     nranks = num_nodes * gpus
     # ======== for dist training =======
     gpu_ids = get_gpu_ids(gpus)
-    for i in gpu_ids:
+    for i in range(gpus):
         curr_env = {}
         curr_env.update(default_envs)
         curr_env.update({
-            "FLAGS_selected_gpus": "%d" % i,
+            "FLAGS_selected_gpus": "%d" % gpu_ids[i],
             "PADDLE_TRAINER_ID": "%d" % (node_trainer_id * gpus + i),
             "PADDLE_CURRENT_ENDPOINT": "%s:617%d" % (current_ip, i),
             # nranks
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册