fix distributed launch.py (#17571)

test=develop

fix distributed launch.py (#17571)
test=develop
ca03f498 · chengduo · GitHub · 6e11f977 · ca03f498
显示空白变更内容
内联并排

Showing 1 changed file with 2 additions and 2 deletions

python/paddle/distributed/launch.py python/paddle/distributed/launch.py +2 -2

未找到文件。
--- a/python/paddle/distributed/launch.py
+++ b/python/paddle/distributed/launch.py
@@ -75,11 +75,11 @@ def start_procs(gpus, entrypoint, entrypoint_args, log_dir):
    nranks = num_nodes * gpus
    # ======== for dist training =======
    gpu_ids = get_gpu_ids(gpus)
-    for i in gpu_ids:
+    for i in range(gpus):
        curr_env = {}
        curr_env.update(default_envs)
        curr_env.update({
-            "FLAGS_selected_gpus": "%d" % i,
+            "FLAGS_selected_gpus": "%d" % gpu_ids[i],
            "PADDLE_TRAINER_ID": "%d" % (node_trainer_id * gpus + i),
            "PADDLE_CURRENT_ENDPOINT": "%s:617%d" % (current_ip, i),
            # nranks