未验证 提交 041ef22c 编写于 作者: K kuizhiqing 提交者: GitHub

fix ips offset (#45219)

上级 b6a4db1d
...@@ -52,7 +52,8 @@ class CollectiveController(Controller): ...@@ -52,7 +52,8 @@ class CollectiveController(Controller):
self.ctx.logger.debug("job endpoints: {}".format(job_endpoints)) self.ctx.logger.debug("job endpoints: {}".format(job_endpoints))
rank_offset = ips.index( rank_offset = ips.index(
self.ctx.node.ip) if self.ctx.node.ip in ips else 0 self.ctx.node.ip
) * self.pod.replicas if self.ctx.node.ip in ips else 0
self.save_pod_log(job_endpoints) self.save_pod_log(job_endpoints)
...@@ -66,7 +67,7 @@ class CollectiveController(Controller): ...@@ -66,7 +67,7 @@ class CollectiveController(Controller):
"PADDLE_LOCAL_SIZE": "{}".format(self.pod.replicas), "PADDLE_LOCAL_SIZE": "{}".format(self.pod.replicas),
"PADDLE_GLOBAL_RANK": "{}".format(i + rank_offset), "PADDLE_GLOBAL_RANK": "{}".format(i + rank_offset),
"PADDLE_LOCAL_RANK": "{}".format(i), "PADDLE_LOCAL_RANK": "{}".format(i),
"PADDLE_NNODES": "{}".format(self.job.replicas), "PADDLE_NNODES": "{}".format(len(ips)),
## compatible env ## compatible env
"PADDLE_TRAINER_ENDPOINTS": ",".join(job_endpoints), "PADDLE_TRAINER_ENDPOINTS": ",".join(job_endpoints),
"PADDLE_CURRENT_ENDPOINT": job_endpoints[i + rank_offset], "PADDLE_CURRENT_ENDPOINT": job_endpoints[i + rank_offset],
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册