Unverified commit a8625aaf, authored by WangXi and committed by GitHub

fix wait server ready (#32889)

Parent dace3fd5
@@ -63,9 +63,9 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
         trainer_endpoints_env = ",".join(trainer_endpoints)
         trainers_num = self.role_maker._worker_num()
 
-        # FIXME(wangxi): approve this.
-        #if trainer_id == 0:
-        #    wait_server_ready(other_trainers)
+        # NOTE(wangxi): npu don't need to wait server ready
+        if trainer_id == 0 and not paddle.is_compiled_with_npu():
+            wait_server_ready(other_trainers)
 
         if core.is_compiled_with_cuda():
             comm_id_var = startup_program.global_block().create_var(
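In plain terms, the new branch makes only trainer 0 block until every other trainer's endpoint is accepting connections, and skips that wait entirely on NPU builds. The snippet below is a minimal sketch of that kind of readiness probe, for illustration only: the function name, timeout, and retry interval are assumptions made here and are not Paddle's actual wait_server_ready implementation.

import socket
import time


def wait_endpoints_ready(endpoints, retry_interval=3):
    # Hypothetical readiness check, for illustration only; Paddle ships its
    # own wait_server_ready helper with its own timeouts and logging.
    not_ready = list(endpoints)
    while not_ready:
        still_waiting = []
        for ep in not_ready:
            ip, port = ep.split(":")
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                sock.settimeout(2)
                # connect_ex returns 0 once the peer's server side is listening.
                if sock.connect_ex((ip, int(port))) != 0:
                    still_waiting.append(ep)
        if still_waiting:
            print("waiting for endpoints:", still_waiting)
            time.sleep(retry_interval)
        not_ready = still_waiting


# Example usage (endpoints are made up):
# wait_endpoints_ready(["127.0.0.1:6170", "127.0.0.1:6171"])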
@@ -80,15 +80,17 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
                     cost_val = exe.run(feed=gen_data(), fetch_list=[avg_cost.name])
                     print("cost of step[{}] = {}".format(i, cost_val))
 
         proc_a = launch_func(node_func, node_a)
         proc_a.start()
+        # rank 1
         proc_b = launch_func(node_func, node_b)
         proc_b.start()
 
+        # rank 0, for wait server ready coverage
         # just for coverage
-        for key in node_b:
-            os.environ[key] = node_b[key]
+        for key in node_a:
+            os.environ[key] = node_a[key]
         node_func()
 
         proc_a.join()
         proc_b.join()
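The test now re-runs node_func in the main process under rank 0's environment (node_a instead of node_b) because coverage tooling typically tracks only the main process; without this, the newly gated wait_server_ready call would never appear in the report even though proc_a exercises it in a subprocess. The helper below is an illustrative sketch of what launch_func does, assuming the common pattern of exporting one rank's environment variables before starting the entry point in a child process; the real helper is defined elsewhere in the test file and may differ.

import multiprocessing
import os


def _run_with_env(func, env):
    # Export one rank's environment variables, then enter the training
    # function inside the child process.
    os.environ.update(env)
    func()


def launch_func(func, env):
    # Illustrative stand-in for the test helper: wrap `func` so that it
    # runs in a separate process with `env` applied first.
    return multiprocessing.Process(target=_run_with_env, args=(func, env))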