未验证 提交 a8625aaf 编写于 作者: W WangXi 提交者: GitHub

fix wait server ready (#32889)

上级 dace3fd5
@@ -63,9 +63,9 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
 trainer_endpoints_env = ",".join(trainer_endpoints)
 trainers_num = self.role_maker._worker_num()
-# FIXME(wangxi): approve this.
-#if trainer_id == 0:
-# wait_server_ready(other_trainers)
+# NOTE(wangxi): npu don't need to wait server ready
+if trainer_id == 0 and not paddle.is_compiled_with_npu():
+wait_server_ready(other_trainers)
 if core.is_compiled_with_cuda():
 comm_id_var = startup_program.global_block().create_var(
......
@@ -80,15 +80,17 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
 cost_val = exe.run(feed=gen_data(), fetch_list=[avg_cost.name])
 print("cost of step[{}] = {}".format(i, cost_val))
-proc_a = launch_func(node_func, node_a)
-proc_a.start()
+# rank 1
+proc_b = launch_func(node_func, node_b)
+proc_b.start()
+# rank 0, for wait server ready coverage
 # just for coverage
-for key in node_b:
-os.environ[key] = node_b[key]
+for key in node_a:
+os.environ[key] = node_a[key]
 node_func()
-proc_a.join()
+proc_b.join()
 if __name__ == "__main__":
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册