From faf517b2b5589f3476b15d3f3eeedcaf38eab441 Mon Sep 17 00:00:00 2001 From: kuizhiqing Date: Tue, 25 Jan 2022 14:16:11 +0800 Subject: [PATCH] restore gloo (#39163) --- python/paddle/distributed/fleet/base/role_maker.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index b08753066ca..f0cf6573139 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -224,6 +224,14 @@ class Gloo(object): self._worker_comm = gloo # TODO (sandyhouse): initialize gloo for server and all + # the closing of kv server may cause gloo init failure + # since it depends on the full mesh connection + # e.g. 0 connected with 1,2,3 while 2-3 not connected yet + # TODO(kuizhiqing) + if start_http_server: + http_server_d["running"] = False + http_server.join() + def _get_rank_nodes(self, role): nodes = 0 rank = -1 -- GitLab