未验证 提交 5132f512 编写于 作者: L lilong12 提交者: GitHub

terminate http server used by gloo for fleet after init (#27698)

上级 8a4f85fe
...@@ -78,10 +78,10 @@ class Gloo(object): ...@@ -78,10 +78,10 @@ class Gloo(object):
self._worker_num = worker_num self._worker_num = worker_num
self._server_num = server_num self._server_num = server_num
self._need_init_all = need_init_all self._need_init_all = need_init_all
self._start_http_server = kwargs.get("start_http_server", False)
self._iface = "" self._iface = ""
self._prefix = kwargs.get("store.prefix", "") self._prefix = kwargs.get("store.prefix", "")
http_server = None
if self._rendezvous == Gloo.RENDEZVOUS.HDFS: if self._rendezvous == Gloo.RENDEZVOUS.HDFS:
dfs_name = kwargs.get("dfs.name", "") dfs_name = kwargs.get("dfs.name", "")
dfs_ugi = kwargs.get("dfs.ugi", "") dfs_ugi = kwargs.get("dfs.ugi", "")
...@@ -101,17 +101,18 @@ class Gloo(object): ...@@ -101,17 +101,18 @@ class Gloo(object):
elif self._rendezvous == Gloo.RENDEZVOUS.HTTP: elif self._rendezvous == Gloo.RENDEZVOUS.HTTP:
ip = kwargs.get("http.host", "") ip = kwargs.get("http.host", "")
port = kwargs.get("http.port", "") port = kwargs.get("http.port", "")
start_http_server = kwargs.get("start_http_server", False)
http_server_d = kwargs.get("http_server_d")
if not ip or not port: if not ip or not port:
raise ValueError(self._err_type) raise ValueError(self._err_type)
self._init_http(ip, port, self._prefix, self._start_http_server) http_server = self._init_http(ip, port, self._prefix,
ep = ":".join([ip, port]) start_http_server, http_server_d)
wait_server_ready([ep])
else: else:
raise ValueError(self._err_type) raise ValueError(self._err_type)
self._is_initialized = True self._is_initialized = True
self._http_server = http_server
def _init_fs(self, fs_path, prefix): def _init_fs(self, fs_path, prefix):
def init(rank, nodes, role): def init(rank, nodes, role):
...@@ -167,7 +168,7 @@ class Gloo(object): ...@@ -167,7 +168,7 @@ class Gloo(object):
gloo = init(rank, nodes, "ALL") gloo = init(rank, nodes, "ALL")
self._nodes_comm = gloo self._nodes_comm = gloo
def _init_http(self, ip, port, prefix, start_http_server): def _init_http(self, ip, port, prefix, start_http_server, http_server_d):
def __start_kv_server(http_server_d, size_d): def __start_kv_server(http_server_d, size_d):
from paddle.distributed.fleet.utils.http_server import KVServer from paddle.distributed.fleet.utils.http_server import KVServer
http_server = KVServer(port, size_d) http_server = KVServer(port, size_d)
...@@ -177,21 +178,22 @@ class Gloo(object): ...@@ -177,21 +178,22 @@ class Gloo(object):
time.sleep(wait_seconds) time.sleep(wait_seconds)
http_server.stop() http_server.stop()
def init_kv_server(): def init_kv_server(http_server_d):
size_d = { size_d = {
"trainer": self._worker_num, "trainer": self._worker_num,
"pserver": self._server_num, "pserver": self._server_num,
"all": self._worker_num + self._server_num "all": self._worker_num + self._server_num
} }
_http_server_d = {"running": True} http_server_d["running"] = True
# child process for http server # child process for http server
_http_server = Process( _http_server = Process(
target=__start_kv_server, args=(_http_server_d, size_d)) target=__start_kv_server, args=(http_server_d, size_d))
_http_server.daemon = True _http_server.daemon = True
# set running status to True # set running status to True
# start child process # start child process
_http_server.start() _http_server.start()
return _http_server
def init(rank, nodes, role): def init(rank, nodes, role):
gloo = fluid.core.Gloo() gloo = fluid.core.Gloo()
...@@ -202,12 +204,15 @@ class Gloo(object): ...@@ -202,12 +204,15 @@ class Gloo(object):
gloo.set_timeout_seconds(self._init_timeout_seconds, gloo.set_timeout_seconds(self._init_timeout_seconds,
self._run_timeout_seconds) self._run_timeout_seconds)
gloo.set_http_store(ip, port, role) gloo.set_http_store(ip, port, role)
ep = ":".join([ip, str(port)])
wait_server_ready([ep])
gloo.init()
return gloo return gloo
port = int(port) port = int(port)
if start_http_server: if start_http_server:
init_kv_server() http_server = init_kv_server(http_server_d)
if self._role == Role.WORKER: if self._role == Role.WORKER:
rank, nodes = self._get_rank_nodes(Role.WORKER) rank, nodes = self._get_rank_nodes(Role.WORKER)
...@@ -222,6 +227,9 @@ class Gloo(object): ...@@ -222,6 +227,9 @@ class Gloo(object):
rank, nodes = self._get_rank_nodes(Role.ALL) rank, nodes = self._get_rank_nodes(Role.ALL)
gloo = init(rank, nodes, "ALL") gloo = init(rank, nodes, "ALL")
self._nodes_comm = gloo self._nodes_comm = gloo
if start_http_server:
http_server_d["running"] = False
http_server.join()
def _get_rank_nodes(self, role): def _get_rank_nodes(self, role):
nodes = 0 nodes = 0
...@@ -804,6 +812,9 @@ class PaddleCloudRoleMaker(RoleMakerBase): ...@@ -804,6 +812,9 @@ class PaddleCloudRoleMaker(RoleMakerBase):
} }
elif rendezvous_type == Gloo.RENDEZVOUS.HTTP: elif rendezvous_type == Gloo.RENDEZVOUS.HTTP:
start_http_server = False start_http_server = False
manager = Manager()
http_server_d = manager.dict()
http_server_d["running"] = False
if self._is_collective: if self._is_collective:
ep_rank_0 = self._worker_endpoints[0] ep_rank_0 = self._worker_endpoints[0]
if self._is_first_worker(): if self._is_first_worker():
...@@ -818,6 +829,7 @@ class PaddleCloudRoleMaker(RoleMakerBase): ...@@ -818,6 +829,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
"http.port": port, "http.port": port,
"store.prefix": prefix, "store.prefix": prefix,
'start_http_server': start_http_server, 'start_http_server': start_http_server,
'http_server_d': http_server_d,
} }
else: else:
dfs_path = os.getenv("PADDLE_GLOO_FS_PATH", "") dfs_path = os.getenv("PADDLE_GLOO_FS_PATH", "")
...@@ -844,6 +856,9 @@ class PaddleCloudRoleMaker(RoleMakerBase): ...@@ -844,6 +856,9 @@ class PaddleCloudRoleMaker(RoleMakerBase):
need_init_all=need_init_all, need_init_all=need_init_all,
kwargs=kwargs) kwargs=kwargs)
if rendezvous_type == Gloo.RENDEZVOUS.HTTP:
http_server_d['running'] = False
def _generate_role(self): def _generate_role(self):
""" """
generate role for role maker generate role for role maker
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册