未验证 提交 5132f512 编写于 作者: L lilong12 提交者: GitHub

Terminate the HTTP server used by Gloo for fleet after init (#27698)

上级 8a4f85fe
......@@ -78,10 +78,10 @@ class Gloo(object):
self._worker_num = worker_num
self._server_num = server_num
self._need_init_all = need_init_all
self._start_http_server = kwargs.get("start_http_server", False)
self._iface = ""
self._prefix = kwargs.get("store.prefix", "")
http_server = None
if self._rendezvous == Gloo.RENDEZVOUS.HDFS:
dfs_name = kwargs.get("dfs.name", "")
dfs_ugi = kwargs.get("dfs.ugi", "")
......@@ -101,17 +101,18 @@ class Gloo(object):
elif self._rendezvous == Gloo.RENDEZVOUS.HTTP:
ip = kwargs.get("http.host", "")
port = kwargs.get("http.port", "")
start_http_server = kwargs.get("start_http_server", False)
http_server_d = kwargs.get("http_server_d")
if not ip or not port:
raise ValueError(self._err_type)
self._init_http(ip, port, self._prefix, self._start_http_server)
ep = ":".join([ip, port])
wait_server_ready([ep])
http_server = self._init_http(ip, port, self._prefix,
start_http_server, http_server_d)
else:
raise ValueError(self._err_type)
self._is_initialized = True
self._http_server = http_server
def _init_fs(self, fs_path, prefix):
def init(rank, nodes, role):
......@@ -167,7 +168,7 @@ class Gloo(object):
gloo = init(rank, nodes, "ALL")
self._nodes_comm = gloo
def _init_http(self, ip, port, prefix, start_http_server):
def _init_http(self, ip, port, prefix, start_http_server, http_server_d):
def __start_kv_server(http_server_d, size_d):
from paddle.distributed.fleet.utils.http_server import KVServer
http_server = KVServer(port, size_d)
......@@ -177,21 +178,22 @@ class Gloo(object):
time.sleep(wait_seconds)
http_server.stop()
def init_kv_server():
def init_kv_server(http_server_d):
size_d = {
"trainer": self._worker_num,
"pserver": self._server_num,
"all": self._worker_num + self._server_num
}
_http_server_d = {"running": True}
http_server_d["running"] = True
# child process for http server
_http_server = Process(
target=__start_kv_server, args=(_http_server_d, size_d))
target=__start_kv_server, args=(http_server_d, size_d))
_http_server.daemon = True
# set running status to True
# start child process
_http_server.start()
return _http_server
def init(rank, nodes, role):
gloo = fluid.core.Gloo()
......@@ -202,12 +204,15 @@ class Gloo(object):
gloo.set_timeout_seconds(self._init_timeout_seconds,
self._run_timeout_seconds)
gloo.set_http_store(ip, port, role)
ep = ":".join([ip, str(port)])
wait_server_ready([ep])
gloo.init()
return gloo
port = int(port)
if start_http_server:
init_kv_server()
http_server = init_kv_server(http_server_d)
if self._role == Role.WORKER:
rank, nodes = self._get_rank_nodes(Role.WORKER)
......@@ -222,6 +227,9 @@ class Gloo(object):
rank, nodes = self._get_rank_nodes(Role.ALL)
gloo = init(rank, nodes, "ALL")
self._nodes_comm = gloo
if start_http_server:
http_server_d["running"] = False
http_server.join()
def _get_rank_nodes(self, role):
nodes = 0
......@@ -804,6 +812,9 @@ class PaddleCloudRoleMaker(RoleMakerBase):
}
elif rendezvous_type == Gloo.RENDEZVOUS.HTTP:
start_http_server = False
manager = Manager()
http_server_d = manager.dict()
http_server_d["running"] = False
if self._is_collective:
ep_rank_0 = self._worker_endpoints[0]
if self._is_first_worker():
......@@ -818,6 +829,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
"http.port": port,
"store.prefix": prefix,
'start_http_server': start_http_server,
'http_server_d': http_server_d,
}
else:
dfs_path = os.getenv("PADDLE_GLOO_FS_PATH", "")
......@@ -844,6 +856,9 @@ class PaddleCloudRoleMaker(RoleMakerBase):
need_init_all=need_init_all,
kwargs=kwargs)
if rendezvous_type == Gloo.RENDEZVOUS.HTTP:
http_server_d['running'] = False
def _generate_role(self):
"""
generate role for role maker
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册