From b0bd93de0002907d225b3395bda66048d8c749a1 Mon Sep 17 00:00:00 2001
From: lilong12
Date: Thu, 31 Dec 2020 14:33:35 +0800
Subject: [PATCH] Disable gloo by default (#29805)

* update, test=develop
---
 .../distributed/fleet/base/role_maker.py     |  9 +--
 python/paddle/distributed/fleet/launch.py    |  2 +-
 .../paddle/distributed/fleet/launch_utils.py |  6 +-
 python/paddle/distributed/parallel.py        | 65 ++++++++++---------
 .../unittests/test_collective_api_base.py    |  1 +
 5 files changed, 40 insertions(+), 43 deletions(-)

diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
index 2b9d2f4c27..a8683aea97 100644
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -220,15 +220,8 @@ class Gloo(object):
             rank, nodes = self._get_rank_nodes(Role.WORKER)
             gloo = init(rank, nodes, "WORKER")
             self._worker_comm = gloo
-        else:
-            rank, nodes = self._get_rank_nodes(Role.SERVER)
-            gloo = init(rank, nodes, "SERVER")
-            self._server_comm = gloo
+        # TODO (sandyhouse): initialize gloo for server and all
 
-        if self._need_init_all:
-            rank, nodes = self._get_rank_nodes(Role.ALL)
-            gloo = init(rank, nodes, "ALL")
-            self._nodes_comm = gloo
         if start_http_server:
             http_server_d["running"] = False
             http_server.join()
diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index a7490f770d..afc352f89c 100644
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -219,7 +219,7 @@ def launch_collective(args):
     global_envs = copy.copy(os.environ.copy())
     gloo_rendezvous_dir = tempfile.mkdtemp()
     # add gloo env
-    global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "1"))
+    global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "0"))
     global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3"
     global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir
 
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index 93c7d8a6ab..32d2f784e0 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -954,7 +954,7 @@ class ParameterServerLauncher(object):
                 "TRAINING_ROLE": "PSERVER",
                 "PADDLE_TRAINERS_NUM": str(self.worker_num),
                 "POD_IP": cur_server.endpoint.split(":")[0],
-                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "1")),
+                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "0")),
                 "PADDLE_GLOO_RENDEZVOUS": "3",
                 "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir,
                 "PADDLE_GLOO_HTTP_ENDPOINT": self.http_port
@@ -1018,7 +1018,7 @@ class ParameterServerLauncher(object):
                 self.heter_worker_endpoints,
                 "TRAINING_ROLE": "TRAINER",
                 "PADDLE_TRAINER_ID": str(cur_worker.rank),
-                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "1")),
+                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "0")),
                 "PADDLE_GLOO_RENDEZVOUS": "3",
                 "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir,
                 "FLAGS_selected_gpus": "0",
@@ -1088,7 +1088,7 @@ class ParameterServerLauncher(object):
                 "TRAINING_ROLE": "HETER_TRAINER",
                 "PADDLE_TRAINERS_NUM": str(self.worker_num),
                 "POD_IP": cur_heter_worker.endpoint.split(":")[0],
-                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "1")),
+                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "0")),
                 "PADDLE_GLOO_RENDEZVOUS": "3",
                 "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir,
                 "FLAGS_selected_gpus": "0",
diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py
index be66e13aa1..c41c3663a1 100644
--- a/python/paddle/distributed/parallel.py
+++ b/python/paddle/distributed/parallel.py
@@ -142,21 +142,23 @@ def init_parallel_env():
     _check_var_exists("PADDLE_TRAINER_ENDPOINTS")
 
     # 3: init gloo context (step 1: httpsever start)
-    ep_rank_0 = parallel_env.trainer_endpoints[0].split(":")
-    ep_rank = parallel_env.trainer_endpoints[parallel_env.rank].split(":")
-    manager = Manager()
-    # glboal dict to store status
-    http_server_d = manager.dict()
-    http_server_d["running"] = False
-    if parallel_env.rank == 0:
-        # The scope for worker used by http server is '_worker'
-        size = {'_worker': parallel_env.world_size}
-        http_server = Process(
-            target=_start_kv_server,
-            args=(int(ep_rank_0[1]), http_server_d, size))
-        http_server.daemon = True
-        http_server_d["running"] = True
-        http_server.start()
+    init_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0"))
+    if init_gloo:
+        ep_rank_0 = parallel_env.trainer_endpoints[0].split(":")
+        ep_rank = parallel_env.trainer_endpoints[parallel_env.rank].split(":")
+        manager = Manager()
+        # glboal dict to store status
+        http_server_d = manager.dict()
+        http_server_d["running"] = False
+        if parallel_env.rank == 0:
+            # The scope for worker used by http server is '_worker'
+            size = {'_worker': parallel_env.world_size}
+            http_server = Process(
+                target=_start_kv_server,
+                args=(int(ep_rank_0[1]), http_server_d, size))
+            http_server.daemon = True
+            http_server_d["running"] = True
+            http_server.start()
 
     # 4. init NCCL ParallelStrategy
     strategy = ParallelStrategy()
@@ -185,22 +187,23 @@
     # dividing init_gloo into two part beacause nccl and gloo
     # are separately looking for free ports which sometimes
    # leads to port-conflict.
-    wait_server_ready([parallel_env.trainer_endpoints[0]])
-
-    gloo_strategy = core.GlooParallelStrategy()
-    gloo_strategy.rank = parallel_env.rank
-    gloo_strategy.rank_num = parallel_env.world_size
-    gloo_strategy.ip_address = ep_rank_0[0]
-    gloo_strategy.ip_port = int(ep_rank_0[1])
-    default_init_timeout_seconds = 3600
-    default_run_timeout_seconds = 9999999
-    gloo_strategy.init_seconds = default_init_timeout_seconds
-    gloo_strategy.run_seconds = default_run_timeout_seconds
-    gloo = core.GlooParallelContext(gloo_strategy)
-    gloo.init()
-    if parallel_env.rank == 0:
-        http_server_d["running"] = False
-        http_server.join()
+    if init_gloo:
+        wait_server_ready([parallel_env.trainer_endpoints[0]])
+
+        gloo_strategy = core.GlooParallelStrategy()
+        gloo_strategy.rank = parallel_env.rank
+        gloo_strategy.rank_num = parallel_env.world_size
+        gloo_strategy.ip_address = ep_rank_0[0]
+        gloo_strategy.ip_port = int(ep_rank_0[1])
+        default_init_timeout_seconds = 3600
+        default_run_timeout_seconds = 9999999
+        gloo_strategy.init_seconds = default_init_timeout_seconds
+        gloo_strategy.run_seconds = default_run_timeout_seconds
+        gloo = core.GlooParallelContext(gloo_strategy)
+        gloo.init()
+        if parallel_env.rank == 0:
+            http_server_d["running"] = False
+            http_server.join()
 
 
 def get_rank():
diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
index f883e220f9..9a41f8c55a 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
@@ -178,6 +178,7 @@ class TestDistBase(unittest.TestCase):
             "LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
             "GLOG_v": "0",
             "NCCL_P2P_DISABLE": "1",
+            "PADDLE_WITH_GLOO": "1",
             "BACKEND": backend,
             "PATH_ID": path_id
         }
--
GitLab
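
Usage sketch (illustrative, not taken from the patch itself): after this change,
paddle.distributed.init_parallel_env() only starts the rank-0 KV HTTP server and the
gloo parallel context when the environment variable PADDLE_WITH_GLOO is "1", and the
launcher and parameter-server code paths likewise default it to "0". Keeping gloo off
by default avoids the extra free-port search that, as the in-code comment notes, can
conflict with the ports NCCL picks. A minimal, hypothetical way to opt back in from a
training script that is still started through paddle.distributed.launch (so the usual
PADDLE_TRAINER_* variables are present) could look like the snippet below; the use of
setdefault and the placement are illustrative choices, not prescribed by the patch:

    import os

    # Re-enable the gloo rendezvous that this patch turns off by default.
    # The variable must be set before init_parallel_env() reads PADDLE_WITH_GLOO.
    os.environ.setdefault("PADDLE_WITH_GLOO", "1")

    import paddle.distributed as dist

    # With PADDLE_WITH_GLOO=1, rank 0 also launches the KV http server and all
    # ranks initialize the gloo parallel context in addition to NCCL.
    dist.init_parallel_env()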