From 640f8cf01c585170744dd48a2c285b37bddd7d3c Mon Sep 17 00:00:00 2001 From: lilong12 Date: Thu, 31 Dec 2020 21:39:29 +0800 Subject: [PATCH] [Cherry-pick] Disable gloo by default #29559 #29805 (#29601) * update, test=develop (#29559) * Disable gloo by default (#29805) * update, test=develop * update, test=develop --- paddle/fluid/framework/fleet/gloo_wrapper.cc | 5 +- .../distributed/fleet/base/role_maker.py | 9 +-- python/paddle/distributed/fleet/launch.py | 2 +- .../paddle/distributed/fleet/launch_utils.py | 6 +- python/paddle/distributed/parallel.py | 65 ++++++++++--------- .../unittests/test_collective_api_base.py | 2 + 6 files changed, 44 insertions(+), 45 deletions(-) diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc index f4b2d2d7d1..8780db89e8 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.cc +++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc @@ -272,8 +272,7 @@ void GlooWrapper::Init() { attr.iface = iface_; std::shared_ptr file_store = nullptr; std::shared_ptr http_store = nullptr; - auto context = - std::make_shared(rank_, size_); + auto context = std::make_shared(rank_, size_); context->setTimeout(run_timeout_); auto dev = gloo::transport::tcp::CreateDevice(attr); switch (store_type_) { @@ -295,6 +294,7 @@ void GlooWrapper::Init() { http_store->SetTimeoutSeconds(init_timeout_.count()); context->connectFullMesh(*http_store, dev); http_store->Finalize(); + VLOG(3) << "after calling http_store->Finalize."; break; } default: @@ -304,6 +304,7 @@ void GlooWrapper::Init() { context_ = std::move(context); #endif is_initialized_ = true; + VLOG(3) << "gloo initialized done."; } template std::vector GlooWrapper::AllReduce( diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index 2b9d2f4c27..a8683aea97 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -220,15 +220,8 @@ class Gloo(object): rank, nodes = self._get_rank_nodes(Role.WORKER) gloo = init(rank, nodes, "WORKER") self._worker_comm = gloo - else: - rank, nodes = self._get_rank_nodes(Role.SERVER) - gloo = init(rank, nodes, "SERVER") - self._server_comm = gloo + # TODO (sandyhouse): initialize gloo for server and all - if self._need_init_all: - rank, nodes = self._get_rank_nodes(Role.ALL) - gloo = init(rank, nodes, "ALL") - self._nodes_comm = gloo if start_http_server: http_server_d["running"] = False http_server.join() diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index a7490f770d..afc352f89c 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -219,7 +219,7 @@ def launch_collective(args): global_envs = copy.copy(os.environ.copy()) gloo_rendezvous_dir = tempfile.mkdtemp() # add gloo env - global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "1")) + global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "0")) global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3" global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 526d586f1c..971e387739 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -955,7 +955,7 @@ class ParameterServerLauncher(object): "TRAINING_ROLE": "PSERVER", "PADDLE_TRAINERS_NUM": str(self.worker_num), "POD_IP": cur_server.endpoint.split(":")[0], - "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "1")), + "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "0")), "PADDLE_GLOO_RENDEZVOUS": "3", "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir, "PADDLE_GLOO_HTTP_ENDPOINT": self.http_port @@ -1019,7 +1019,7 @@ class ParameterServerLauncher(object): self.heter_worker_endpoints, "TRAINING_ROLE": "TRAINER", "PADDLE_TRAINER_ID": str(cur_worker.rank), - "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "1")), + "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "0")), "PADDLE_GLOO_RENDEZVOUS": "3", "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir, "FLAGS_selected_gpus": "0", @@ -1089,7 +1089,7 @@ class ParameterServerLauncher(object): "TRAINING_ROLE": "HETER_TRAINER", "PADDLE_TRAINERS_NUM": str(self.worker_num), "POD_IP": cur_heter_worker.endpoint.split(":")[0], - "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "1")), + "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "0")), "PADDLE_GLOO_RENDEZVOUS": "3", "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir, "FLAGS_selected_gpus": "0", diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index be66e13aa1..c41c3663a1 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -142,21 +142,23 @@ def init_parallel_env(): _check_var_exists("PADDLE_TRAINER_ENDPOINTS") # 3: init gloo context (step 1: httpsever start) - ep_rank_0 = parallel_env.trainer_endpoints[0].split(":") - ep_rank = parallel_env.trainer_endpoints[parallel_env.rank].split(":") - manager = Manager() - # glboal dict to store status - http_server_d = manager.dict() - http_server_d["running"] = False - if parallel_env.rank == 0: - # The scope for worker used by http server is '_worker' - size = {'_worker': parallel_env.world_size} - http_server = Process( - target=_start_kv_server, - args=(int(ep_rank_0[1]), http_server_d, size)) - http_server.daemon = True - http_server_d["running"] = True - http_server.start() + init_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0")) + if init_gloo: + ep_rank_0 = parallel_env.trainer_endpoints[0].split(":") + ep_rank = parallel_env.trainer_endpoints[parallel_env.rank].split(":") + manager = Manager() + # glboal dict to store status + http_server_d = manager.dict() + http_server_d["running"] = False + if parallel_env.rank == 0: + # The scope for worker used by http server is '_worker' + size = {'_worker': parallel_env.world_size} + http_server = Process( + target=_start_kv_server, + args=(int(ep_rank_0[1]), http_server_d, size)) + http_server.daemon = True + http_server_d["running"] = True + http_server.start() # 4. init NCCL ParallelStrategy strategy = ParallelStrategy() @@ -185,22 +187,23 @@ def init_parallel_env(): # dividing init_gloo into two part beacause nccl and gloo # are separately looking for free ports which sometimes # leads to port-conflict. - wait_server_ready([parallel_env.trainer_endpoints[0]]) - - gloo_strategy = core.GlooParallelStrategy() - gloo_strategy.rank = parallel_env.rank - gloo_strategy.rank_num = parallel_env.world_size - gloo_strategy.ip_address = ep_rank_0[0] - gloo_strategy.ip_port = int(ep_rank_0[1]) - default_init_timeout_seconds = 3600 - default_run_timeout_seconds = 9999999 - gloo_strategy.init_seconds = default_init_timeout_seconds - gloo_strategy.run_seconds = default_run_timeout_seconds - gloo = core.GlooParallelContext(gloo_strategy) - gloo.init() - if parallel_env.rank == 0: - http_server_d["running"] = False - http_server.join() + if init_gloo: + wait_server_ready([parallel_env.trainer_endpoints[0]]) + + gloo_strategy = core.GlooParallelStrategy() + gloo_strategy.rank = parallel_env.rank + gloo_strategy.rank_num = parallel_env.world_size + gloo_strategy.ip_address = ep_rank_0[0] + gloo_strategy.ip_port = int(ep_rank_0[1]) + default_init_timeout_seconds = 3600 + default_run_timeout_seconds = 9999999 + gloo_strategy.init_seconds = default_init_timeout_seconds + gloo_strategy.run_seconds = default_run_timeout_seconds + gloo = core.GlooParallelContext(gloo_strategy) + gloo.init() + if parallel_env.rank == 0: + http_server_d["running"] = False + http_server.join() def get_rank(): diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index 84b58f15f8..b21e0ddafc 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -169,6 +169,7 @@ class TestDistBase(unittest.TestCase): path_id="0", check_error_log=False, need_envs={}): + with_gloo = '0' if backend == "nccl" else '1' required_envs = { "FLAGS_fraction_of_gpu_memory_to_use": "0.15", "FLAGS_eager_delete_tensor_gb": "0.0", @@ -178,6 +179,7 @@ class TestDistBase(unittest.TestCase): "LD_PRELOAD": os.getenv("LD_PRELOAD", ""), "GLOG_v": "0", "NCCL_P2P_DISABLE": "1", + "PADDLE_WITH_GLOO": with_gloo, "BACKEND": backend, "PATH_ID": path_id } -- GitLab