From 640f8cf01c585170744dd48a2c285b37bddd7d3c Mon Sep 17 00:00:00 2001
From: lilong12 <lilong12@baidu.com>
Date: Thu, 31 Dec 2020 21:39:29 +0800
Subject: [PATCH] [Cherry-pick] Disable gloo by default #29559 #29805 (#29601)

* update, test=develop (#29559)

* Disable gloo by default (#29805)

* update, test=develop

* update, test=develop
---
 paddle/fluid/framework/fleet/gloo_wrapper.cc  |  5 +-
 .../distributed/fleet/base/role_maker.py      |  9 +--
 python/paddle/distributed/fleet/launch.py     |  2 +-
 .../paddle/distributed/fleet/launch_utils.py  |  6 +-
 python/paddle/distributed/parallel.py         | 65 ++++++++++---------
 .../unittests/test_collective_api_base.py     |  2 +
 6 files changed, 44 insertions(+), 45 deletions(-)

diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc
index f4b2d2d7d18..8780db89e85 100644
--- a/paddle/fluid/framework/fleet/gloo_wrapper.cc
+++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc
@@ -272,8 +272,7 @@ void GlooWrapper::Init() {
   attr.iface = iface_;
   std::shared_ptr<gloo::rendezvous::HdfsStore> file_store = nullptr;
   std::shared_ptr<gloo::rendezvous::HTTPStore> http_store = nullptr;
-  auto context =
-      std::make_shared<gloo::rendezvous::ParallelConnectContext>(rank_, size_);
+  auto context = std::make_shared<gloo::rendezvous::Context>(rank_, size_);
   context->setTimeout(run_timeout_);
   auto dev = gloo::transport::tcp::CreateDevice(attr);
   switch (store_type_) {
@@ -295,6 +294,7 @@ void GlooWrapper::Init() {
       http_store->SetTimeoutSeconds(init_timeout_.count());
       context->connectFullMesh(*http_store, dev);
       http_store->Finalize();
+      VLOG(3) << "after calling http_store->Finalize.";
       break;
     }
     default:
@@ -304,6 +304,7 @@ void GlooWrapper::Init() {
   context_ = std::move(context);
 #endif
   is_initialized_ = true;
+  VLOG(3) << "gloo initialized done.";
 }
 
 template std::vector<int64_t> GlooWrapper::AllReduce<int64_t>(
diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
index 2b9d2f4c277..a8683aea97f 100644
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -220,15 +220,8 @@ class Gloo(object):
             rank, nodes = self._get_rank_nodes(Role.WORKER)
             gloo = init(rank, nodes, "WORKER")
             self._worker_comm = gloo
-        else:
-            rank, nodes = self._get_rank_nodes(Role.SERVER)
-            gloo = init(rank, nodes, "SERVER")
-            self._server_comm = gloo
+        # TODO (sandyhouse): initialize gloo for the SERVER and ALL roles
 
-        if self._need_init_all:
-            rank, nodes = self._get_rank_nodes(Role.ALL)
-            gloo = init(rank, nodes, "ALL")
-            self._nodes_comm = gloo
         if start_http_server:
             http_server_d["running"] = False
             http_server.join()
diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index a7490f770d9..afc352f89cb 100644
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -219,7 +219,7 @@ def launch_collective(args):
     global_envs = copy.copy(os.environ.copy())
     gloo_rendezvous_dir = tempfile.mkdtemp()
     # add gloo env
-    global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "1"))
+    global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "0"))
     global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3"
     global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir
 
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index 526d586f1c3..971e3877393 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -955,7 +955,7 @@ class ParameterServerLauncher(object):
                 "TRAINING_ROLE": "PSERVER",
                 "PADDLE_TRAINERS_NUM": str(self.worker_num),
                 "POD_IP": cur_server.endpoint.split(":")[0],
-                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "1")),
+                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "0")),
                 "PADDLE_GLOO_RENDEZVOUS": "3",
                 "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir,
                 "PADDLE_GLOO_HTTP_ENDPOINT": self.http_port
@@ -1019,7 +1019,7 @@ class ParameterServerLauncher(object):
                 self.heter_worker_endpoints,
                 "TRAINING_ROLE": "TRAINER",
                 "PADDLE_TRAINER_ID": str(cur_worker.rank),
-                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "1")),
+                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "0")),
                 "PADDLE_GLOO_RENDEZVOUS": "3",
                 "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir,
                 "FLAGS_selected_gpus": "0",
@@ -1089,7 +1089,7 @@ class ParameterServerLauncher(object):
                 "TRAINING_ROLE": "HETER_TRAINER",
                 "PADDLE_TRAINERS_NUM": str(self.worker_num),
                 "POD_IP": cur_heter_worker.endpoint.split(":")[0],
-                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "1")),
+                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "0")),
                 "PADDLE_GLOO_RENDEZVOUS": "3",
                 "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir,
                 "FLAGS_selected_gpus": "0",
diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py
index be66e13aa1b..c41c3663a17 100644
--- a/python/paddle/distributed/parallel.py
+++ b/python/paddle/distributed/parallel.py
@@ -142,21 +142,23 @@ def init_parallel_env():
     _check_var_exists("PADDLE_TRAINER_ENDPOINTS")
 
     # 3: init gloo context (step 1: httpsever start)
-    ep_rank_0 = parallel_env.trainer_endpoints[0].split(":")
-    ep_rank = parallel_env.trainer_endpoints[parallel_env.rank].split(":")
-    manager = Manager()
-    # glboal dict to store status
-    http_server_d = manager.dict()
-    http_server_d["running"] = False
-    if parallel_env.rank == 0:
-        # The scope for worker used by http server is '_worker'
-        size = {'_worker': parallel_env.world_size}
-        http_server = Process(
-            target=_start_kv_server,
-            args=(int(ep_rank_0[1]), http_server_d, size))
-        http_server.daemon = True
-        http_server_d["running"] = True
-        http_server.start()
+    init_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0"))
+    if init_gloo:
+        ep_rank_0 = parallel_env.trainer_endpoints[0].split(":")
+        ep_rank = parallel_env.trainer_endpoints[parallel_env.rank].split(":")
+        manager = Manager()
+        # global dict to store status
+        http_server_d = manager.dict()
+        http_server_d["running"] = False
+        if parallel_env.rank == 0:
+            # The scope for worker used by http server is '_worker'
+            size = {'_worker': parallel_env.world_size}
+            http_server = Process(
+                target=_start_kv_server,
+                args=(int(ep_rank_0[1]), http_server_d, size))
+            http_server.daemon = True
+            http_server_d["running"] = True
+            http_server.start()
 
     # 4. init NCCL ParallelStrategy
     strategy = ParallelStrategy()
@@ -185,22 +187,23 @@ def init_parallel_env():
     # dividing init_gloo into two part beacause nccl and gloo
     # are separately looking for free ports which sometimes
     # leads to port-conflict.
-    wait_server_ready([parallel_env.trainer_endpoints[0]])
-
-    gloo_strategy = core.GlooParallelStrategy()
-    gloo_strategy.rank = parallel_env.rank
-    gloo_strategy.rank_num = parallel_env.world_size
-    gloo_strategy.ip_address = ep_rank_0[0]
-    gloo_strategy.ip_port = int(ep_rank_0[1])
-    default_init_timeout_seconds = 3600
-    default_run_timeout_seconds = 9999999
-    gloo_strategy.init_seconds = default_init_timeout_seconds
-    gloo_strategy.run_seconds = default_run_timeout_seconds
-    gloo = core.GlooParallelContext(gloo_strategy)
-    gloo.init()
-    if parallel_env.rank == 0:
-        http_server_d["running"] = False
-        http_server.join()
+    if init_gloo:
+        wait_server_ready([parallel_env.trainer_endpoints[0]])
+
+        gloo_strategy = core.GlooParallelStrategy()
+        gloo_strategy.rank = parallel_env.rank
+        gloo_strategy.rank_num = parallel_env.world_size
+        gloo_strategy.ip_address = ep_rank_0[0]
+        gloo_strategy.ip_port = int(ep_rank_0[1])
+        default_init_timeout_seconds = 3600
+        default_run_timeout_seconds = 9999999
+        gloo_strategy.init_seconds = default_init_timeout_seconds
+        gloo_strategy.run_seconds = default_run_timeout_seconds
+        gloo = core.GlooParallelContext(gloo_strategy)
+        gloo.init()
+        if parallel_env.rank == 0:
+            http_server_d["running"] = False
+            http_server.join()
 
 
 def get_rank():
diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
index 84b58f15f88..b21e0ddafc2 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
@@ -169,6 +169,7 @@ class TestDistBase(unittest.TestCase):
                          path_id="0",
                          check_error_log=False,
                          need_envs={}):
+        with_gloo = '0' if backend == "nccl" else '1'
         required_envs = {
             "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
             "FLAGS_eager_delete_tensor_gb": "0.0",
@@ -178,6 +179,7 @@ class TestDistBase(unittest.TestCase):
             "LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
             "GLOG_v": "0",
             "NCCL_P2P_DISABLE": "1",
+            "PADDLE_WITH_GLOO": with_gloo,
             "BACKEND": backend,
             "PATH_ID": path_id
         }
-- 
GitLab