Disable gloo by default (#29805)

* update, test=develop

Disable gloo by default (#29805)
* update, test=develop
b0bd93de · lilong12 · GitHub · b6fd2629 · b0bd93de · b0bd93de
5 changed files
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -220,15 +220,8 @@ class Gloo(object):
            rank, nodes = self._get_rank_nodes(Role.WORKER)
            gloo = init(rank, nodes, "WORKER")
            self._worker_comm = gloo
-        else:
+        # TODO (sandyhouse): initialize gloo for server and all
-            rank, nodes = self._get_rank_nodes(Role.SERVER)
-            gloo = init(rank, nodes, "SERVER")
-            self._server_comm = gloo
-        if self._need_init_all:
-            rank, nodes = self._get_rank_nodes(Role.ALL)
-            gloo = init(rank, nodes, "ALL")
-            self._nodes_comm = gloo
        if start_http_server:
            http_server_d["running"] = False
            http_server.join()

--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -219,7 +219,7 @@ def launch_collective(args):
    global_envs = copy.copy(os.environ.copy())
    gloo_rendezvous_dir = tempfile.mkdtemp()
    # add gloo env
-    global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "1"))
+    global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "0"))
    global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3"
    global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir

--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -954,7 +954,7 @@ class ParameterServerLauncher(object):
                "TRAINING_ROLE": "PSERVER",
                "PADDLE_TRAINERS_NUM": str(self.worker_num),
                "POD_IP": cur_server.endpoint.split(":")[0],
-                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "1")),
+                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "0")),
                "PADDLE_GLOO_RENDEZVOUS": "3",
                "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir,
                "PADDLE_GLOO_HTTP_ENDPOINT": self.http_port
@@ -1018,7 +1018,7 @@ class ParameterServerLauncher(object):
                self.heter_worker_endpoints,
                "TRAINING_ROLE": "TRAINER",
                "PADDLE_TRAINER_ID": str(cur_worker.rank),
-                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "1")),
+                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "0")),
                "PADDLE_GLOO_RENDEZVOUS": "3",
                "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir,
                "FLAGS_selected_gpus": "0",
@@ -1088,7 +1088,7 @@ class ParameterServerLauncher(object):
                "TRAINING_ROLE": "HETER_TRAINER",
                "PADDLE_TRAINERS_NUM": str(self.worker_num),
                "POD_IP": cur_heter_worker.endpoint.split(":")[0],
-                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "1")),
+                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "0")),
                "PADDLE_GLOO_RENDEZVOUS": "3",
                "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir,
                "FLAGS_selected_gpus": "0",

--- a/python/paddle/distributed/parallel.py
+++ b/python/paddle/distributed/parallel.py
@@ -142,6 +142,8 @@ def init_parallel_env():
    _check_var_exists("PADDLE_TRAINER_ENDPOINTS")
    # 3: init gloo context (step 1: httpsever start)
+    init_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0"))
+    if init_gloo:
        ep_rank_0 = parallel_env.trainer_endpoints[0].split(":")
        ep_rank = parallel_env.trainer_endpoints[parallel_env.rank].split(":")
        manager = Manager()
@@ -185,6 +187,7 @@ def init_parallel_env():
    # dividing init_gloo into two part beacause nccl and gloo
    # are separately looking for free ports which sometimes
    # leads to port-conflict.
+    if init_gloo:
        wait_server_ready([parallel_env.trainer_endpoints[0]])
        gloo_strategy = core.GlooParallelStrategy()

--- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
@@ -178,6 +178,7 @@ class TestDistBase(unittest.TestCase):
            "LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
            "GLOG_v": "0",
            "NCCL_P2P_DISABLE": "1",
+            "PADDLE_WITH_GLOO": "1",
            "BACKEND": backend,
            "PATH_ID": path_id
        }