Commit 3aef5224 authored by Megvii Engine Team

refactor(distributed): remove the shm backend for distributed training

GitOrigin-RevId: ab76f23f9dc6a4452fcde58fac6078f4c24af352
Parent 21849d79
@@ -26,7 +26,7 @@ from .server import Client, Server
 @mproperty
 def backend(mod):
     r"""Get or set backend of collective communication.
-    Available backends are ['nccl', 'shm', 'rccl']
+    Available backends are ['nccl', 'rccl']
     Examples:
......
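The hunk above narrows the documented values of the module-level `backend` property to NCCL and RCCL. A minimal usage sketch, assuming the package is imported as `megengine.distributed` (the import alias and the read-back are illustrative, not taken from the diff):

import megengine.distributed as dist

# only 'nccl' and 'rccl' are documented as available after this commit;
# 'shm' is no longer listed
dist.backend = "nccl"   # set the collective communication backend
print(dist.backend)     # read it back through the same property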
@@ -95,7 +95,7 @@ class Group:
 WORLD = Group([])
 _devices = {"gpu", "cuda", "rocm"}
-_backends = {"nccl", "rccl", "shm", "auto"}
+_backends = {"nccl", "rccl", "auto"}
 def init_process_group(
@@ -115,7 +115,7 @@ def init_process_group(
         world_size: total number of processes participating in the job.
         rank: rank of the current process.
         device: the GPU device id to bind this process to.
-        backend: communicator backend, currently support 'nccl' and 'shm'.
+        backend: communicator backend, currently support 'nccl' and 'rccl'.
     """
     physical_device_type = what_is_xpu() if device_type == "xpu" else device_type
     if not isinstance(master_ip, str):
......
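For context, a hedged sketch of calling `init_process_group` with the remaining NCCL backend. Only the parameters listed in the docstring above come from the diff; the `port` argument and all concrete values are assumptions for illustration:

import megengine.distributed as dist

dist.init_process_group(
    master_ip="127.0.0.1",   # ip address of the master node
    port=23456,              # assumed parameter, not shown in this hunk
    world_size=2,            # total number of processes in the job
    rank=0,                  # rank of the current process
    device=0,                # GPU device id bound to this process
    backend="nccl",          # 'shm' is no longer supported
)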
@@ -205,10 +205,7 @@ class AllreduceCallback:
         assert _group._sd, "please call init_process_group first"
         backend = _group._sd.backend
         if backend == "auto":
-            if group.is_single_machine and not _check_enable_p2p():
-                backend = "shm"
-            else:
-                backend = "nccl"
+            backend = "nccl"
         self._backend = backend

     def _reset(self):
......
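The hunk above is the behavioral core of the change: with the shm path removed, an "auto" backend always resolves to "nccl" instead of probing for a single machine without P2P support. A standalone restatement as a hypothetical helper, for illustration only (the real logic stays inline in AllreduceCallback):

def _resolve_backend(configured: str) -> str:
    # hypothetical helper mirroring the new behaviour: 'auto' simply
    # falls back to 'nccl'; the shm branch is gone
    if configured == "auto":
        return "nccl"
    return configured

assert _resolve_backend("auto") == "nccl"
assert _resolve_backend("rccl") == "rccl"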
@@ -31,10 +31,8 @@ MegRay::Backend mgb::opr::get_megray_backend(const std::string& backend) {
         return MegRay::MEGRAY_RCCL;
     } else if (backend == "ucx") {
         return MegRay::MEGRAY_UCX;
-    } else if (backend == "shm") {
-        return MegRay::MEGRAY_SHM;
     } else {
-        mgb_throw(MegBrainError, "back CollectiveComm backend");
+        mgb_throw(MegBrainError, "bad CollectiveComm backend");
     }
 }
......
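On the C++ side, the shm branch is dropped from the string-to-MegRay dispatch, so a "shm" argument now falls through to the error branch. A rough Python mirror of that mapping, for illustration only (the "nccl" entry is assumed, since its branch sits above the visible context):

_MEGRAY_BACKENDS = {
    "nccl": "MEGRAY_NCCL",  # assumed; not visible in the hunk above
    "rccl": "MEGRAY_RCCL",
    "ucx": "MEGRAY_UCX",
}

def get_megray_backend(backend: str) -> str:
    if backend not in _MEGRAY_BACKENDS:
        raise ValueError("bad CollectiveComm backend")
    return _MEGRAY_BACKENDS[backend]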