Commit 2b3a88d0 authored by Megvii Engine Team

feat(mge/distributed): add parameter replica_mode

GitOrigin-RevId: 244e4ca437e3427d65d395f4be01b4fe6ed92e91
Parent 44c381b6
@@ -31,6 +31,9 @@ class Parameter(Tensor):
         t = tensor(value, dtype=dtype, device=device, requires_grad=requires_grad)
         self.__dict__.update(t.__dict__)
 
+        # broadcast and allreduce will not be performed in optimizer if replica_mode is False
+        self.replica_mode = True
+
     @property
     def shape(self):
         r"""Return shape of parameter.
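The hunk above adds a per-parameter flag that defaults to True. A minimal usage sketch (not part of this commit; it only assumes the Parameter constructor shown above and the new attribute):

```python
import numpy as np
from megengine import Parameter

p = Parameter(np.zeros((4,), dtype="float32"))
assert p.replica_mode is True   # default: synchronized as before this change
p.replica_mode = False          # hypothetical opt-out: keep this parameter rank-local
```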
@@ -178,7 +178,7 @@ class Optimizer(metaclass=ABCMeta):
         assert len(grads) == len(params)
 
         for param, grad in zip(params, grads):
-            if is_distributed():
+            if is_distributed() and param.replica_mode:
                 with opr_priority_scope(cg, -(2 ** 30)):
                     # always run all_reduce_mean first except add_update
                     grad = (
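The condition added here is the core of the feature: a gradient is averaged across ranks only when training is distributed and the parameter kept replica_mode=True. A standalone sketch of that rule, assuming these names are re-exported from megengine.distributed as the optimizer's calls suggest; the all_reduce_mean signature is assumed by analogy with the bcast_param call in the next hunk, not copied from this commit:

```python
from megengine.distributed import all_reduce_mean, get_world_size, is_distributed

def maybe_average_grad(param, grad, key):
    # Skip the collective for single-process training or for parameters
    # that opted out via replica_mode=False.
    if is_distributed() and param.replica_mode:
        # Signature assumed: (tensor, key, nr_ranks), mirroring bcast_param.
        grad = all_reduce_mean(grad, "grad_" + str(key), get_world_size())
    return grad
```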
@@ -230,10 +230,14 @@
         key = 0
         for group in self.param_groups:
             for param in group["params"]:
-                bcast_param(
-                    param, "bcast_param_" + str(key), get_world_size(), get_rank() == 0,
-                )
-                key += 1
+                if param.replica_mode:
+                    bcast_param(
+                        param,
+                        "bcast_param_" + str(key),
+                        get_world_size(),
+                        get_rank() == 0,
+                    )
+                    key += 1
 
     def state_dict(self) -> Dict:
         r"""Export the optimizer state.
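Taken together, the three hunks mean the optimizer leaves a parameter with replica_mode=False entirely rank-local: it is neither broadcast from rank 0 nor has its gradient all-reduced. A hypothetical end-to-end sketch (the optimizer and process-group setup are ordinary MegEngine 0.x-era API, assumed rather than taken from this diff):

```python
import numpy as np
import megengine as mge
import megengine.optimizer as optim

# assume megengine.distributed.init_process_group(...) was called on every rank

w_shared = mge.Parameter(np.random.randn(4, 4).astype("float32"))  # replica_mode=True by default
w_local = mge.Parameter(np.zeros((4,), dtype="float32"))
w_local.replica_mode = False  # excluded from broadcast and gradient all-reduce

opt = optim.SGD([w_shared, w_local], lr=0.01)
# During distributed training the optimizer will:
#   * broadcast w_shared from rank 0 and average its gradient across ranks,
#   * leave w_local and its gradient untouched on each rank.
```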