[HybridParallel] Fix amp bug in ModelParallel (#32579)

* fix amp bug * fix name of wordsize

[HybridParallel] Fix amp bug in ModelParallel (#32579)
* fix amp bug * fix name of wordsize
c1db7e32 · ShenLiang · GitHub · 9930a582 · c1db7e32 · c1db7e32
2 changed file
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
@@ -67,10 +67,11 @@ class HybridParallelGradScaler:
        # allreduce_max found_inf in check_group
        if self._is_mp:
            self._found_inf = paddle.cast(self._found_inf, dtype="int32")
+            # TODO(shenliang03) Since the minimize call in the optimizer is 
+            # after the gradscaler, check_finite needs to synchronize global 
+            # information. In the future, we should use check_group
            paddle.distributed.all_reduce(
-                self._found_inf,
-                op=paddle.distributed.ReduceOp.MAX,
-                group=self._hcg.get_check_parallel_group())
+                self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None)
            self._found_inf = paddle.cast(self._found_inf, dtype="bool")

    def __getattr__(self, item):

--- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
+++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
@@ -77,7 +77,7 @@ class PipelineLayer(Layer):
        self.layers = layers
        self._loss_fn = loss_fn
        self._topo = topology
-        word_size = dist.get_world_size()
+        world_size = dist.get_world_size()
        self.global_rank = dist.get_rank()

        if self._topo:
@@ -88,11 +88,11 @@ class PipelineLayer(Layer):
                    self._num_stages)
        else:
            # construct default topology
-            if word_size % num_stages != 0:
+            if world_size % num_stages != 0:
                raise ValueError("should provide correct num_stages({}) "
-                                 "which can be divided by word_size({})".format(
-                                     num_stages, word_size))
-            dp_num = word_size // num_stages
+                                 "which can be divided by world_size({})".
+                                 format(num_stages, world_size))
+            dp_num = world_size // num_stages
            self._topo = fleet.CommunicateTopology(["data", "pipe", "model"],
                                                   [dp_num, num_stages, 1])
            self._stage_id = self._topo.get_coord(self.global_rank).pipe