diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
index 11bb897a678b7ca46359af37cf150d14ba67fde2..13bb9d2acece283a94432f58042843ca969b5de5 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
@@ -67,10 +67,11 @@ class HybridParallelGradScaler:
         # allreduce_max found_inf in check_group
         if self._is_mp:
             self._found_inf = paddle.cast(self._found_inf, dtype="int32")
+            # TODO(shenliang03) Since the minimize call in the optimizer is
+            # after the gradscaler, check_finite needs to synchronize global
+            # information. In the future, we should use check_group
             paddle.distributed.all_reduce(
-                self._found_inf,
-                op=paddle.distributed.ReduceOp.MAX,
-                group=self._hcg.get_check_parallel_group())
+                self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None)
             self._found_inf = paddle.cast(self._found_inf, dtype="bool")
 
     def __getattr__(self, item):
diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
index e2db689eb76740b385a1e3340bc802a2425c0946..669ed032a3443862b8776c6ae79f221f7f362979 100644
--- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
+++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
@@ -77,7 +77,7 @@ class PipelineLayer(Layer):
         self.layers = layers
         self._loss_fn = loss_fn
         self._topo = topology
-        word_size = dist.get_world_size()
+        world_size = dist.get_world_size()
         self.global_rank = dist.get_rank()
 
         if self._topo:
@@ -88,11 +88,11 @@ class PipelineLayer(Layer):
                 self._num_stages)
         else:
             # construct default topology
-            if word_size % num_stages != 0:
+            if world_size % num_stages != 0:
                 raise ValueError("should provide correct num_stages({}) "
-                                 "which can be divided by word_size({})".format(
-                                     num_stages, word_size))
-            dp_num = word_size // num_stages
+                                 "which can be divided by world_size({})".
+                                 format(num_stages, world_size))
+            dp_num = world_size // num_stages
             self._topo = fleet.CommunicateTopology(["data", "pipe", "model"],
                                                    [dp_num, num_stages, 1])
         self._stage_id = self._topo.get_coord(self.global_rank).pipe
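---

For reviewers, a brief illustration of the two behaviors this patch touches.
Both snippets below are standalone sketches, not Paddle code: the helper name
coord_of_rank is hypothetical, and the row-major rank layout over the
[dp_num, num_stages, 1] grid is an assumption, not something this diff shows.

    # 1) The gradscaler hunk: casting the bool found_inf flag to int32 and
    #    all-reducing with ReduceOp.MAX across *all* ranks (group=None) is a
    #    distributed logical OR: if any rank saw inf/nan, every rank ends up
    #    with found_inf == True. Single-process analogue of that reduction:
    per_rank_found_inf = [False, True, False]            # one flag per rank
    global_found_inf = bool(max(map(int, per_rank_found_inf)))
    assert global_found_inf is True

    # 2) The pp_layers hunk: the default ["data", "pipe", "model"] topology
    #    has shape [dp_num, num_stages, 1] with dp_num = world_size // num_stages,
    #    which is why world_size must be divisible by num_stages.
    def coord_of_rank(rank, world_size, num_stages):
        """Hypothetical helper: map a global rank to (data, pipe, model)
        coordinates, assuming row-major ordering over the grid."""
        if world_size % num_stages != 0:
            # Same precondition the patched code enforces with ValueError.
            raise ValueError("should provide correct num_stages({}) "
                             "which can be divided by world_size({})".format(
                                 num_stages, world_size))
        return rank // num_stages, rank % num_stages, 0

    # 8 ranks with 4 pipeline stages -> 2 data-parallel replicas of a 4-stage
    # pipeline; the .pipe coordinate read by the patch is the middle element.
    for r in range(8):
        print(r, coord_of_rank(r, world_size=8, num_stages=4))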