From c1db7e32128fe821c2adc02d6624f39589dad38b Mon Sep 17 00:00:00 2001
From: ShenLiang <1422485404@qq.com>
Date: Tue, 27 Apr 2021 09:57:24 +0800
Subject: [PATCH] [HybridParallel] Fix amp bug in ModelParallel (#32579)

* fix amp bug

* fix name of wordsize
---
 .../dygraph_optimizer/hybrid_parallel_gradscaler.py    |  7 ++++---
 .../fleet/meta_parallel/parallel_layers/pp_layers.py   | 10 +++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
index 11bb897a67..13bb9d2ace 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
@@ -67,10 +67,11 @@ class HybridParallelGradScaler:
         # allreduce_max found_inf in check_group
         if self._is_mp:
             self._found_inf = paddle.cast(self._found_inf, dtype="int32")
+            # TODO(shenliang03) Since the minimize call in the optimizer is
+            # after the gradscaler, check_finite needs to synchronize global
+            # information. In the future, we should use check_group
             paddle.distributed.all_reduce(
-                self._found_inf,
-                op=paddle.distributed.ReduceOp.MAX,
-                group=self._hcg.get_check_parallel_group())
+                self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None)
             self._found_inf = paddle.cast(self._found_inf, dtype="bool")
 
     def __getattr__(self, item):
diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
index e2db689eb7..669ed032a3 100644
--- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
+++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
@@ -77,7 +77,7 @@ class PipelineLayer(Layer):
         self.layers = layers
         self._loss_fn = loss_fn
         self._topo = topology
-        word_size = dist.get_world_size()
+        world_size = dist.get_world_size()
         self.global_rank = dist.get_rank()
 
         if self._topo:
@@ -88,11 +88,11 @@ class PipelineLayer(Layer):
                 self._num_stages)
         else:
             # construct default topology
-            if word_size % num_stages != 0:
+            if world_size % num_stages != 0:
                 raise ValueError("should provide correct num_stages({}) "
-                                 "which can be divided by word_size({})".format(
-                                     num_stages, word_size))
-            dp_num = word_size // num_stages
+                                 "which can be divided by world_size({})".
+                                 format(num_stages, world_size))
+            dp_num = world_size // num_stages
             self._topo = fleet.CommunicateTopology(["data", "pipe", "model"],
                                                    [dp_num, num_stages, 1])
         self._stage_id = self._topo.get_coord(self.global_rank).pipe
--
GitLab
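
Note (not part of the upstream patch): a minimal standalone sketch of the synchronization
pattern this fix switches to. It assumes a multi-process launch (e.g. via
paddle.distributed.launch); the file name and tensor values are illustrative only. With
group=None, the MAX all-reduce runs over the global communicator instead of the hybrid
communicate group's check_parallel_group, so every rank observes an overflow detected on
any rank before the optimizer's minimize runs.

# found_inf_sketch.py -- hypothetical illustration, not shipped with Paddle.
import paddle
import paddle.distributed as dist

dist.init_parallel_env()

# Pretend only rank 0 saw an inf/nan while unscaling gradients.
found_inf = paddle.to_tensor([1.0 if dist.get_rank() == 0 else 0.0])

# Same pattern as the patched check: bool-like flag -> int32,
# global MAX all-reduce (group=None), then back to bool.
found_inf = paddle.cast(found_inf, dtype="int32")
dist.all_reduce(found_inf, op=dist.ReduceOp.MAX, group=None)
found_inf = paddle.cast(found_inf, dtype="bool")

# Every rank now reports True, so all ranks skip the step together.
print(dist.get_rank(), bool(found_inf))

For the pp_layers.py hunks, the rename from word_size to world_size does not change the
default-topology arithmetic: with world_size = 8 and num_stages = 2, dp_num = 8 // 2 = 4,
yielding a ["data", "pipe", "model"] topology of [4, 2, 1].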