[HybridParallel]fix bug of check_inf in fleet_base.py (#36651)

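* fix bug of check_inf: keep the found_inf tensor passed to all_reduce and read the reduced value back into self._found_inf
* fix allreduce: all-reduce the coalesced gradients before dividing them by nranks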

[HybridParallel]fix bug of check_inf in fleet_base.py (#36651)
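* fix bug of check_inf: keep the found_inf tensor passed to all_reduce and read the reduced value back into self._found_inf
* fix allreduce: all-reduce the coalesced gradients before dividing them by nranks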
59d8b8cb · Haohongxiang · GitHub · 50778ad6 · 59d8b8cb · 59d8b8cb
2 changed files
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -1586,16 +1586,16 @@ class Fleet(object):
                _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
                                                param_grads_fp32,
                                                temp_found_inf_fp32)
            self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
+            is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
            # TODO(shenliang03) Since dp allreduce in the optimizer is 
            # after the gradscaler, check_finite needs to synchronize global 
            # information. In the future, we should use check_group to speed.
            paddle.distributed.all_reduce(
-                paddle.to_tensor(
+                is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None)
-                    [self._found_inf], dtype="int32"),
+            self._found_inf = is_found_inf.numpy()[0]
-                op=paddle.distributed.ReduceOp.MAX,
-                group=None)
        # Only tensor_parallel and pipeline_parallel need to modify scaler
        if self._hcg.get_parallel_mode() in (ParallelMode.TENSOR_PARALLEL,

--- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
+++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
@@ -47,6 +47,7 @@ def _apply_collective_grads(parameters, comm_group):
        nranks = paddle.distributed.get_world_size(
        ) if comm_group is None else comm_group.nranks
        div_factor = paddle.to_tensor(nranks, dtype=coalesced_grad.dtype)
+        paddle.distributed.all_reduce(coalesced_grad, group=comm_group)
        paddle.fluid.framework._dygraph_tracer().trace_op(
            type="elementwise_div",
            inputs={'X': coalesced_grad,
@@ -54,8 +55,6 @@ def _apply_collective_grads(parameters, comm_group):
            outputs={'Out': coalesced_grad},
            attrs={'axis': -1})
-        paddle.distributed.all_reduce(coalesced_grad, group=comm_group)
    _split_tensors(coalesced_grads_and_vars)