Unverified · Commit 59d8b8cb authored by Haohongxiang, committed by GitHub

[HybridParallel]fix bug of check_inf in fleet_base.py (#36651)

* fix bug of check_inf

* fix allreduce
Parent 50778ad6
@@ -1586,16 +1586,16 @@ class Fleet(object):
             _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
                                             param_grads_fp32,
                                             temp_found_inf_fp32)
             self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
+            is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
 
             # TODO(shenliang03) Since dp allreduce in the optimizer is
             # after the gradscaler, check_finite needs to synchronize global
             # information. In the future, we should use check_group to speed.
             paddle.distributed.all_reduce(
-                paddle.to_tensor(
-                    [self._found_inf], dtype="int32"),
-                op=paddle.distributed.ReduceOp.MAX,
-                group=None)
+                is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None)
+            self._found_inf = is_found_inf.numpy()[0]
 
         # Only tensor_parallel and pipeline_parallel need to modify scaler
         if self._hcg.get_parallel_mode() in (ParallelMode.TENSOR_PARALLEL,
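For readers skimming the diff above: the check_inf bug was that the old code all-reduced a freshly created temporary tensor and never read the reduced value back, so self._found_inf on each rank kept its local value. The fix keeps a handle to the flag tensor, reduces it with MAX across ranks, and copies the result back. Below is a minimal standalone sketch of that pattern; the helper name sync_found_inf is illustrative and not part of the Paddle API.

import paddle
import paddle.distributed as dist


def sync_found_inf(local_found_inf, group=None):
    """Combine a per-rank overflow flag into a global flag.

    If any rank saw inf/nan gradients, every rank must skip the
    optimizer step, hence the MAX reduction over int32 flags.
    Assumes dist.init_parallel_env() has already been called.
    """
    # Keep a handle to the tensor so the reduced value can be read back.
    is_found_inf = paddle.to_tensor([int(local_found_inf)], dtype="int32")
    dist.all_reduce(is_found_inf, op=dist.ReduceOp.MAX, group=group)
    # Copy the globally reduced value back to a Python int.
    return int(is_found_inf.numpy()[0])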
@@ -47,6 +47,7 @@ def _apply_collective_grads(parameters, comm_group):
         nranks = paddle.distributed.get_world_size(
         ) if comm_group is None else comm_group.nranks
         div_factor = paddle.to_tensor(nranks, dtype=coalesced_grad.dtype)
+        paddle.distributed.all_reduce(coalesced_grad, group=comm_group)
         paddle.fluid.framework._dygraph_tracer().trace_op(
             type="elementwise_div",
             inputs={'X': coalesced_grad,
@@ -54,8 +55,6 @@ def _apply_collective_grads(parameters, comm_group):
             outputs={'Out': coalesced_grad},
             attrs={'axis': -1})
 
-        paddle.distributed.all_reduce(coalesced_grad, group=comm_group)
-
     _split_tensors(coalesced_grads_and_vars)
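The allreduce part of the fix reorders _apply_collective_grads so the coalesced gradient buffer is summed across ranks before it is divided by the rank count, instead of after. A rough standalone sketch of the resulting averaging order follows; it assumes an already coalesced gradient tensor, uses a plain out-of-place division instead of the traced in-place elementwise_div, and the helper name allreduce_average is illustrative only.

import paddle
import paddle.distributed as dist


def allreduce_average(coalesced_grad, comm_group=None):
    """Average a data-parallel gradient buffer across ranks.

    Mirrors the new ordering in the diff: all-reduce (sum) first,
    then divide by the number of participating ranks.
    """
    nranks = dist.get_world_size() if comm_group is None else comm_group.nranks
    dist.all_reduce(coalesced_grad, group=comm_group)  # in-place sum over ranks
    div_factor = paddle.to_tensor(nranks, dtype=coalesced_grad.dtype)
    # Plain (out-of-place) division here; the real helper divides in place
    # through a traced elementwise_div so the coalesced buffer is reused.
    return coalesced_grad / div_factor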