未验证 提交 52ff3f48 编写于 作者: S ShenLiang 提交者: GitHub

fix pipeline on processgroup (#42989)

上级 5b86e190
...@@ -403,6 +403,11 @@ def new_group(ranks=None, backend=None): ...@@ -403,6 +403,11 @@ def new_group(ranks=None, backend=None):
_group_map_by_name[group_name] = group _group_map_by_name[group_name] = group
_group_map[gid] = group _group_map[gid] = group
# TODO(shenliang03): This is a temporary workaround for a hang caused by
# TCP.
tmp = paddle.to_tensor([1], dtype="int32")
paddle.distributed.all_reduce(tmp, group=group, use_calc_stream=True)
paddle.distributed.wait(tmp)
return group return group
if not backend: if not backend:
......
...@@ -18,6 +18,7 @@ from ...utils.log_util import logger ...@@ -18,6 +18,7 @@ from ...utils.log_util import logger
import numpy as np import numpy as np
from paddle import _C_ops from paddle import _C_ops
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.framework import _in_legacy_dygraph, _non_static_mode, in_dygraph_mode
_hcg = None _hcg = None
_use_cache = False _use_cache = False
...@@ -148,9 +149,15 @@ _send_recv_meta = SendRecvMeta() ...@@ -148,9 +149,15 @@ _send_recv_meta = SendRecvMeta()
def _is_valid_send_recv_partial(tensor, mp_degree):
    """Return whether ``tensor`` can be sent/received as ``mp_degree`` partial slices.

    Args:
        tensor: Object exposing a ``shape`` attribute; the element count is
            the product of that shape.
        mp_degree: Number of parts the tensor would be split into
            (presumably the model-parallel degree -- confirm against callers).

    Returns:
        A bool-like value. In legacy dygraph mode it is truthy only when
        ``mp_degree > 1`` and the element count is divisible by
        ``mp_degree``. In every other mode it is ``False``.

    Raises:
        AssertionError: In legacy dygraph mode, if the tensor has zero
            elements.
    """
    if _in_legacy_dygraph():
        tensor_numel = np.prod(tensor.shape)
        assert tensor_numel != 0, "can't send/recv zero element"
        return mp_degree > 1 and tensor_numel % mp_degree == 0

    # TODO(shenliang03) support mp+pp optimizer in future.
    # (partial_send/partial_recv/partial_allgather_)
    # Partial send/recv is disabled outside legacy dygraph mode. Return an
    # explicit False so this predicate never falls through and yields None.
    return False
def send_partial(tensor, def send_partial(tensor,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册