未验证 提交 3689a126 作者: Haohongxiang 提交者: GitHub

[Dygraph] Fix barrier bugs of ProcessGroup in Eager Mode (#43589)

* fix pg bugs

* update
上级 19b87aec
@@ -378,6 +378,7 @@ def new_group(ranks=None, backend=None):
 _group_map_by_name[group_name] = group
 _group_map[gid] = group
+paddle.distributed.barrier(group=group)
 return group
 if not backend:
......
@@ -20,6 +20,7 @@ from multiprocessing import Manager # noqa: F401
 import time
 import sys
+import paddle
 from paddle import compat as cpt
 # deprecated module import
@@ -253,6 +254,7 @@ def init_parallel_env():
 _set_group_map_by_name(_default_group_name, group)
 _set_group_map(0, group)
 parallel_helper._set_parallel_ctx(True)
+paddle.distributed.barrier(group=group)
 return group
 node_num = set([i.split(":")[0] for i in parallel_env.trainer_endpoints])
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册