From 3689a126c30a85cce3e314bb5ffcc8aaffc1155a Mon Sep 17 00:00:00 2001
From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com>
Date: Fri, 17 Jun 2022 12:27:24 +0800
Subject: [PATCH] [Dygraph] Fix barrier bugs of ProcessGroup in Eager Mode
 (#43589)

* fix pg bugs

* update
---
 python/paddle/distributed/collective.py | 1 +
 python/paddle/distributed/parallel.py   | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
index b92b2a3c15..65018e7e2e 100644
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -378,6 +378,7 @@ def new_group(ranks=None, backend=None):
         _group_map_by_name[group_name] = group
         _group_map[gid] = group
 
+        paddle.distributed.barrier(group=group)
         return group
 
     if not backend:
diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py
index f0365cab8c..13a027db37 100644
--- a/python/paddle/distributed/parallel.py
+++ b/python/paddle/distributed/parallel.py
@@ -20,6 +20,7 @@ from multiprocessing import Manager  # noqa: F401
 import time
 import sys
 
+import paddle
 from paddle import compat as cpt
 
 # deprecated module import
@@ -253,6 +254,7 @@ def init_parallel_env():
         _set_group_map_by_name(_default_group_name, group)
         _set_group_map(0, group)
         parallel_helper._set_parallel_ctx(True)
+        paddle.distributed.barrier(group=group)
         return group
 
     node_num = set([i.split(":")[0] for i in parallel_env.trainer_endpoints])
-- 
GitLab