From 0d9a23b4c36714f493a4c0468cdc663cf2e233ba Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Fri, 5 May 2023 21:40:45 +0800 Subject: [PATCH] [Dygraph] Fix bugs in dp_pp_comm_overlap for HybridParallel (#53384) --- .../distributed/fleet/meta_parallel/pipeline_parallel.py | 3 +++ python/paddle/distributed/fleet/optimizer.py | 9 ++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index ab1cf9701dd..2a7dd4d0bb7 100755 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -75,6 +75,9 @@ class PipelineParallel(MetaParallelBase): ].dp_comm_overlap self._dp_comm_buffers = [] + if self._dp_comm_overlap: + assert self.use_data_parallel and self.num_stages > 1 + p2p.initialize_p2p_groups( hcg, self._using_cache, self._enable_partial_send_recv ) diff --git a/python/paddle/distributed/fleet/optimizer.py b/python/paddle/distributed/fleet/optimizer.py index 89b0456fe6a..5abe7c47e9b 100755 --- a/python/paddle/distributed/fleet/optimizer.py +++ b/python/paddle/distributed/fleet/optimizer.py @@ -62,9 +62,16 @@ def _dygraph_distributed_optimizer(optimizer, strategy=None): if fleet_env.worker_num() > 1: if not fleet_env._user_defined_strategy.heter_ccl_mode: - return HybridParallelOptimizer( + hp_optim = HybridParallelOptimizer( optimizer, fleet_env._hcg, fleet_env._user_defined_strategy ) + + if fleet_env._user_defined_strategy.hybrid_configs[ + "pp_configs" + ].dp_comm_overlap: + hp_optim._dp_enable = False + + return hp_optim else: return HeterParallelOptimizer( optimizer, fleet_env._user_defined_strategy -- GitLab