From 8ffcc7c85cd4538314bf3159dd8d37ba75d80e17 Mon Sep 17 00:00:00 2001
From: ShenLiang <1422485404@qq.com>
Date: Thu, 14 Oct 2021 10:57:24 +0800
Subject: [PATCH] [HybridParallel]Rebuild code for pipeline (#36396)

* add no_sync for parameters sync

* add pipeline for moe
---
 .../fleet/meta_parallel/pipeline_parallel.py | 55 +++++++++++--------
 python/paddle/fluid/dygraph/parallel.py      | 10 +++-
 2 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
index 431bc6d7bc..9096097397 100755
--- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
+++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
@@ -77,26 +77,15 @@ class PipelineParallel(MetaParallelBase):
         logger.info("start broadcast dp parameters")
         broadcast_dp_parameters(self._layers, self._hcg)
 
-    def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None):
-        assert isinstance(optimizer, HybridParallelOptimizer), (
-            'optimizer should be HybridParallelOptimizer subclass.')
-
-        assert fluid.framework._dygraph_tracer()._has_grad, (
-            'Please enable the generation of gradients.')
-
-        if self.is_first_stage or self.is_last_stage:
-            assert data is not None, (
-                "For the first and the last stage, the data must be set.")
-        else:
-            data = None
+    def forward_backward_pipeline(self, data, scaler=None):
+        # use the 1f1b scheduling strategy.
+        # this strategy is inspired by:
+        # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/schedules.py
 
-        self.optimizer = optimizer
-        self.lr_scheduler = lr_scheduler
         self.scaler = scaler
-        self.data = data
-        self._compute_loss = True
 
-        self._layers.train()
+        # store data for train
+        self.data = data
 
         # store total loss of entire batch
         self.total_loss = None
@@ -104,10 +93,6 @@ class PipelineParallel(MetaParallelBase):
         # store data id for micro_batch
         self.micro_batch_id = 0
 
-        # Next, use the 1f1b scheduling strategy.
-        # this strategy is inspired by:
-        # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/schedules.py
-
         startup_steps = (self.num_stages - self.stage_id - 1)
         startup_steps = min(startup_steps, self.accumulate_steps)
         steady_steps = self.accumulate_steps - startup_steps
@@ -161,11 +146,35 @@ class PipelineParallel(MetaParallelBase):
 
         self._layers.allreduce_shared_weight_gradients()
 
-        self.train_loss = self._broadcast_final_loss()
+        train_loss = self._broadcast_final_loss()
+
+        return train_loss
+
+    def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None):
+        assert isinstance(optimizer, HybridParallelOptimizer), (
+            'optimizer should be HybridParallelOptimizer subclass.')
+
+        assert fluid.framework._dygraph_tracer()._has_grad, (
+            'Please enable the generation of gradients.')
+
+        if self.is_first_stage or self.is_last_stage:
+            assert data is not None, (
+                "For the first and the last stage, the data must be set.")
+        else:
+            data = None
+
+        self.optimizer = optimizer
+        self.lr_scheduler = lr_scheduler
+
+        self._layers.train()
+
+        # 1f1b for pipeline
+        train_loss = self.forward_backward_pipeline(data, scaler)
 
         # optimizer
         self._optimizer_step()
-        return self.train_loss
+
+        return train_loss
 
     def eval_batch(self, data, compute_loss=False):
         self._layers.eval()
diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py
index e4525a8d17..7dd8d38aa7 100644
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
@@ -354,9 +354,15 @@ def sync_params_buffers(model,
         if not isinstance(param, core.VarBase):
             raise TypeError("The data type of '%s' must be Varbase" %
                             param.name)
+
         # is_distributed param not need to sync when in mp mode
-        if is_model_parallel and isinstance(param, ParamBase):
-            if param.is_distributed:
+        if isinstance(param, ParamBase):
+            if is_model_parallel and param.is_distributed:
+                continue
+
+            # NOTE(shenliang03): Support situations that do not require synchronization parameters,
+            # such as moe's expert parameters
+            if getattr(param, "no_sync", False):
                 continue
 
         model_vars.append(param.detach())
-- 
GitLab
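The sketch below is not part of the patch. It illustrates, under assumptions, how the no_sync attribute checked by the new getattr(param, "no_sync", False) branch in sync_params_buffers could be used to keep MoE expert parameters out of the data-parallel broadcast. Only the "no_sync" attribute name comes from the patch; the Expert layer and its sizes are made up for illustration.

# Illustrative only: mark expert parameters so sync_params_buffers() skips them.
# The "no_sync" attribute name matches the getattr check added by this patch;
# the Expert layer below is a hypothetical example, not Paddle API.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F


class Expert(nn.Layer):
    """A toy MoE expert whose weights should stay local to each rank."""

    def __init__(self, d_model, d_hidden):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_hidden)
        self.fc2 = nn.Linear(d_hidden, d_model)

    def forward(self, x):
        return self.fc2(F.gelu(self.fc1(x)))


expert = Expert(512, 2048)

# Tag every expert parameter; the new check in
# python/paddle/fluid/dygraph/parallel.py then leaves these parameters
# out of the broadcast performed by sync_params_buffers().
for p in expert.parameters():
    p.no_sync = True

On the pipeline side, train_batch keeps its public signature and now delegates the 1F1B schedule to forward_backward_pipeline before running the optimizer step, so existing callers are unaffected.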