@@ -314,18 +331,6 @@ class HybridParallelClipGrad:
group=self._hcg.get_pipe_parallel_group(),
)
# In Sharding mode, params and grads are mapped to different ranks in the optimizer,
# so ClipGradByGlobalNorm needs an allreduce to get the global norm.
# TODO(pangengzheng): remove the self.not_sharding_stage1 flag once HybridParallelClipGrad computes the same global norm values as pure dp.
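# Illustrative sketch (not part of the patch): one way the allreduce the
# comments above describe could look, assuming the public paddle.distributed
# API; the helper name and the `sharding_group` handle are hypothetical.
import paddle
import paddle.distributed as dist

def _sharded_global_grad_norm(local_grads, sharding_group):
    # Each rank only holds the gradient shards the sharding optimizer
    # assigned to it, so it first sums the squared norms it owns locally.
    local_sq_norm = paddle.add_n(
        [paddle.sum(paddle.square(g)) for g in local_grads]
    )
    # Allreduce the partial sums across the sharding group so every rank
    # arrives at the same global norm a pure-dp run would compute.
    dist.all_reduce(local_sq_norm, op=dist.ReduceOp.SUM, group=sharding_group)
    return paddle.sqrt(local_sq_norm)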