From 057202574bf2c4e15d4bb5f4d313488af3870202 Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Tue, 25 Jul 2023 10:57:27 +0800
Subject: [PATCH] Call multiply_ instead of scale_ to avoid multiple DtoH copy. (#55589)

* Call multiply_ instead of scale_ to avoid multiple DtoH copy.

* Call _squared_l2_norm to calculate grad_clip.

* Fix import error.
---
 .../dygraph_optimizer/hybrid_parallel_optimizer.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
index b24247b5807..cef28af4d47 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
@@ -103,8 +103,7 @@ class HybridParallelClipGrad:
             if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                 merge_grad = clip.merge_selected_rows(g)
                 merge_grad = clip.get_tensor_from_selected_rows(merge_grad)
-            square = paddle.square(merge_grad)
-            sum_square = paddle.sum(square)
+            sum_square = clip._squared_l2_norm(merge_grad)
 
             not_shared_enable = (not hasattr(p, 'is_firstly_shared')) or (
                 hasattr(p, 'is_firstly_shared')
@@ -230,15 +229,15 @@ class HybridParallelClipGrad:
             if getattr(p, 'need_clip', True) is False:
                 continue
             if g.dtype == paddle.float16:
-                g.scale_(clip_var_fp16)
+                g.multiply_(clip_var_fp16)
             elif g.dtype == paddle.bfloat16:
                 if paddle.is_compiled_with_xpu():
                     raise NotImplementedError(
                         "BF16 is not supported on XPU now"
                     )
-                g.scale_(clip_var_bf16)
+                g.multiply_(clip_var_bf16)
             else:
-                g.scale_(clip_var)
+                g.multiply_(clip_var)
             p._reset_grad_inplace_version(True)
 
         return params_grads
-- 
GitLab