diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py
index 340eff46f7341697a83b1d1f1ead5576cb3dc15b..4796ad2f1f3f1d98766abf2a2587476b48e0dd43 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py
@@ -111,6 +111,7 @@ class GradientClipHelper(object):
             to_check_param - should_check_param)
 
         for var_name in deperated_vars:
-            block._remove_var(var_name, sync=False)
+            if block.has_var(var_name):
+                block._remove_var(var_name, sync=False)
         block._sync_with_cpp()
         return
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 8fd01509331e207af1aaabde1e40404f1a8c6f74..5e4ea24137e538558a1885b5748c80655d7dd6f1 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -489,9 +489,14 @@ class ClipGradByGlobalNorm(ClipGradBase):
                 continue
 
             with p.block.program._optimized_guard([p, g]):
-                new_grad = layers.elementwise_mul(x=g, y=scale_var)
-                param_new_grad_name_dict[p.name] = new_grad.name
-                params_and_grads.append((p, new_grad))
+                p.block.append_op(
+                    type='elementwise_mul',
+                    inputs={'X': g,
+                            'Y': scale_var},
+                    outputs={'Out': g})
+
+            param_new_grad_name_dict[p.name] = g.name
+            params_and_grads.append((p, g))
 
         _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
         return params_and_grads
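
Below is a minimal, illustrative sketch (not part of the patch) of the in-place pattern the clip.py hunk switches to: an elementwise_mul op is appended directly to the block with the gradient variable reused as its own output, so no temporary output variable is created the way layers.elementwise_mul would. The program and variable names (main_prog, g, scale) are assumptions made for the example.

# Minimal sketch of the in-place elementwise_mul pattern; assumes a
# static-graph (fluid) Program. Names g, scale, main_prog are illustrative.
import paddle

paddle.enable_static()

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    g = paddle.ones(shape=[4], dtype='float32')        # stands in for a gradient
    scale = paddle.full(shape=[1], fill_value=0.5, dtype='float32')

    block = main_prog.global_block()
    # Reuse g as the output of elementwise_mul, mirroring the patched
    # ClipGradByGlobalNorm: the scaled value overwrites g instead of
    # materializing a new gradient variable in the block.
    block.append_op(
        type='elementwise_mul',
        inputs={'X': g, 'Y': scale},
        outputs={'Out': g})

Because the gradient variable is reused, the bookkeeping after the op can keep recording the original gradient (g.name and (p, g)) rather than a freshly created variable.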