diff --git a/paddle/phi/kernels/shape_kernel.cc b/paddle/phi/kernels/shape_kernel.cc
index dd26a7edc9cdd8e1917bb5d88e957b3e7d545f93..f87b5014c12072ab536c2969804e7b9439cacdc0 100644
--- a/paddle/phi/kernels/shape_kernel.cc
+++ b/paddle/phi/kernels/shape_kernel.cc
@@ -63,5 +63,7 @@ PD_REGISTER_KERNEL(shape,
                    double,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16) {
+  kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
+}
 #endif
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 826deae498cb598028f4922590896d6373ed29d3..ff1bacb5c9b6732fcdc9a5a94130c909cb4bfc6e 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -468,10 +468,15 @@ class ClipGradByGlobalNorm(ClipGradBase):
             sdg.step()
     """
 
-    def __init__(self, clip_norm, group_name="default_group"):
+    def __init__(self,
+                 clip_norm,
+                 group_name="default_group",
+                 auto_skip_clip=False):
         super(ClipGradByGlobalNorm, self).__init__()
         self.clip_norm = float(clip_norm)
         self.group_name = group_name
+        assert isinstance(auto_skip_clip, bool)
+        self.auto_skip_clip = auto_skip_clip
 
     def __str__(self):
         return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm)
@@ -524,14 +529,19 @@ class ClipGradByGlobalNorm(ClipGradBase):
         max_global_norm = layers.fill_constant(
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
 
-        # only when global_norm_var > max_global_norm, grad need clip
         need_clip = False
-        if global_norm_var > max_global_norm:
+        if not self.auto_skip_clip:  # always apply clip
+            need_clip = True
+            clip_var = layers.elementwise_div(
+                x=max_global_norm,
+                y=layers.elementwise_max(
+                    x=global_norm_var, y=max_global_norm))
+        elif global_norm_var > max_global_norm:
+            # only when global_norm_var > max_global_norm, grad need clip
             need_clip = True
-
-        if need_clip:
             clip_var = layers.elementwise_div(
                 x=max_global_norm, y=global_norm_var)
+
         for p, g in params_grads:
             if g is None:
                 continue
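
A minimal usage sketch of the auto_skip_clip flag introduced above, assuming it is reachable through the public paddle.nn.ClipGradByGlobalNorm alias of fluid.clip.ClipGradByGlobalNorm (that alias path is an assumption, not part of this diff). With the default auto_skip_clip=False, the clip coefficient max_global_norm / max(global_norm, max_global_norm) is always applied; with auto_skip_clip=True, gradients are rescaled only when the global norm actually exceeds clip_norm.

    import paddle

    # Sketch only: exercise ClipGradByGlobalNorm with the new auto_skip_clip flag.
    linear = paddle.nn.Linear(10, 10)
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0, auto_skip_clip=True)
    sgd = paddle.optimizer.SGD(learning_rate=0.1,
                               parameters=linear.parameters(),
                               grad_clip=clip)

    x = paddle.rand([4, 10])
    loss = paddle.mean(linear(x))
    loss.backward()
    sgd.step()        # with auto_skip_clip=True, rescaling happens only if global_norm > clip_norm
    sgd.clear_grad()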