diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index e884185528282021fd16289ccc6a3533e22b9967..4c24d0d6a7069c75c7b9b8245f4567ae8bfc2742 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -271,7 +271,8 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): "All parameters' 'clip_norm' of a same group should be the same" ) - local_norm_var = layers.reduce_sum(input=layers.pow(x=grad, factor=2.0)) + square = grad * grad + local_norm_var = layers.cast(layers.reduce_sum(input=square), 'float64') context[self.group_name].append(local_norm_var) self.context = context @@ -281,6 +282,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): if group_scale_name not in self.context: group_norm_var = layers.sums(input=self.context[self.group_name]) group_norm_var = layers.sqrt(x=group_norm_var) + group_norm_var = layers.cast(group_norm_var, 'float32') clip_var = self.context[self.group_name + "_clip"] group_scale_var = layers.elementwise_div( x=clip_var,