diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 32b8f1189fd65ba1e8da5aeaf316fc0ae05af552..6845e3b6b1f4875349591d15fdc3fd82318bd668 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -271,7 +271,8 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
                     "All parameters' 'clip_norm' of a same group should be the same"
                 )
 
-        local_norm_var = layers.reduce_sum(input=layers.pow(x=grad, factor=2.0))
+        square = grad * grad
+        local_norm_var = layers.cast(layers.reduce_sum(input=square), 'float64')
         context[self.group_name].append(local_norm_var)
 
         self.context = context
@@ -281,6 +282,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
         if group_scale_name not in self.context:
             group_norm_var = layers.sums(input=self.context[self.group_name])
             group_norm_var = layers.sqrt(x=group_norm_var)
+            group_norm_var = layers.cast(group_norm_var, 'float32')
             clip_var = self.context[self.group_name + "_clip"]
             group_scale_var = layers.elementwise_div(
                 x=clip_var,
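For context, the diff changes how `GradientClipByGlobalNorm` computes the global norm: each gradient's squared norm is now built with an elementwise multiply instead of `layers.pow`, accumulated in float64, and only cast back to float32 after the square root. Below is a minimal NumPy sketch of the same global-norm clipping math, intended only to illustrate the numerics; the function name `clip_by_global_norm` and its signature are hypothetical, not Paddle API.

```python
import numpy as np

def clip_by_global_norm(grads, clip_norm):
    """Illustrative NumPy version of the clipping math in this diff.

    Squared per-gradient norms are accumulated in float64 (mirroring the
    added layers.cast(..., 'float64')) so that summing many squared float32
    values does not overflow or lose precision; the result is cast back to
    float32 after the square root, as the second hunk does.
    """
    # Per-gradient squared norms, accumulated at higher precision.
    local_sq_norms = [np.sum((g * g).astype(np.float64)) for g in grads]
    global_norm = np.float32(np.sqrt(np.sum(local_sq_norms)))

    # scale = clip_norm / max(clip_norm, global_norm): the identity when the
    # global norm is already within the limit, a shrink factor otherwise.
    scale = clip_norm / max(clip_norm, global_norm)
    return [g * scale for g in grads], global_norm

# Usage: two float32 gradients whose combined norm exceeds clip_norm=1.0,
# so both are scaled down by the same factor.
grads = [np.ones((3,), dtype=np.float32), 2 * np.ones((4,), dtype=np.float32)]
clipped, norm = clip_by_global_norm(grads, clip_norm=1.0)
print(norm)  # global norm before clipping
```

The float64 accumulation matters because the per-parameter squared norms can individually approach the float32 range; summing them across a large parameter group in float32 risks saturation and a wrong (or infinite) global norm, while the final cast keeps the scale factor in the gradients' own dtype.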