From 89f5cd80207dfe0a60b2291bcfb7c04b2234c77c Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Fri, 28 Sep 2018 16:20:47 +0800
Subject: [PATCH] fix(clip): use double to accumulate grad^2

Global norm clipping needs to compute the L2 norm of the gradients,
i.e. it accumulates sum{grad^2}. Accumulating that sum in float32
easily overflows, so accumulate it in float64 and cast the result
back to float32 after the sqrt.

test=release/1.0.0
---
 python/paddle/fluid/clip.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 32b8f1189..6845e3b6b 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -271,7 +271,8 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
                     "All parameters' 'clip_norm' of a same group should be the same"
                 )
 
-        local_norm_var = layers.reduce_sum(input=layers.pow(x=grad, factor=2.0))
+        square = grad * grad
+        local_norm_var = layers.cast(layers.reduce_sum(input=square), 'float64')
         context[self.group_name].append(local_norm_var)
 
         self.context = context
@@ -281,6 +282,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
         if group_scale_name not in self.context:
             group_norm_var = layers.sums(input=self.context[self.group_name])
             group_norm_var = layers.sqrt(x=group_norm_var)
+            group_norm_var = layers.cast(group_norm_var, 'float32')
             clip_var = self.context[self.group_name + "_clip"]
             group_scale_var = layers.elementwise_div(
                 x=clip_var,
--
GitLab
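
Below is a minimal NumPy sketch, separate from the patch itself, illustrating the overflow this commit works around; the gradient size and the 1e18 value are hypothetical, chosen so that each squared gradient still fits in float32 while the accumulated sum does not.

# Hypothetical illustration, not part of the patch: each grad^2 (1e36) is
# representable in float32 (max ~3.4e38), but the sum of 1024 of them is not,
# so float32 accumulation saturates to inf while float64 stays finite.
import numpy as np

grad = np.full((1024,), 1.0e18, dtype=np.float32)

sum_sq_fp32 = np.sum(grad * grad)                   # float32 accumulation -> inf
sum_sq_fp64 = np.sum(grad.astype(np.float64) ** 2)  # float64 accumulation -> ~1.024e39

print(sum_sq_fp32)           # inf
print(np.sqrt(sum_sq_fp64))  # ~3.2e19, finite, small enough to cast back to float32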