From adc26dffa9dac81bd93c88d70f0ab66fcdcc81f0 Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Mon, 15 Jan 2018 10:36:09 +0800
Subject: [PATCH] developing GradientClipByGlobalNorm

---
 python/paddle/v2/fluid/clip.py       | 54 ++++++++++++++++++++++++----
 python/paddle/v2/fluid/layers/ops.py | 20 ++++-------
 2 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/python/paddle/v2/fluid/clip.py b/python/paddle/v2/fluid/clip.py
index eb75018d779..f0904e18ea3 100644
--- a/python/paddle/v2/fluid/clip.py
+++ b/python/paddle/v2/fluid/clip.py
@@ -1,5 +1,6 @@
 import functools
 import layers
+from framework import Variable
 from . import core
 
 __all__ = [
@@ -44,7 +45,7 @@ def error_clip_callback(block, context):
 
 
 class BaseGradientClipAttr(object):
-    def process_context(self, context, p_g):
+    def process_context(self, context, param, grad):
         raise NotImplementedError()
 
     def create_operators(self, param, grad):
@@ -52,7 +53,7 @@ class BaseGradientClipAttr(object):
 
 
 class NullGradientClipAttr(BaseGradientClipAttr):
-    def process_context(self, context, p_g):
+    def process_context(self, context, param, grad):
         pass
 
     def create_operators(self, param, grad):
@@ -69,7 +70,7 @@ class GradientClipByValue(BaseGradientClipAttr):
         self.max = max
         self.min = min
 
-    def process_context(self, context, p_g):
+    def process_context(self, context, param, grad):
         pass
 
     def create_operators(self, param, grad):
@@ -81,7 +82,7 @@ class GradientClipByNorm(BaseGradientClipAttr):
     def __init__(self, clip_norm):
         self.clip_norm = clip_norm
 
-    def process_context(self, context, p_g):
+    def process_context(self, context, param, grad):
         pass
 
     def create_operators(self, param, grad):
@@ -89,6 +90,46 @@ class GradientClipByNorm(BaseGradientClipAttr):
         return param, new_grad
 
 
+class GradientClipByGlobalNorm(BaseGradientClipAttr):
+    global_norm_var = None
+    clip_norm_var = None
+    ratio_var = None
+
+    @classmethod
+    def init(cls, clip_norm):
+        cls.global_norm_var = layers.fill_constant(
+            shape=[1], dtype="float32", value=0.0)
+        cls.clip_norm_var = layers.fill_constant(
+            shape=[1], dtype="float32", value=clip_norm)
+
+    def __init__(self):
+        if not (isinstance(self.__class__.global_norm_var, Variable) and
+                isinstance(self.__class__.clip_norm_var, Variable)):
+            raise ValueError(
+                "Class 'GradientClipByGlobalNorm' has not been properly initialized. Please call GradientClipByGlobalNorm.init() first."
+            )
+
+    def process_context(self, context, param, grad):
+        local_norm_var = layers.reduce_sum(
+            x=layers.pow(x=grad, factor=2), reduce_all=True)
+        layers.sums(
+            input=[local_norm_var, self.__class__.global_norm_var],
+            out=[self.__class__.global_norm_var])
+
+    def create_operators(self, param, grad):
+        if self.__class__.ratio_var is None:
+            self.__class__.global_norm_var = layers.sqrt(
+                x=self.__class__.global_norm_var)
+            self.__class__.ratio_var = layers.elementwise_div(
+                x=self.__class__.clip_norm_var,
+                y=layers.elementwise_max(
+                    x=self.__class__.clip_norm_var,
+                    y=self.__class__.global_norm_var))
+        # elementwise_max is still missing
+        # ratio_var cannot be passed to scale_op.
+        # new_grad = layers.
+
+
 def append_gradient_clip_ops(param_grad):
     context = dict()
     create_op_callbacks = []
@@ -98,10 +139,9 @@ def append_gradient_clip_ops(param_grad):
             clip_attr = NullGradientClipAttr()
         if not isinstance(clip_attr, BaseGradientClipAttr):
             raise TypeError(
-                "clip attribute should be an instance of BaseGradientClippingAttr"
-            )
+                "clip attribute should be an instance of BaseGradientClipAttr")
 
-        clip_attr.process_context(context=context, p_g=param_grad)
+        clip_attr.process_context(context=context, param=p, grad=g)
         create_op_callbacks.append(
             functools.partial(
                 clip_attr.create_operators, param=p, grad=g))
diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py
index 884e84011d9..021b87828f3 100644
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -1,23 +1,15 @@
 from ..registry import register_layer
 
 __activations__ = [
-    'abs', 'tanh', 'sigmoid', 'relu', 'sqrt', 'ceil', 'floor', 'log', 'round'
+    'abs', 'tanh', 'sigmoid', 'relu', 'sqrt', 'ceil', 'floor', 'log', 'round',
+    'pow'
 ]
 
 __all__ = [
-    'mean',
-    'mul',
-    'reshape',
-    'scale',
-    'transpose',
-    'sigmoid_cross_entropy_with_logits',
-    'elementwise_add',
-    'elementwise_div',
-    'elementwise_sub',
-    'elementwise_mul',
-    'clip',
-    'clip_by_norm',
-    'sequence_softmax',
+    'mean', 'mul', 'reshape', 'scale', 'transpose',
+    'sigmoid_cross_entropy_with_logits', 'elementwise_add', 'elementwise_div',
+    'elementwise_sub', 'elementwise_mul', 'clip', 'clip_by_norm',
+    'sequence_softmax', 'reduce_sum'
 ] + __activations__
 
 for _OP in set(__all__):
-- 
GitLab
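
Note on the unfinished part: create_operators stops before the actual clipping step, as its trailing comments flag (the elementwise_max layer is not registered yet, and ratio_var cannot be fed to scale_op). As a point of reference only, below is a minimal NumPy sketch of the arithmetic this class appears to be working toward; the helper name clip_by_global_norm is hypothetical and is not part of the Fluid API in this patch.

    import numpy as np


    def clip_by_global_norm(grads, clip_norm):
        """Hypothetical reference helper, not Fluid API: rescale a list of
        gradient arrays so their combined L2 norm stays within clip_norm."""
        # global_norm = sqrt(sum over all gradients of sum(g ** 2)); the sum of
        # squares under this sqrt is what process_context accumulates into
        # global_norm_var, one gradient at a time.
        global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
        # ratio = clip_norm / max(clip_norm, global_norm) is at most 1, so
        # gradients are only ever scaled down, never up.
        ratio = clip_norm / max(clip_norm, global_norm)
        return [g * ratio for g in grads]

In the patch, this computation is split across the two-pass protocol of BaseGradientClipAttr: process_context is called once per (param, grad) pair to accumulate the squared norms into the shared class-level global_norm_var, and create_operators is then expected, on its first call, to take the square root, build ratio_var, and scale every gradient by it.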