Commit 994438b1 (unverified)
Authored Oct 09, 2020 by Qi Li; committed via GitHub on Oct 09, 2020.
change clip grad api, test=develop (#27767)
Parent commit: 365c2c9c
Showing 6 changed files with 121 additions and 174 deletions (+121 −174).
python/paddle/fluid/clip.py                                 +62  −96
python/paddle/fluid/framework.py                             +9   −1
python/paddle/fluid/param_attr.py                           +19  −10
python/paddle/fluid/tests/unittests/test_gradient_clip.py   +15  −51
python/paddle/nn/__init__.py                                +10  −10
python/paddle/nn/clip.py                                     +6   −6
python/paddle/fluid/clip.py  (+62 −96)

@@ -26,8 +26,8 @@ from . import name_scope
 from .dygraph import base as imperative_base

 __all__ = [
-    'set_gradient_clip', 'ErrorClipByValue', 'GradientClipByValue',
-    'GradientClipByNorm', 'GradientClipByGlobalNorm'
+    'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue',
+    'ClipGradByNorm', 'ClipGradByGlobalNorm'
 ]

@@ -115,16 +115,9 @@ def error_clip_callback(block, context):
         error_clip._append_clip_op(block, grad_n)


-class GradientClipBase(object):
-    def __init__(self, need_clip=None):
-        if need_clip is not None and not callable(need_clip):
-            raise TypeError(
-                "The type of need_clip must be funciton, and it can filter out "
-                "parameter that does't need gradient clip. This function must return "
-                "True or False, and True means that clipping is required. Please refer to "
-                "API documention of GradientClipByGlobalNorm / GradientClipByNorm "
-                "/GradientClipByValue.")
-        self._need_clip_func = need_clip
+class ClipGradBase(object):
+    def __init__(self):
+        super(ClipGradBase, self).__init__()

     def __str__(self):
         raise NotImplementedError()

@@ -144,7 +137,7 @@ class GradientClipBase(object):
                 if getattr(p, 'gradient_clip_attr', None) is not None:
                     warnings.warn(
                         "'set_gradient_clip' will be ineffective, because you have "
-                        "set 'grad_clip' in 'optimizer'. So, 'set_gradient_clip' "
+                        "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' "
                         "is redundant and you can remove it.")
                     break
             return self._static_clip(params_grads)

@@ -156,7 +149,7 @@ class GradientClipBase(object):
         raise NotImplementedError()


-class GradientClipByValue(GradientClipBase):
+class ClipGradByValue(ClipGradBase):
     """
     Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].

@@ -164,19 +157,20 @@ class GradientClipByValue(GradientClipBase):
     - Any values greater than max are set to ``max``.

-    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters
-    in ``Program`` . If ``need_clip`` is not None, then only part of gradients can be selected for gradient clipping.
+    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters
+    set in ``optimizer``. If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

     Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
     (for example: :ref:`api_paddle_optimizer_SGD`).

+    Note:
+        ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+
     Args:
         max (float): The maximum value to clip by.
         min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max``
             automatically. In this case, ``max`` must be greater than 0.
-        need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool``
-            (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None,
-            and gradients of all parameters in the network will be clipped.

     Examples:
         .. code-block:: python

@@ -184,29 +178,20 @@ class GradientClipByValue(GradientClipBase):
             import paddle

             x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-            linear = paddle.nn.Linear(10, 10)
+            linear = paddle.nn.Linear(in_features=10, out_features=10,
+                                      weight_attr=paddle.ParamAttr(need_clip=True),
+                                      bias_attr=paddle.ParamAttr(need_clip=False))
             out = linear(x)
             loss = paddle.mean(out)
             loss.backward()

-            # clip all parameters in network:
-            clip = paddle.nn.GradientClipByValue(min=-1, max=1)
-
-            # clip a part of parameters in network: (e.g. linear_0.w_0)
-            # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
-            # def fileter_func(ParamBase):
-            #     # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
-            #     return ParamBase.name == "linear_0.w_0"
-            #     # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
-            #     return ParamBase.name == linear.weight.name
-            # clip = paddle.nn.GradientClipByValue(min=-1, max=1, need_clip=fileter_func)
+            clip = paddle.nn.ClipGradByValue(min=-1, max=1)
             sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
             sdg.step()
     """

-    def __init__(self, max, min=None, need_clip=None):
-        super(GradientClipByValue, self).__init__(need_clip)
+    def __init__(self, max, min=None):
+        super(ClipGradByValue, self).__init__()
         if min is None:
             assert (max > 0.0)
             min = -max

@@ -214,7 +199,7 @@ class GradientClipByValue(GradientClipBase):
         self.min = float(min)

     def __str__(self):
-        return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max)
+        return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max)

     @imperative_base.no_grad
     def _dygraph_clip(self, params_grads):

@@ -222,7 +207,7 @@ class GradientClipByValue(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue
             new_grad = layers.clip(x=g, min=self.min, max=self.max)

@@ -236,8 +221,7 @@ class GradientClipByValue(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue

@@ -256,7 +240,7 @@ class GradientClipByValue(GradientClipBase):
         return param, new_grad


-class GradientClipByNorm(GradientClipBase):
+class ClipGradByNorm(ClipGradBase):
     """
     Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .

@@ -264,8 +248,8 @@ class GradientClipByNorm(GradientClipBase):
     - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.

-    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters
-    in ``Program`` . If ``need_clip`` is not None, then only part of gradients can be selected for gradient clipping.
+    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters
+    set in ``optimizer``. If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

     Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
     (for example: :ref:`api_paddle_optimizer_SGD`).

@@ -287,11 +271,12 @@ class GradientClipByNorm(GradientClipBase):
     .. math::
        norm(X) = ( \\sum_{i=1}^{n}|x\_i|^2)^{ \\frac{1}{2}}

+    Note:
+        ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+
     Args:
         clip_norm(float): The maximum norm value.
-        need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool``
-            (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None,
-            and gradients of all parameters in the network will be clipped.

     Examples:
         .. code-block:: python

@@ -299,29 +284,20 @@ class GradientClipByNorm(GradientClipBase):
            import paddle

            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-           linear = paddle.nn.Linear(10, 10)
+           linear = paddle.nn.Linear(in_features=10, out_features=10,
+                                     weight_attr=paddle.ParamAttr(need_clip=True),
+                                     bias_attr=paddle.ParamAttr(need_clip=False))
            out = linear(x)
            loss = paddle.mean(out)
            loss.backward()

-           # clip all parameters in network:
-           clip = paddle.nn.GradientClipByNorm(clip_norm=1.0)
-
-           # clip a part of parameters in network: (e.g. linear_0.w_0)
-           # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
-           # def fileter_func(ParamBase):
-           #     # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
-           #     return ParamBase.name == "linear_0.w_0"
-           #     # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
-           #     return ParamBase.name == linear.weight.name
-           # clip = paddle.nn.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func)
+           clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            sdg.step()
     """

-    def __init__(self, clip_norm, need_clip=None):
-        super(GradientClipByNorm, self).__init__(need_clip)
+    def __init__(self, clip_norm):
+        super(ClipGradByNorm, self).__init__()
         self.clip_norm = float(clip_norm)

     def __str__(self):

@@ -333,7 +309,7 @@ class GradientClipByNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue
             new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)

@@ -347,8 +323,7 @@ class GradientClipByNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue

@@ -367,7 +342,7 @@ class GradientClipByNorm(GradientClipBase):
         return param, new_grad


-class GradientClipByGlobalNorm(GradientClipBase):
+class ClipGradByGlobalNorm(ClipGradBase):
     """
     Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
     :math:`t\_list` , and limit it to ``clip_norm`` .

@@ -376,8 +351,8 @@ class GradientClipByGlobalNorm(GradientClipBase):
     - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.

-    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters
-    in ``Program`` . If ``need_clip`` is not None, then only part of gradients can be selected for gradient clipping.
+    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters
+    set in ``optimizer``. If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

     Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
     (for example: :ref:`api_paddle_optimizer_SGD`).

@@ -394,12 +369,13 @@ class GradientClipByGlobalNorm(GradientClipBase):
        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}

+    Note:
+        ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+
     Args:
         clip_norm (float): The maximum norm value.
-        group_name (str, optional): The group name for this clip. Default value is ``default_group``
-        need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool``
-            (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None,
-            and gradients of all parameters in the network will be clipped.
+        group_name (str, optional): The group name for this clip. Default value is ``default_group``.

     Examples:
         .. code-block:: python

@@ -407,29 +383,20 @@ class GradientClipByGlobalNorm(GradientClipBase):
            import paddle

            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-           linear = paddle.nn.Linear(10, 10)
+           linear = paddle.nn.Linear(in_features=10, out_features=10,
+                                     weight_attr=paddle.ParamAttr(need_clip=True),
+                                     bias_attr=paddle.ParamAttr(need_clip=False))
            out = linear(x)
            loss = paddle.mean(out)
            loss.backward()

-           # clip all parameters in network:
-           clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0)
-
-           # clip a part of parameters in network: (e.g. linear_0.w_0)
-           # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
-           # def fileter_func(ParamBase):
-           #     # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
-           #     return ParamBase.name == "linear_0.w_0"
-           #     # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
-           #     return ParamBase.name == linear.weight.name
-           # clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func)
+           clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            sdg.step()
     """

-    def __init__(self, clip_norm, group_name="default_group", need_clip=None):
-        super(GradientClipByGlobalNorm, self).__init__(need_clip)
+    def __init__(self, clip_norm, group_name="default_group"):
+        super(ClipGradByGlobalNorm, self).__init__()
         self.clip_norm = float(clip_norm)
         self.group_name = group_name

@@ -443,7 +410,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 continue
             merge_grad = g
             if g.type == core.VarDesc.VarType.SELECTED_ROWS:

@@ -469,7 +436,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue
             new_grad = layers.elementwise_mul(x=g, y=clip_var)

@@ -484,8 +451,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 continue
             merge_grad = g
             with p.block.program._optimized_guard([p, g]):

@@ -518,8 +484,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue

@@ -670,9 +635,9 @@ def set_gradient_clip(clip, param_list=None, program=None):
         "This method can reduce the mistakes, please "
         "refer to documention of 'optimizer'.")

-    if not isinstance(clip, GradientClipBase):
+    if not isinstance(clip, ClipGradBase):
         raise TypeError(
-            "'clip' should be an instance of GradientClipBase's derived class")
+            "'clip' should be an instance of ClipGradBase's derived class")
     if program is None:
         program = framework.default_main_program()

@@ -708,7 +673,7 @@ def append_gradient_clip_ops(param_grads):
         clip_attr = getattr(p, 'gradient_clip_attr', None)
         if clip_attr is None:
             return param_grads
-        if not isinstance(clip_attr, GradientClipBase):
+        if not isinstance(clip_attr, ClipGradBase):
             raise TypeError(
                 "clip attribute should be an instance of GradientClipBase")

@@ -754,6 +719,7 @@ def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
                 op._set_attr('op_role_var', correct_p_g)


-ClipByValue = GradientClipByValue
-ClipByNorm = GradientClipByNorm
-ClipByGlobalNorm = GradientClipByGlobalNorm
+GradientClipBase = ClipGradBase
+GradientClipByValue = ClipGradByValue
+GradientClipByNorm = ClipGradByNorm
+GradientClipByGlobalNorm = ClipGradByGlobalNorm
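Taken together, the clip.py changes rename the gradient-clip classes and replace the per-clip need_clip callable with a per-parameter flag. A minimal usage sketch consistent with the updated docstrings (dygraph mode assumed; layer sizes and variable names are illustrative only):

import paddle

# Parameters opt in or out of clipping through ParamAttr(need_clip=...),
# instead of passing a filter function to the clip object.
linear = paddle.nn.Linear(
    in_features=10, out_features=10,
    weight_attr=paddle.ParamAttr(need_clip=True),   # weight gradient gets clipped
    bias_attr=paddle.ParamAttr(need_clip=False))    # bias gradient passes through

x = paddle.uniform([4, 10], min=-1.0, max=1.0, dtype='float32')
loss = paddle.mean(linear(x))
loss.backward()

# The clip object now only carries the clipping bounds.
clip = paddle.nn.ClipGradByValue(min=-1.0, max=1.0)
sgd = paddle.optimizer.SGD(learning_rate=0.1,
                           parameters=linear.parameters(),
                           grad_clip=clip)
sgd.step()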
python/paddle/fluid/framework.py  (+9 −1)

@@ -5123,6 +5123,8 @@ class Parameter(Variable):
             be applied on the parameter. Default: None
         do_model_average(bool): True if the model average strategy will
             be applied on this parameter.
+        need_clip (bool): Whether the parameter gradient need to be cliped
+            in optimizer. Default is True.
     """

     def __init__(self,

@@ -5162,6 +5164,8 @@ class Parameter(Variable):
         self.do_model_average = kwargs.get('do_model_average', None)

+        self.need_clip = kwargs.get('need_clip', True)
+
         self.is_distributed = False

     def __str__(self):

@@ -5194,7 +5198,7 @@ class Parameter(Variable):
         if with_details:
             res_str = Variable.to_string(self, throw_on_error, True)
             additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "do_model_average")
+                               "do_model_average", "need_clip")
             for attr_name in additional_attr:
                 res_str += "%s: %s\n" % (attr_name,
                                          cpt.to_text(getattr(self, attr_name)))

@@ -5226,6 +5230,8 @@ class ParamBase(core.VarBase):
             be applied on the ParamBase. Default: None
         do_model_average(bool): True if the model average strategy will
             be applied on this ParamBase.
+        need_clip (bool): Whether the parameter gradient need to be cliped
+            in optimizer. Default is True.
     """

     @dygraph_only

@@ -5265,6 +5271,8 @@ class ParamBase(core.VarBase):
         self.do_model_average = kwargs.get('do_model_average', None)

+        self.need_clip = kwargs.get('need_clip', True)
+
         self.is_distributed = False

         # self.block = default_main_program().global_block()
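The framework.py change is what makes the per-parameter flag visible to the clip classes: both Parameter and ParamBase now carry need_clip (default True). A simplified sketch of the selection logic the rewritten _dygraph_clip/_static_clip methods apply, as an illustration rather than the exact implementation:

def split_by_need_clip(params_grads):
    """Split (param, grad) pairs the way the new clip classes do:
    pairs with no gradient are dropped, pairs whose parameter was created
    with need_clip=False pass through unclipped, the rest get clipped."""
    to_clip, passthrough = [], []
    for p, g in params_grads:
        if g is None:
            continue
        if getattr(p, 'need_clip', True) is False:
            passthrough.append((p, g))
        else:
            to_clip.append((p, g))
    return to_clip, passthrough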
python/paddle/fluid/param_attr.py  (+19 −10)

@@ -36,8 +36,8 @@ class ParamAttr(object):
     Note:
         ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
-        It is recommended to set ``grad_clip`` in ``optimizer`` to clip gradient.
-        There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+        There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
         :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .

     Parameters:

@@ -57,6 +57,7 @@ class ParamAttr(object):
         trainable (bool): Whether this parameter is trainable. Default True.
         do_model_average (bool): Whether this parameter should do model average
             when model average is enabled. Default False.
+        need_clip (bool): Whether the parameter gradient need to be cliped in optimizer. Default is True.

     Examples:
         .. code-block:: python

@@ -78,7 +79,8 @@ class ParamAttr(object):
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 do_model_average=True):
+                 do_model_average=True,
+                 need_clip=True):
         if sys.version_info.major == 2:
             check_type(name, "name", (str, type(None), unicode), "ParamAttr")

@@ -87,6 +89,7 @@ class ParamAttr(object):
         check_type(learning_rate, "learning_rate", (float, int), "ParamAttr")
         check_type(trainable, "trainable", (bool), "ParamAttr")
         check_type(do_model_average, "do_model_average", (bool), "ParamAttr")
+        check_type(need_clip, "need_clip", (bool), "ParamAttr")
         check_type(initializer, "initializer", (Initializer, type(None)),
                    "ParamAttr")
         check_type(regularizer, "regularizer",

@@ -101,6 +104,7 @@ class ParamAttr(object):
         self.regularizer = regularizer
         self.trainable = trainable
         self.do_model_average = do_model_average
+        self.need_clip = need_clip

     def _set_default_initializer(self, initializer):
         """

@@ -197,7 +201,8 @@ class ParamAttr(object):
             },
             'regularizer': self.regularizer,
             'trainable': self.trainable,
-            'do_model_average': self.do_model_average
+            'do_model_average': self.do_model_average,
+            'need_clip': self.need_clip
         }
         if with_initializer:
             kwargs['initializer'] = self.initializer

@@ -219,9 +224,9 @@ class WeightNormParamAttr(ParamAttr):
         <https://arxiv.org/pdf/1602.07868.pdf>`_.

     Note:
-        ``gradient_clip`` of ``WeightNormParamAttr`` HAS BEEN DEPRECATED since 2.0.
-        It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient.
-        There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
+        ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+        There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
         :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .

@@ -248,6 +253,7 @@ class WeightNormParamAttr(ParamAttr):
         trainable(bool, optional): Whether this parameter is trainable. Default True.
         do_model_average(bool, optional): Whether this parameter should do model average.
             Default False.
+        need_clip (bool, optional): Whether the parameter gradient need to be cliped in optimizer. Default is True.

     Examples:
         .. code-block:: python

@@ -267,7 +273,8 @@ class WeightNormParamAttr(ParamAttr):
                 learning_rate=1.0,
                 regularizer=paddle.regularizer.L2Decay(0.1),
                 trainable=True,
-                do_model_average=False))
+                do_model_average=False,
+                need_clip=True))
     """
     # List to record the parameters reparameterized by weight normalization.

@@ -283,12 +290,14 @@ class WeightNormParamAttr(ParamAttr):
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 do_model_average=False):
+                 do_model_average=False,
+                 need_clip=True):
         super(WeightNormParamAttr, self).__init__(
             name=name,
             initializer=initializer,
             learning_rate=learning_rate,
             regularizer=regularizer,
             trainable=trainable,
-            do_model_average=do_model_average)
+            do_model_average=do_model_average,
+            need_clip=need_clip)
         self.dim = dim
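Because ParamAttr now validates and forwards need_clip, the flag should surface directly on the created parameter. A hedged sketch (assuming the attribute is propagated to the dygraph ParamBase as the framework.py change suggests; the parameter name is illustrative):

import paddle

# need_clip defaults to True; False exempts this parameter from clipping.
w_attr = paddle.ParamAttr(name="fc_w", need_clip=False)
linear = paddle.nn.Linear(10, 10, weight_attr=w_attr)

print(linear.weight.name, linear.weight.need_clip)  # expected: fc_w False
print(linear.bias.name, linear.bias.need_clip)      # expected: <default name> True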
python/paddle/fluid/tests/unittests/test_gradient_clip.py  (+15 −51)

@@ -185,12 +185,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # invoke 'set_gradient_clip' in a wrong order
     def test_wrong_API_order(self):
         def backward_func(cost):
-            # no clip gradient
-            def fileter_func(param):
-                return param.name == "fc.w_0"
-
-            clip = fluid.clip.GradientClipByGlobalNorm(
-                clip_norm=5.0, need_clip=fileter_func)
+            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
             fluid.clip.set_gradient_clip(clip)
             sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01,
                                                 grad_clip=clip)

@@ -205,11 +200,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # if grad is None or not need clip
     def test_none_grad(self):
-        def fileter_func(param):
-            return param.name == "x"
-
-        clip = fluid.clip.GradientClipByGlobalNorm(
-            self.clip_norm, need_clip=fileter_func)
+        clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm)
         x = fluid.default_main_program().global_block().create_parameter(
             name="x", shape=[2, 3], dtype="float32")
         y = fluid.default_main_program().global_block().create_parameter(

@@ -228,11 +219,6 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # raise typeError
     def test_tpyeError(self):
-        # the type of need_clip must be an funciton
-        with self.assertRaises(TypeError):
-            clip = fluid.clip.GradientClipByGlobalNorm(
-                clip_norm=self.clip_norm, need_clip="test")
-
         # the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class
         with self.assertRaises(TypeError):
             sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1,

@@ -264,26 +250,22 @@ class TestGradientClipByNorm(TestGradientClip):
     # if grad is None or not need clip
     def test_none_grad(self):
-        def fileter_func(param):
-            return param.name == "z"
-
-        clip = fluid.clip.GradientClipByNorm(
-            self.clip_norm, need_clip=fileter_func)
+        clip = fluid.clip.GradientClipByNorm(self.clip_norm)
         x = fluid.default_main_program().global_block().create_parameter(
-            name="x", shape=[2, 3], dtype="float32")
+            name="x", shape=[2, 3], dtype="float32", need_clip=False)
         y = fluid.default_main_program().global_block().create_parameter(
-            name="y", shape=[2, 3], dtype="float32")
+            name="y", shape=[2, 3], dtype="float32", need_clip=False)
         # (x, None) should not be returned
         params_grads = [(x, None), (x, y)]
         params_grads = clip(params_grads)
         self.assertTrue(
             len(clip(params_grads)) == 1,
-            "ClipByNorm: when grad is None, it shouldn't be returned by gradient clip!"
+            "ClipGradByNorm: when grad is None, it shouldn't be returned by gradient clip!"
         )
         self.assertTrue(
             params_grads[0][1].name == 'y',
-            "ClipByNorm: grad should not be clipped when filtered out!")
+            "ClipGradByNorm: grad should not be clipped when filtered out!")


 class TestGradientClipByValue(TestGradientClip):

@@ -312,26 +294,22 @@ class TestGradientClipByValue(TestGradientClip):
     # if grad is None or not need clip
     def test_none_grad(self):
-        def fileter_func(param):
-            return param.name == "z"
-
-        clip = fluid.clip.GradientClipByValue(
-            self.max, self.min, need_clip=fileter_func)
+        clip = fluid.clip.GradientClipByValue(self.max, self.min)
         x = fluid.default_main_program().global_block().create_parameter(
-            name="x", shape=[2, 3], dtype="float32")
+            name="x", shape=[2, 3], dtype="float32", need_clip=False)
         y = fluid.default_main_program().global_block().create_parameter(
-            name="y", shape=[2, 3], dtype="float32")
+            name="y", shape=[2, 3], dtype="float32", need_clip=False)
         # (x, None) should not be returned
         params_grads = [(x, None), (x, y)]
         params_grads = clip(params_grads)
         self.assertTrue(
             len(clip(params_grads)) == 1,
-            "ClipByValue: when grad is None, it shouldn't be returned by gradient clip!"
+            "ClipGradByValue: when grad is None, it shouldn't be returned by gradient clip!"
         )
         self.assertTrue(
             params_grads[0][1].name == 'y',
-            "ClipByValue: grad should not be clipped when filtered out!")
+            "ClipGradByValue: grad should not be clipped when filtered out!")


 class TestDygraphGradientClip(unittest.TestCase):

@@ -355,13 +333,9 @@ class TestDygraphGradientClip(unittest.TestCase):
 class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
     def setUp(self):
-        # only clip gradient of x (ParamBase)
-        def fileter_func(param):
-            return param.name == "x"
-
         self.clip_norm = 0.8
         self.clip1 = fluid.clip.GradientClipByGlobalNorm(
-            clip_norm=self.clip_norm, need_clip=fileter_func)
+            clip_norm=self.clip_norm)
         self.clip2 = fluid.clip.GradientClipByGlobalNorm(
             clip_norm=self.clip_norm)

@@ -401,13 +375,8 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
 class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
     def setUp(self):
-        # only clip gradient of linear_0.w_0 (ParamBase)
-        def fileter_func(param):
-            return param.name == "linear_0.w_0"
-
         self.clip_norm = 0.8
-        self.clip = fluid.clip.GradientClipByNorm(
-            clip_norm=self.clip_norm, need_clip=fileter_func)
+        self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)

     def check_clip_result(self, loss, optimizer):
         # if grad is None

@@ -435,14 +404,9 @@ class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
 class TestDygraphGradientClipByValue(TestDygraphGradientClip):
     def setUp(self):
-        # only clip gradient of linear_0.w_0 (ParamBase)
-        def fileter_func(param):
-            return param.name == "linear_0.w_0"
-
         self.max = 0.2
         self.min = 0.1
-        self.clip = fluid.clip.GradientClipByValue(
-            max=self.max, min=self.min, need_clip=fileter_func)
+        self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)

     def check_clip_result(self, loss, optimizer):
         # if grad is None
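The updated tests exercise the same behaviour through the parameter flag instead of a filter function; a condensed version of the pattern they use (illustrative test name, same assertions as the test_none_grad cases above):

import unittest
import paddle.fluid as fluid


class TestNeedClipFlag(unittest.TestCase):
    def test_param_with_need_clip_false_is_not_clipped(self):
        clip = fluid.clip.GradientClipByNorm(clip_norm=0.5)
        block = fluid.default_main_program().global_block()
        # Parameters created with need_clip=False should pass through untouched.
        x = block.create_parameter(
            name="x", shape=[2, 3], dtype="float32", need_clip=False)
        y = block.create_parameter(
            name="y", shape=[2, 3], dtype="float32", need_clip=False)
        params_grads = clip([(x, None), (x, y)])
        self.assertEqual(len(params_grads), 1)          # (x, None) is dropped
        self.assertEqual(params_grads[0][1].name, "y")  # grad "y" left unclipped


if __name__ == "__main__":
    unittest.main()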
python/paddle/nn/__init__.py  (+10 −10)

@@ -31,9 +31,9 @@ __all__ += rnn.__all__
 __all__ += weight_norm_hook.__all__

 # TODO: define alias in nn directory
-from .clip import GradientClipByGlobalNorm  #DEFINE_ALIAS
-from .clip import GradientClipByNorm  #DEFINE_ALIAS
-from .clip import GradientClipByValue  #DEFINE_ALIAS
+from .clip import ClipGradByGlobalNorm  #DEFINE_ALIAS
+from .clip import ClipGradByNorm  #DEFINE_ALIAS
+from .clip import ClipGradByValue  #DEFINE_ALIAS
 # from .clip import set_gradient_clip  #DEFINE_ALIAS
 from .clip import clip  #DEFINE_ALIAS
 from .clip import clip_by_norm  #DEFINE_ALIAS

@@ -51,13 +51,13 @@ from .decode import beam_search_decode  #DEFINE_ALIAS
 # from .decode import dynamic_decode  #DEFINE_ALIAS
 from .decode import gather_tree  #DEFINE_ALIAS
 # from .input import Input  #DEFINE_ALIAS
-from .layer.activation import ELU
-from .layer.activation import GELU
-from .layer.activation import Tanh
-from .layer.activation import Hardshrink
-from .layer.activation import Hardtanh
-from .layer.activation import PReLU
-from .layer.activation import ReLU
+from .layer.activation import ELU  #DEFINE_ALIAS
+from .layer.activation import GELU  #DEFINE_ALIAS
+from .layer.activation import Tanh  #DEFINE_ALIAS
+from .layer.activation import Hardshrink  #DEFINE_ALIAS
+from .layer.activation import Hardtanh  #DEFINE_ALIAS
+from .layer.activation import PReLU  #DEFINE_ALIAS
+from .layer.activation import ReLU  #DEFINE_ALIAS
 from .layer.activation import ReLU6  #DEFINE_ALIAS
 from .layer.activation import SELU  #DEFINE_ALIAS
 from .layer.activation import LeakyReLU  #DEFINE_ALIAS
python/paddle/nn/clip.py  (+6 −6)

@@ -13,18 +13,18 @@
 # limitations under the License.

 # TODO: define the functions to clip gradient of parameter
-from ..fluid.clip import GradientClipByGlobalNorm  #DEFINE_ALIAS
-from ..fluid.clip import GradientClipByNorm  #DEFINE_ALIAS
-from ..fluid.clip import GradientClipByValue  #DEFINE_ALIAS
+from ..fluid.clip import ClipGradByGlobalNorm  #DEFINE_ALIAS
+from ..fluid.clip import ClipGradByNorm  #DEFINE_ALIAS
+from ..fluid.clip import ClipGradByValue  #DEFINE_ALIAS
 from ..fluid.layers import clip  #DEFINE_ALIAS
 from ..fluid.layers import clip_by_norm  #DEFINE_ALIAS

 __all__ = [
     #       'ErrorClipByValue',
-    'GradientClipByGlobalNorm',
-    'GradientClipByNorm',
-    'GradientClipByValue',
+    'ClipGradByGlobalNorm',
+    'ClipGradByNorm',
+    'ClipGradByValue',
     #       'set_gradient_clip',
     'clip',
     'clip_by_norm'
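Since fluid.clip keeps the old GradientClip* names as aliases of the renamed classes, both spellings should resolve to the same objects after this commit; a quick sanity check (a sketch, relying on the aliasing shown at the bottom of the clip.py diff):

import paddle
import paddle.fluid as fluid

# paddle.nn re-exports the fluid classes, and the old names remain as aliases.
assert paddle.nn.ClipGradByValue is fluid.clip.ClipGradByValue
assert fluid.clip.GradientClipByValue is fluid.clip.ClipGradByValue
assert issubclass(paddle.nn.ClipGradByGlobalNorm, fluid.clip.ClipGradBase)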