Unverified commit 7fda333a, authored by Zhou Wei, committed by GitHub

Add a new gradient_clip method that is easier to use, test=develop (#23224)

Parent: b7b0b359
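For orientation, here is a minimal sketch of the usage pattern this commit introduces: the clip classes in fluid.clip gain an optional need_clip filter, and the clip object is passed to optimizer.minimize(loss, grad_clip=...) instead of being attached to parameters through set_gradient_clip. The layer sizes, data shapes, and the parameter-name predicate below are illustrative assumptions, not taken from this diff.

import paddle.fluid as fluid

# Small static-graph network (shapes are arbitrary, for illustration only).
image = fluid.data(name='x', shape=[-1, 784], dtype='float32')
label = fluid.data(name='y', shape=[-1, 1], dtype='int64')
hidden = fluid.layers.fc(input=image, size=32, act='relu')
predict = fluid.layers.fc(input=hidden, size=10, act='softmax')
loss = fluid.layers.mean(fluid.layers.cross_entropy(input=predict, label=label))

# Hypothetical filter: only gradients of parameters accepted here are clipped.
def need_clip(param):
    return param.name.startswith('fc')

clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=need_clip)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
# New in this commit: hand the clip object directly to minimize.
sgd.minimize(loss, grad_clip=clip)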
......@@ -75,7 +75,6 @@ from .transpiler import DistributeTranspiler, \
memory_optimize, release_memory, DistributeTranspilerConfig
from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
from . import clip
from . import dygraph_grad_clip
from . import profiler
from . import unique_name
from . import parallel_executor
......@@ -122,7 +121,6 @@ __all__ = framework.__all__ + executor.__all__ + \
'WeightNormParamAttr',
'DataFeeder',
'clip',
'dygraph_grad_clip',
'profiler',
'unique_name',
'Scope',
......
......@@ -16,19 +16,18 @@ from __future__ import print_function
import copy
import six
import warnings
import functools
from . import layers
from . import framework
from . import core
from . import name_scope
from .dygraph import base as imperative_base
__all__ = [
'set_gradient_clip',
'ErrorClipByValue',
'GradientClipByValue',
'GradientClipByNorm',
'GradientClipByGlobalNorm',
'set_gradient_clip', 'ErrorClipByValue', 'GradientClipByValue',
'GradientClipByNorm', 'GradientClipByGlobalNorm'
]
......@@ -116,29 +115,51 @@ def error_clip_callback(block, context):
error_clip._append_clip_op(block, grad_n)
class BaseGradientClipAttr(object):
class GradientClipBase(object):
def __init__(self, need_clip=None):
if need_clip is not None and not callable(need_clip):
raise TypeError(
"The type of need_clip must be funciton, and it can filter out "
"parameter that does't need gradient clip. This function must return "
"True or False, and True means that clipping is required. Please refer to "
"API documention of GradientClipByGlobalNorm / GradientClipByNorm "
"/GradientClipByValue.")
self._need_clip_func = need_clip
def __str__(self):
raise NotImplementedError()
def _process_context(self, context, param, grad):
raise NotImplementedError()
def _create_operators(self, param, grad):
raise NotImplementedError()
@imperative_base.no_grad
def _dygraph_clip(self, params_grads):
raise NotImplementedError
def _static_clip(self, params_grads):
raise NotImplementedError
class NullGradientClipAttr(BaseGradientClipAttr):
def __str__(self):
return "Null"
def __call__(self, params_grads):
assert len(
params_grads
) > 0, "The number of trainable parameters should be greater than 0."
if framework.in_dygraph_mode():
return self._dygraph_clip(params_grads)
else:
for p, g in params_grads:
if getattr(p, 'gradient_clip_attr', None) is not None:
warnings.warn(
"'set_gradient_clip' will be ineffective, because you have "
"pass 'grad_clip' into 'minimize'. So, 'set_gradient_clip' "
"is redundant and you can remove it.")
break
return self._static_clip(params_grads)
def _process_context(self, context, param, grad):
pass
raise NotImplementedError()
def _create_operators(self, param, grad):
return param, grad
raise NotImplementedError()
class GradientClipByValue(BaseGradientClipAttr):
class GradientClipByValue(GradientClipBase):
"""
Clips gradient values to the range [min, max].
......@@ -168,17 +189,46 @@ class GradientClipByValue(BaseGradientClipAttr):
input=x, size=1, param_attr=w_param_attrs)
"""
def __init__(self, max, min=None):
max = float(max)
def __init__(self, max, min=None, need_clip=None):
super(GradientClipByValue, self).__init__(need_clip)
if min is None:
assert (max > 0.0)
min = -max
else:
min = float(min)
self.max = max
self.min = min
self.max = float(max)
self.min = float(min)
def __str__(self):
return "ByValue, min=%f, max=%f" % (self.min, self.max)
return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max)
@imperative_base.no_grad
def _dygraph_clip(self, params_grads):
params_and_grads = []
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(p):
params_and_grads.append((p, g))
continue
new_grad = layers.clip(x=g, min=self.min, max=self.max)
params_and_grads.append((p, new_grad))
return params_and_grads
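As a supplement to the eager-mode implementation above, a hedged dygraph sketch of how the new need_clip argument is meant to be used end to end; the Linear layer, input shape, and the parameter name "linear_0.w_0" are illustrative assumptions rather than part of this diff.

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    linear = fluid.dygraph.Linear(10, 10)
    data = np.random.uniform(-1, 1, (4, 10)).astype('float32')
    loss = fluid.layers.reduce_mean(linear(fluid.dygraph.to_variable(data)))
    loss.backward()

    # Hypothetical filter: clip the weight gradient only, leave the bias as is.
    def need_clip(param):
        return param.name == "linear_0.w_0"

    clip = fluid.clip.GradientClipByValue(max=0.1, min=-0.1, need_clip=need_clip)
    sgd = fluid.optimizer.SGD(learning_rate=0.01,
                              parameter_list=linear.parameters())
    sgd.minimize(loss, grad_clip=clip)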
def _static_clip(self, params_grads):
params_and_grads = []
with framework.name_scope('gradient_clip'):
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(
p):
params_and_grads.append((p, g))
continue
with p.block.program._optimized_guard([p, g]):
new_grad = layers.clip(x=g, min=self.min, max=self.max)
params_and_grads.append((p, new_grad))
_correct_clip_op_role_var(params_and_grads)
return params_and_grads
def _process_context(self, context, param, grad):
pass
......@@ -188,7 +238,7 @@ class GradientClipByValue(BaseGradientClipAttr):
return param, new_grad
class GradientClipByNorm(BaseGradientClipAttr):
class GradientClipByNorm(GradientClipBase):
"""
Convert the input multidimensional Tensor :math:`X` to a multidimensional Tensor whose L2 norm does not exceed the given two-norm maximum ( :math:`clip\_norm` ).
......@@ -268,11 +318,42 @@ class GradientClipByNorm(BaseGradientClipAttr):
"""
def __init__(self, clip_norm):
self.clip_norm = clip_norm
def __init__(self, clip_norm, need_clip=None):
super(GradientClipByNorm, self).__init__(need_clip)
self.clip_norm = float(clip_norm)
def __str__(self):
return "ByNorm, clip_norm=%f" % self.clip_norm
return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm
@imperative_base.no_grad
def _dygraph_clip(self, params_grads):
params_and_grads = []
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(p):
params_and_grads.append((p, g))
continue
new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
params_and_grads.append((p, new_grad))
return params_and_grads
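The per-tensor rule that GradientClipByNorm applies can be sanity-checked with plain numpy; this is only a sketch of the math (Out = X * clip_norm / max(clip_norm, ||X||_2)), mirroring what the tests further down verify.

import numpy as np

def clip_by_norm(grad, clip_norm):
    # Out = grad * clip_norm / max(clip_norm, ||grad||_2)
    norm = np.sqrt(np.sum(np.square(grad)))
    scale = clip_norm / np.maximum(clip_norm, norm)
    return grad * scale

g = np.random.uniform(-1.0, 1.0, (3, 4)).astype('float32')
clipped = clip_by_norm(g, clip_norm=0.2)
assert np.sqrt(np.sum(np.square(clipped))) <= 0.2 + 1e-6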
def _static_clip(self, params_grads):
params_and_grads = []
with framework.name_scope('gradient_clip'):
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(
p):
params_and_grads.append((p, g))
continue
with p.block.program._optimized_guard([p, g]):
new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
params_and_grads.append((p, new_grad))
_correct_clip_op_role_var(params_and_grads)
return params_and_grads
def _process_context(self, context, param, grad):
pass
......@@ -282,7 +363,7 @@ class GradientClipByNorm(BaseGradientClipAttr):
return param, new_grad
class GradientClipByGlobalNorm(BaseGradientClipAttr):
class GradientClipByGlobalNorm(GradientClipBase):
"""
Clips values of multiple tensors by the ratio of the sum of their norms.
......@@ -371,16 +452,104 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
"""
def __init__(self, clip_norm, group_name="default_group"):
if not isinstance(group_name, six.string_types):
raise TypeError("'group_name' must be a %s." % (six.string_types))
self.clip_norm = clip_norm
def __init__(self, clip_norm, group_name="default_group", need_clip=None):
super(GradientClipByGlobalNorm, self).__init__(need_clip)
self.clip_norm = float(clip_norm)
self.group_name = group_name
def __str__(self):
return "ByGlobalNorm, group_name=%s, clip_norm=%f" % (self.group_name,
self.clip_norm)
return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm)
@imperative_base.no_grad
def _dygraph_clip(self, params_grads):
params_and_grads = []
sum_square_list = []
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(p):
continue
merge_grad = g
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.merge_selected_rows(g)
merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
square = layers.square(merge_grad)
sum_square = layers.reduce_sum(square)
sum_square_list.append(sum_square)
# all parameters have been filtered out
if len(sum_square_list) == 0:
return params_grads
global_norm_var = layers.concat(sum_square_list)
global_norm_var = layers.reduce_sum(global_norm_var)
global_norm_var = layers.sqrt(global_norm_var)
max_global_norm = layers.fill_constant(
shape=[1], dtype='float32', value=self.clip_norm)
clip_var = layers.elementwise_div(
x=max_global_norm,
y=layers.elementwise_max(
x=global_norm_var, y=max_global_norm))
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(p):
params_and_grads.append((p, g))
continue
new_grad = layers.elementwise_mul(x=g, y=clip_var)
params_and_grads.append((p, new_grad))
return params_and_grads
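The scaling computed above can likewise be checked with plain numpy; a sketch only, matching the formula in the docstring and the assertions in the tests: every gradient is scaled by clip_norm / max(global_norm, clip_norm), where global_norm is the L2 norm over all gradients together.

import numpy as np

def clip_by_global_norm(grads, clip_norm):
    # global_norm = sqrt(sum_i ||g_i||_2^2); after scaling, the global norm
    # of the clipped gradients never exceeds clip_norm.
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    scale = clip_norm / np.maximum(global_norm, clip_norm)
    return [g * scale for g in grads]

grads = [np.random.uniform(-1.0, 1.0, (3, 3)) for _ in range(4)]
clipped = clip_by_global_norm(grads, clip_norm=1.0)
assert np.sqrt(sum(np.sum(np.square(g)) for g in clipped)) <= 1.0 + 1e-6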
def _static_clip(self, params_grads):
params_and_grads = []
sum_square_list = []
with framework.name_scope('gradient_clip'):
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(
p):
continue
merge_grad = g
with p.block.program._optimized_guard([p, g]):
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.merge_selected_rows(g)
merge_grad = layers.get_tensor_from_selected_rows(
merge_grad)
square = layers.square(merge_grad)
sum_square = layers.reduce_sum(input=square)
sum_square_list.append(sum_square)
# all parameters have been filtered out
if len(sum_square_list) == 0:
return params_grads
with p.block.program._optimized_guard([p, g]):
global_norm_var = layers.sums(sum_square_list)
global_norm_var = layers.sqrt(x=global_norm_var)
max_global_norm = layers.fill_constant(
shape=[1], dtype="float32", value=self.clip_norm)
scale_var = layers.elementwise_div(
x=max_global_norm,
y=layers.elementwise_max(
x=max_global_norm, y=global_norm_var))
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(
p):
params_and_grads.append((p, g))
continue
with p.block.program._optimized_guard([p, g]):
new_grad = layers.elementwise_mul(x=g, y=scale_var)
params_and_grads.append((p, new_grad))
_correct_clip_op_role_var(params_and_grads)
return params_and_grads
def _process_context(self, context, param, grad):
if self.group_name not in context:
......@@ -486,12 +655,28 @@ def set_gradient_clip(clip, param_list=None, program=None):
sgd = fluid.optimizer.SGD(learning_rate=1e-3)
sgd.minimize(loss)
"""
if not isinstance(clip, BaseGradientClipAttr):
warnings.warn("Caution! 'set_gradient_clip' is not recommended "
"and may be deprecated in future! "
"We recommend a new strategy: clip gradient by "
"'optimizer.minimize(loss, grad_clip=clip)'. "
"This method can reduce the mistakes, please "
"see documention of 'optimzier.minimize'.")
if not isinstance(clip, GradientClipBase):
raise TypeError(
"'clip' should be an instance of BaseGradientClipAttr's derived class"
)
"'clip' should be an instance of GradientClipBase's derived class")
if program is None:
program = framework.default_main_program()
for op in program.block(0).ops:
if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr(
"op_namescope"):
warnings.warn(
"'minimize' has been invoked before, this will make 'set_gradient_clip' "
"be ineffective! Please invoke 'set_gradient_clip' before 'minimize'."
)
break
if param_list is None:
param_list = program.block(0).all_parameters()
if all(isinstance(elem, six.string_types) for elem in param_list):
......@@ -511,46 +696,45 @@ def append_gradient_clip_ops(param_grads):
if g is None:
continue
with p.block.program._optimized_guard(
[p, g]), framework.name_scope('append_clip_@CLIP'):
clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
[p, g]), framework.name_scope('gradient_clip_@CLIP'):
clip_attr = getattr(p, 'gradient_clip_attr', None)
if clip_attr is None:
clip_attr = NullGradientClipAttr()
if not isinstance(clip_attr, BaseGradientClipAttr):
return param_grads
if not isinstance(clip_attr, GradientClipBase):
raise TypeError(
"clip attribute should be an instance of BaseGradientClipAttr"
)
"clip attribute should be an instance of GradientClipBase")
clip_attr._process_context(context=context, param=p, grad=g)
res = []
param_new_grad_dict = dict()
for p, g in param_grads:
if g is None:
continue
with p.block.program._optimized_guard(
[p, g]), framework.name_scope('append_graident_clip_@CLIP'):
[p, g]), framework.name_scope('gradient_clip_@CLIP'):
param, new_grad = clip_attr._create_operators(param=p, grad=g)
param_new_grad_dict[param.name] = new_grad.name
res.append([param, new_grad])
# change wrong mapping relation between param & grad in clip op
clip_flag = '@CLIP'
block_id_list = []
for p, g in param_grads:
if g is None:
continue
block_id = p.block.idx
if block_id in block_id_list:
_correct_clip_op_role_var(res)
return res
# correct the wrong mapping relation between param & grad in clip ops
def _correct_clip_op_role_var(params_grads):
for param, grad in params_grads:
if grad is None:
continue
block_id_list.append(block_id)
for op in p.block.program.global_block().ops:
if 'op_namescope' in op.all_attrs() and clip_flag in op.attr(
for op in param.block.program.global_block().ops:
if 'op_namescope' in op.all_attrs() and "gradient_clip" in op.attr(
"op_namescope"):
if op.attr('op_role_var'):
param_name = op.attr('op_role_var')[0]
correct_p_g = [param_name, param_new_grad_dict[param_name]]
index = 0
for i in range(len(params_grads)):
if params_grads[i][0].name == param_name:
index = i
correct_p_g = [param_name, params_grads[index][1].name]
op._set_attr('op_role_var', correct_p_g)
return res
ClipByValue = GradientClipByValue
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import copy
import six
import functools
from . import layers
from . import framework
from . import core
from .dygraph import base as imperative_base
__all__ = [
'GradClipByValue',
'GradClipByNorm',
'GradClipByGlobalNorm',
]
class GradClipBase(object):
def __str__(self):
raise NotImplementedError()
def _clip(self, para_and_grad):
raise NotImplementedError
@imperative_base.no_grad
def __call__(self, para_and_grad):
return self._clip(para_and_grad)
class GradClipByValue(GradClipBase):
"""
Clips gradient values to the range [min_value, max_value].
Given a gradient g, this operation clips its value to min_value and max_value.
- Any values less than min_value are set to min_value.
- Any values greater than max_value are set to max_value.
Args:
max_value (float): The maximum value to clip by.
min (float, optional): The minimum value to clip by. if not set by user, \
will be set to -max_value(max_value MUST be positive) by framework.
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
from paddle.fluid.optimizer import SGDOptimizer
with fluid.dygraph.guard():
value_clip = GradClipByValue( -1.0, 1.0 )
sgd = SGDOptimizer(learning_rate=1.0)
init_value = np.random.uniform( -1, 1, (10, 10)).astype('float32')
linear = Linear( 10, 10)
out = linear( to_variable(init_value) )
loss = fluid.layers.reduce_mean( out )
loss.backward()
sgd.minimize(loss, grad_clip = value_clip)
"""
@imperative_base.no_grad
def __init__(self, min_value, max_value=None):
if min_value is None:
assert (max_value > 0.0)
min_value = -max_value
else:
min_value = float(min_value)
self.max_value = max_value
self.min_value = min_value
def __str__(self):
return "ClipByValue, min = %f, max=%f" % (self.min_value,
self.max_value)
def _clip(self, para_and_grad):
out = []
for p, g in para_and_grad:
if g is None:
out.append((p, g))
continue
new_grad = layers.clip(x=g, min=self.min_value, max=self.max_value)
out.append((p, new_grad))
return out
class GradClipByNorm(GradClipBase):
"""
Clips tensor values to a maximum L2-norm.
This operator limits the L2 norm of the input :math:`X` within :math:`max\_norm`.
If the L2 norm of :math:`X` is less than or equal to :math:`max\_norm`, :math:`Out`
will be the same as :math:`X`. If the L2 norm of :math:`X` is greater than
:math:`max\_norm`, :math:`X` will be linearly scaled to make the L2 norm of
:math:`Out` equal to :math:`max\_norm`, as shown in the following formula:
.. math::
Out = \\frac{max\_norm * X}{norm(X)},
where :math:`norm(X)` represents the L2 norm of :math:`X`.
Args:
clip_norm (float): The maximum norm value
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
from paddle.fluid.optimizer import SGDOptimizer
with fluid.dygraph.guard():
norm_clip = GradClipByNorm( 5.0 )
sgd = SGDOptimizer(learning_rate=1.0)
init_value = np.random.uniform( -1, 1, (10, 10)).astype('float32')
linear = Linear( 10, 10)
out = linear( to_variable(init_value) )
loss = fluid.layers.reduce_mean( out )
loss.backward()
sgd.minimize(loss, grad_clip = norm_clip)
"""
@imperative_base.no_grad
def __init__(self, clip_norm):
self.clip_norm = clip_norm
def __str__(self):
return "ClipByNorm, clip_norm=%f" % self.clip_norm
def _clip(self, para_and_grad):
out = []
for p, g in para_and_grad:
if g is None:
out.append((p, g))
continue
new_g = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
out.append((p, new_g))
return out
class GradClipByGlobalNorm(GradClipBase):
"""
Clips values of multiple tensors by the ratio of the sum of their norms.
Given a list of tensors t_list, and a clipping ratio max_global_norm, this
operation returns a list of clipped tensors list_clipped.
To perform the clipping, the values :math:`t\_list[i]` are set to:
.. math::
t\_list[i] = t\_list[i] * \\frac{max\_global\_norm}{\max(global\_norm, max\_global\_norm)}
where:
.. math::
global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
If :math:`max\_global\_norm > global\_norm` then the entries in t_list remain as they are,
otherwise they're all shrunk by the global ratio.
Args:
max_global_norm (float): The maximum norm value.
dtype (str, optional): The type of max_global_norm. Default: "float32".
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.dygraph_grad_clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
from paddle.fluid.optimizer import SGDOptimizer
with fluid.dygraph.guard():
gloabl_norm_clip = GradClipByGlobalNorm( 5.0 )
sgd = SGDOptimizer(learning_rate=1.0)
init_value = np.random.uniform( -1, 1, (10, 10)).astype('float32')
linear = Linear( 10, 10)
out = linear( to_variable(init_value) )
loss = fluid.layers.reduce_mean( out )
loss.backward()
sgd.minimize(loss, grad_clip = gloabl_norm_clip)
"""
@imperative_base.no_grad
def __init__(self, max_global_norm, dtype='float32'):
self.max_global_norm = layers.fill_constant(
shape=[1], dtype=dtype, value=max_global_norm)
def __str__(self):
return "ClipByGlobalNorm, max_global_norm=%f" % (self.max_global_norm)
def _clip(self, para_and_grad):
out = []
norm_arr = []
for p, g in para_and_grad:
if g is None:
continue
merge_grad = g
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.merge_selected_rows(g)
merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
power = layers.square(merge_grad)
sum_t = layers.reduce_sum(power)
norm_arr.append(sum_t)
norm_global = layers.concat(norm_arr)
norm_global = layers.reduce_sum(norm_global)
norm_global = layers.sqrt(norm_global)
clip_scale = self.max_global_norm / (layers.elementwise_max(
x=norm_global, y=self.max_global_norm))
for p, g in para_and_grad:
if g is None:
out.append((p, g))
continue
new_grad = g * clip_scale
out.append((p, new_grad))
return out
......@@ -2409,7 +2409,6 @@ class Block(object):
trainable = v.trainable
optimize_attr = v.optimize_attr
regularizer = v.regularizer
gradient_clip_attr = v.gradient_clip_attr
error_clip = v.error_clip
elif type(v) == Variable:
var_type = "Variable"
......@@ -2432,7 +2431,6 @@ class Block(object):
trainable=trainable,
optimize_attr=optimize_attr,
regularizer=regularizer,
gradient_clip_attr=gradient_clip_attr,
error_clip=error_clip)
else:
var = Parameter(
......@@ -2445,7 +2443,6 @@ class Block(object):
trainable=trainable,
optimize_attr=optimize_attr,
regularizer=regularizer,
gradient_clip_attr=gradient_clip_attr,
error_clip=error_clip)
elif var_type == "Variable":
var = Variable(
......@@ -2723,7 +2720,6 @@ class Block(object):
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name)
else:
......@@ -2737,7 +2733,6 @@ class Block(object):
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name)
self.vars[new_p.name] = new_p
......@@ -4646,8 +4641,6 @@ class Parameter(Variable):
Default: {'learning_rate': 1.0}
regularizer(WeightDecayRegularizer): The Regularizer which will
be applied on the parameter. Default: None
gradient_clip_attr(BaseGradientClipAttr): The gradient clip strategy
which will be applied on the parameter. Default: None
do_model_average(bool): True if the model average strategy will
be applied on this parameter.
"""
......@@ -4687,8 +4680,6 @@ class Parameter(Variable):
self.regularizer = kwargs.get('regularizer', None)
self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
self.do_model_average = kwargs.get('do_model_average', None)
self.is_distributed = False
......@@ -4723,7 +4714,7 @@ class Parameter(Variable):
if with_details:
res_str = Variable.to_string(self, throw_on_error, True)
additional_attr = ("trainable", "optimize_attr", "regularizer",
"gradient_clip_attr", "do_model_average")
"do_model_average")
for attr_name in additional_attr:
res_str += "%s: %s\n" % (attr_name,
cpt.to_text(getattr(self, attr_name)))
......@@ -4752,8 +4743,6 @@ class ParamBase(core.VarBase):
Default: {'learning_rate': 1.0}
regularizer(WeightDecayRegularizer): The Regularizer which will
be applied on the ParamBase. Default: None
gradient_clip_attr(BaseGradientClipAttr): The gradient clip strategy
which will be applied on the ParamBase. Default: None
do_model_average(bool): True if the model average strategy will
be applied on this ParamBase.
"""
......@@ -4792,8 +4781,6 @@ class ParamBase(core.VarBase):
self.regularizer = kwargs.get('regularizer', None)
self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
self.do_model_average = kwargs.get('do_model_average', None)
self.is_distributed = False
......
......@@ -24,7 +24,7 @@ from . import framework
from . import layers
from . import unique_name
from .backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
from .clip import append_gradient_clip_ops, error_clip_callback
from .clip import GradientClipBase, error_clip_callback, append_gradient_clip_ops
from .framework import program_guard
from .initializer import Constant
from .layer_helper import LayerHelper
......@@ -109,6 +109,8 @@ class Optimizer(object):
self._opti_name_list = []
self._accumulators_holder = {}
self._param_device_map = dict()
# if grad_clip is passed into minimize, this will not be None
self._grad_clip = None
@framework.dygraph_only
def state_dict(self):
......@@ -690,12 +692,17 @@ class Optimizer(object):
# ...
optimizer.apply_gradients(params_grads)
"""
params_grads = sorted(params_grads, key=lambda x: x[0].name)
params_grads, table_param_and_grad, table_optimize_op = \
self._process_distribute_lookuptable(params_grads)
params_grads = append_gradient_clip_ops(params_grads)
# 'minimize(grad_clip)' or 'set_gradient_clip'
if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads)
else:
params_grads = append_gradient_clip_ops(params_grads)
# Add regularization if any
params_grads = append_regularization_ops(params_grads,
......@@ -712,19 +719,19 @@ class Optimizer(object):
"""
Second part of `minimize`, appending optimization operators for
given `params_grads` pairs.
Args:
loss (Variable): loss variable to run optimizations.
startup_program (Program): startup_program for initializing parameters
in `parameter_list`.
params_grads (list): list of (param, grad) pair to do optimization.
Returns:
list: A list of operators appended to the current program.
"""
if framework.in_dygraph_mode():
with program_guard(framework.default_main_program(),
framework.default_startup_program()):
if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads)
params_grads = append_regularization_ops(params_grads,
self.regularization)
optimize_ops = self._create_optimization_pass(params_grads)
......@@ -809,16 +816,19 @@ class Optimizer(object):
Please refer to the example of current Optimizer.
"""
assert isinstance(loss, Variable), "The loss should be an Variable."
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
raise TypeError(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
self._grad_clip = grad_clip
params_grads = self.backward(
loss,
startup_program=startup_program,
parameter_list=parameter_list,
no_grad_set=no_grad_set)
if grad_clip is not None and framework.in_dygraph_mode():
# TODO(hongyu): FIX later, this is only for dygraph, should be work for static mode
params_grads = grad_clip(params_grads)
optimize_ops = self.apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads)
......@@ -1148,6 +1158,7 @@ class DGCMomentumOptimizer(Optimizer):
self.regular_type, self.regular_coeff = self._get_regularization_param(
self.regularization)
self._grad_clip = None
def _get_regularization_param(self, regularization):
regular_type = 0
......@@ -1404,24 +1415,28 @@ class DGCMomentumOptimizer(Optimizer):
dgc_op._set_attr(op_maker.kOpRoleVarAttrName(),
[param_var.name, grad_var.name])
@imperative_base.no_grad
def apply_gradients(self, params_grads):
params_grads = sorted(params_grads, key=lambda x: x[0].name)
params_grads, table_param_and_grad, table_optimize_op = \
self._process_distribute_lookuptable(params_grads)
not_dgc_params_grads = []
dgc_params_grads = []
# DGC clip and regularization in optimizer.backward
for param, grad in params_grads:
if not self._is_use_dgc(param, grad):
not_dgc_params_grads.append((param, grad))
else:
dgc_params_grads.append((param, grad))
# DGC clip and regularization in local
not_dgc_params_grads = append_gradient_clip_ops(not_dgc_params_grads)
# 'minimize(grad_clip)' or 'set_gradient_clip'
if self._grad_clip is not None:
not_dgc_params_grads = self._grad_clip(not_dgc_params_grads)
else:
not_dgc_params_grads = append_gradient_clip_ops(
not_dgc_params_grads)
# Add regularization if any
not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads,
self.regularization)
......@@ -3942,16 +3957,13 @@ class RecomputeOptimizer(Optimizer):
def apply_optimize(self, loss, startup_program, params_grads):
"""
call the apply_optimize function of self._optimizer
Args:
loss (Variable): loss variable to run optimizations.
startup_program (Program): startup_program for initializing parameters
in `parameter_list`.
params_grads (list): list of (param, grad) pair to do optimization.
Examples:
.. code-block:: python
import paddle.fluid as fluid
def mlp(input_x, input_y, hid_dim=128, label_dim=2):
......@@ -3979,7 +3991,6 @@ class RecomputeOptimizer(Optimizer):
cost, startup_program=None, params_grads=params_grads)
print("Finished apply_optimize")
"""
return self._optimizer.apply_optimize(
......@@ -3991,24 +4002,24 @@ class RecomputeOptimizer(Optimizer):
parameter_list=None,
no_grad_set=None,
grad_clip=None):
assert (isinstance(loss, Variable)), "The loss should be an Variable."
assert isinstance(loss, Variable), "The loss should be an Variable."
assert (self._checkpoints is not None
), "You should call _set_checkpoints first"
if framework.in_dygraph_mode():
raise NotImplementedError(
"DyGraph current does not support recompute")
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
raise TypeError(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
self._optimizer._grad_clip = grad_clip
params_grads = self.backward(
loss,
startup_program=startup_program,
parameter_list=parameter_list,
no_grad_set=no_grad_set)
if grad_clip:
# TODO(guru4elephant): should add grad_clip for static graph
pass
optimize_ops = self.apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads)
......
......@@ -15,6 +15,7 @@
from __future__ import print_function
import six
import warnings
from .initializer import Initializer, Xavier, Constant
from .regularizer import WeightDecayRegularizer
......@@ -68,7 +69,6 @@ class ParamAttr(object):
learning_rate=1.0,
regularizer=None,
trainable=True,
gradient_clip=None,
do_model_average=True):
self.name = name
if isinstance(self.name, six.string_types) and self.name == "":
......@@ -78,7 +78,6 @@ class ParamAttr(object):
self.learning_rate = learning_rate
self.regularizer = regularizer
self.trainable = trainable
self.gradient_clip = gradient_clip
self.do_model_average = do_model_average
def _set_default_initializer(self, initializer):
......@@ -176,7 +175,6 @@ class ParamAttr(object):
},
'regularizer': self.regularizer,
'trainable': self.trainable,
'gradient_clip_attr': self.gradient_clip,
'do_model_average': self.do_model_average
}
if with_initializer:
......@@ -248,7 +246,6 @@ class WeightNormParamAttr(ParamAttr):
learning_rate=1.0,
regularizer=None,
trainable=True,
gradient_clip=None,
do_model_average=False):
super(WeightNormParamAttr, self).__init__(
name=name,
......@@ -256,6 +253,5 @@ class WeightNormParamAttr(ParamAttr):
learning_rate=learning_rate,
regularizer=regularizer,
trainable=trainable,
gradient_clip=gradient_clip,
do_model_average=do_model_average)
self.dim = dim
......@@ -476,15 +476,18 @@ class TestL2Decay(TranspilerTest):
size=1000,
act=None,
param_attr=fluid.ParamAttr(
name='fc_w',
regularizer=fluid.regularizer.L2Decay(),
gradient_clip=fluid.clip.GradientClipByValue(0.1)),
name='fc_w', regularizer=fluid.regularizer.L2Decay()),
bias_attr=fluid.ParamAttr(name='fc_b'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
sgd_optimizer.minimize(avg_cost)
def filter(param):
return param.name == "fc_w"
clip = fluid.clip.GradientClipByValue(0.1, need_clip=filter)
sgd_optimizer.minimize(avg_cost, grad_clip=clip)
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
......
......@@ -25,7 +25,7 @@ from paddle.fluid import core
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph_grad_clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
from paddle.fluid.clip import GradientClipByValue, GradientClipByNorm, GradientClipByGlobalNorm
class TestGradClipByGlobalNorm(unittest.TestCase):
......@@ -65,7 +65,7 @@ class TestGradClipByGlobalNorm(unittest.TestCase):
def get_dygrap_global_norm_result(self):
with fluid.dygraph.guard():
gloabl_norm_clip = GradClipByGlobalNorm(self.max_global_norm)
gloabl_norm_clip = GradientClipByGlobalNorm(self.max_global_norm)
p_g_var = []
for p, g in self.para_and_grad:
new_p = to_variable(p)
......@@ -135,7 +135,7 @@ class TestGradClipByNorm(unittest.TestCase):
def get_dygrap_norm_result(self):
with fluid.dygraph.guard():
norm_clip = GradClipByNorm(self.max_norm)
norm_clip = GradientClipByNorm(self.max_norm)
p_g_var = []
for p, g in self.para_and_grad:
new_p = to_variable(p)
......@@ -200,8 +200,8 @@ class TestGradClipByValue(unittest.TestCase):
def get_dygrap_clip_result(self):
with fluid.dygraph.guard():
value_clip = GradClipByValue(self.min_value, self.max_value)
value_clip = GradientClipByValue(
max=self.max_value, min=self.min_value)
p_g_var = []
for p, g in self.para_and_grad:
new_p = to_variable(p)
......@@ -225,7 +225,7 @@ class TestGradClipByValue(unittest.TestCase):
for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))
def test_clip_by_norm_2(self):
def test_clip_by_value_2(self):
self.init_value()
self.init_scale = 0.2
......@@ -236,7 +236,7 @@ class TestGradClipByValue(unittest.TestCase):
for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))
def test_clip_by_norm_3(self):
def test_clip_by_value_3(self):
self.init_value()
self.init_scale = 0.5
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
......@@ -54,26 +54,32 @@ class TestGradientClip(unittest.TestCase):
self.BATCH_SIZE = 2
reader = fake_imdb_reader(self.word_dict_len, self.BATCH_SIZE * 100)
self.train_data = paddle.batch(reader, batch_size=self.BATCH_SIZE)
self.init()
def init(self):
pass
def get_places(self):
places = [core.CPUPlace()]
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
places.append(fluid.CUDAPlace(0))
return places
def check_operators(self, place):
CLIP = 1
def clip_gradient(self, params_grads):
pass
prog = fluid.framework.Program()
startup_program = fluid.framework.Program()
def check_clip_result(self, out, out_clip):
pass
def check_gradient_clip(self, place):
prog = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(
main_program=prog, startup_program=startup_program):
image = fluid.layers.data(name='x', shape=[784], dtype='float32')
label = fluid.layers.data(name='y', shape=[1], dtype='int64')
hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
image = fluid.data(name='x', shape=[-1, 784], dtype='float32')
label = fluid.data(name='y', shape=[-1, 1], dtype='int64')
hidden = fluid.layers.fc(input=image, size=32, act='relu')
predict = fluid.layers.fc(input=hidden, size=10, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(cost)
......@@ -84,45 +90,26 @@ class TestGradientClip(unittest.TestCase):
p_g = fluid.backward.append_backward(loss=avg_cost)
p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
p_g = sorted(p_g, key=lambda x: x[0].name)
p_g_clip = sorted(p_g_clip, key=lambda x: x[0].name)
with fluid.program_guard(
main_program=prog_clip, startup_program=startup_program):
fluid.clip.set_gradient_clip(
fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP))
p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
p_g_clip = self.clip_gradient(p_g_clip)
grad_list = [elem[1] for elem in p_g]
grad_clip_list = [elem[1] for elem in p_g_clip]
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=8192),
batch_size=128)
train_reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=3)
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
exe.run(startup_program)
count = 0
for data in train_reader():
count += 1
if count > 5:
break
out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list)
out_clip = exe.run(prog_clip,
feed=feeder.feed(data),
fetch_list=grad_clip_list)
global_norm = 0
for v in out:
global_norm += np.sum(np.power(v, 2))
global_norm = np.sqrt(global_norm)
global_norm_clip = 0
for v in out_clip:
global_norm_clip += np.sum(np.power(v, 2))
global_norm_clip = np.sqrt(global_norm_clip)
assert np.isclose(
a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3)
data = next(train_reader())
out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list)
out_clip = exe.run(prog_clip,
feed=feeder.feed(data),
fetch_list=grad_clip_list)
self.check_clip_result(out, out_clip)
def check_sparse_gradient_clip(self, place):
prog = fluid.framework.Program()
......@@ -134,11 +121,7 @@ class TestGradientClip(unittest.TestCase):
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
cost = bow_net(data, label, self.word_dict_len)
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
sgd_optimizer.minimize(cost)
self.backward_and_optimize(cost)
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
......@@ -150,13 +133,345 @@ class TestGradientClip(unittest.TestCase):
print(val)
self.assertFalse(np.isnan(val))
def test_operators(self):
self.check_operators(core.CPUPlace())
def backward_and_optimize(self, cost):
pass
class TestGradientClipByGlobalNorm(TestGradientClip):
def init(self):
self.clip_norm = 0.2
def clip_gradient(self, params_grads):
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
print(clip)
return clip(params_grads)
def check_clip_result(self, out, out_clip):
global_norm = 0
for v in out:
global_norm += np.sum(np.power(v, 2))
global_norm = np.sqrt(global_norm)
scale = self.clip_norm / np.maximum(self.clip_norm, global_norm)
res = []
for i in range(len(out)):
out[i] = scale * out[i]
for u, v in zip(out, out_clip):
self.assertTrue(
np.allclose(
a=u, b=v, rtol=1e-5, atol=1e-8),
"gradient clip by global norm has wrong results!")
# test whether the output is right when using 'set_gradient_clip'
def test_old_gradient_clip(self):
def func(params_grads):
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
fluid.clip.set_gradient_clip(clip)
return fluid.clip.append_gradient_clip_ops(params_grads)
self.clip_gradient = func
self.check_gradient_clip(fluid.CPUPlace())
# test whether the output is right when using 'minimize(grad_clip)'
def test_new_gradient_clip(self):
def func(params_grads):
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
print(clip)
return clip(params_grads)
def test_sparse_gradient_clip(self):
self.clip_gradient = func
self.check_gradient_clip(fluid.CPUPlace())
# invoke 'set_gradient_clip' in a wrong order
def test_wrong_API_order(self):
def backward_func(cost):
# no gradient will be clipped by this filter
def filter_func(param):
return param.name == "fc.w_0"
clip = fluid.clip.GradientClipByGlobalNorm(
clip_norm=5.0, need_clip=filter_func)
fluid.clip.set_gradient_clip(clip)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
# if 'set_gradient_clip' and 'minimize(grad_clip)' are used together, 'set_gradient_clip' will be ineffective
sgd_optimizer.minimize(cost, grad_clip=clip)
# 'set_gradient_clip' must come before 'minimize', otherwise 'set_gradient_clip' will be ineffective
fluid.clip.set_gradient_clip(clip)
self.backward_and_optimize = backward_func
for place in self.get_places():
self.check_sparse_gradient_clip(place)
# if grad is None or does not need clipping
def test_none_grad(self):
def filter_func(param):
return param.name == "x"
clip = fluid.clip.GradientClipByGlobalNorm(
self.clip_norm, need_clip=filter_func)
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
y = fluid.default_main_program().global_block().create_parameter(
name="y", shape=[2, 3], dtype="float32")
# (x, None) should not be returned
params_grads = [(x, None), (x, y), (y, x)]
params_grads = clip(params_grads)
self.assertTrue(
len(clip(params_grads)) == 2,
"ClipByGlobalNorm: when grad is None, it shouldn't be returned by gradient clip!"
)
self.assertTrue(
params_grads[0][1].name != 'y',
"ClipByGlobalNorm: param_grad (x, y) should be clipped!")
# raise typeError
def test_typeError(self):
# the type of need_clip must be a function
with self.assertRaises(TypeError):
clip = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm, need_clip="test")
# the type of minimize(grad_clip=) must be an instance of GradientClipBase's derived class
with self.assertRaises(TypeError):
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
loss = fluid.layers.reduce_mean(x)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
sgd_optimizer.minimize(loss, grad_clip="test")
# the type of RecomputeOptimizer.minimize(grad_clip=) must be an instance of GradientClipBase's derived class
with self.assertRaises(TypeError):
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
loss = fluid.layers.reduce_mean(x)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
recompute_optimizer = fluid.optimizer.RecomputeOptimizer(
sgd_optimizer)
recompute_optimizer._set_checkpoints([x])
recompute_optimizer.minimize(loss, grad_clip="test")
class TestGradientClipByNorm(TestGradientClip):
def init(self):
self.clip_norm = 0.2
def clip_gradient(self, params_grads):
clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
print(clip)
return clip(params_grads)
def check_clip_result(self, out, out_clip):
for u, v in zip(out, out_clip):
norm = np.sqrt(np.sum(np.power(u, 2)))
scale = self.clip_norm / np.maximum(self.clip_norm, norm)
u = u * scale
self.assertTrue(
np.allclose(
a=u, b=v, rtol=1e-5, atol=1e-8),
"gradient clip by norm has wrong results!")
# test whether the output is right when using 'minimize(grad_clip)'
def test_gradient_clip(self):
self.check_gradient_clip(fluid.CPUPlace())
# if grad is None or does not need clipping
def test_none_grad(self):
def filter_func(param):
return param.name == "z"
clip = fluid.clip.GradientClipByNorm(
self.clip_norm, need_clip=filter_func)
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
y = fluid.default_main_program().global_block().create_parameter(
name="y", shape=[2, 3], dtype="float32")
# (x, None) should not be returned
params_grads = [(x, None), (x, y)]
params_grads = clip(params_grads)
self.assertTrue(
len(clip(params_grads)) == 1,
"ClipByNorm: when grad is None, it shouldn't be returned by gradient clip!"
)
self.assertTrue(
params_grads[0][1].name == 'y',
"ClipByNorm: grad should not be clipped when filtered out!")
class TestGradientClipByValue(TestGradientClip):
def init(self):
self.max = 0.2
self.min = 0.1
def clip_gradient(self, params_grads):
clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
print(clip)
return clip(params_grads)
def check_clip_result(self, out, out_clip):
for i, v in enumerate(out):
out[i] = np.clip(v, self.min, self.max)
for u, v in zip(out, out_clip):
u = np.clip(u, self.min, self.max)
self.assertTrue(
np.allclose(
a=u, b=v, rtol=1e-6, atol=1e-8),
"gradient clip by value has wrong results!")
# test whether the output is right when using 'minimize(grad_clip)'
def test_gradient_clip(self):
self.check_gradient_clip(fluid.CPUPlace())
# if grad is None or does not need clipping
def test_none_grad(self):
def filter_func(param):
return param.name == "z"
clip = fluid.clip.GradientClipByValue(
self.max, self.min, need_clip=filter_func)
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
y = fluid.default_main_program().global_block().create_parameter(
name="y", shape=[2, 3], dtype="float32")
# (x, None) should not be returned
params_grads = [(x, None), (x, y)]
params_grads = clip(params_grads)
self.assertTrue(
len(clip(params_grads)) == 1,
"ClipByValue: when grad is None, it shouldn't be returned by gradient clip!"
)
self.assertTrue(
params_grads[0][1].name == 'y',
"ClipByValue: grad should not be clipped when filtered out!")
class TestDygraphGradientClip(unittest.TestCase):
def test_gradient_clip(self):
with fluid.dygraph.guard():
linear = fluid.dygraph.Linear(5, 5)
inputs = fluid.layers.uniform_random(
[16, 5], min=-10, max=10).astype('float32')
out = linear(fluid.dygraph.to_variable(inputs))
loss = fluid.layers.reduce_mean(out)
loss.backward()
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=0.0, parameter_list=linear.parameters())
self.check_clip_result(loss, sgd_optimizer)
def check_clip_result(self, loss, optimizer):
pass
class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
def setUp(self):
# only clip gradient of x (ParamBase)
def filter_func(param):
return param.name == "x"
self.clip_norm = 0.8
self.clip1 = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm, need_clip=filter_func)
self.clip2 = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm)
def check_clip_result(self, loss, optimizer):
# if grad is None
x = fluid.dygraph.to_variable(
np.array([2, 3]).astype("float32"), name="x")
y = fluid.dygraph.to_variable(
np.array([3, 4]).astype("float32"), name="y")
assert len(self.clip1([(x, x), (x, y), (x, None)])) == 2
# get params and grads from network
opt, params_grads = optimizer.minimize(loss, grad_clip=self.clip2)
_, grads = zip(*params_grads)
params_grads = self.clip2(params_grads)
_, grads_clip = zip(*params_grads)
global_norm = 0
for u in grads:
u = u.numpy()
global_norm += np.sum(np.power(u, 2))
global_norm = np.sqrt(global_norm)
global_norm_clip = 0
for v in grads_clip:
v = v.numpy()
global_norm_clip += np.sum(np.power(v, 2))
global_norm_clip = np.sqrt(global_norm_clip)
a = np.minimum(global_norm, self.clip_norm)
b = global_norm_clip
self.assertTrue(
np.isclose(
a=a, b=b, rtol=1e-6, atol=1e-8),
"gradient clip by global norm has wrong results, expetcd:%f, but recieved:%f"
% (a, b))
class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
def setUp(self):
# only clip gradient of linear_0.w_0 (ParamBase)
def filter_func(param):
return param.name == "linear_0.w_0"
self.clip_norm = 0.8
self.clip = fluid.clip.GradientClipByNorm(
clip_norm=self.clip_norm, need_clip=filter_func)
def check_clip_result(self, loss, optimizer):
# if grad is None
x = fluid.dygraph.to_variable(np.array([2, 3]).astype("float32"))
assert len(self.clip([(x, None)])) == 0
# get params and grads from network
self.clip([(fluid.dygraph.to_variable(np.array([2, 3])), None)])
params_grads = optimizer.backward(loss)
_, grads = zip(*params_grads)
params_grads = self.clip(params_grads)
_, grads_clip = zip(*params_grads)
for u, v in zip(grads, grads_clip):
u = u.numpy()
v = v.numpy()
a = np.sqrt(np.sum(np.power(u, 2)))
a = np.minimum(a, self.clip_norm)
b = np.sqrt(np.sum(np.power(v, 2)))
self.assertTrue(
np.isclose(
a=a, b=b, rtol=1e-6, atol=1e-8),
"gradient clip by norm has wrong results, expetcd:%f, but recieved:%f"
% (a, b))
class TestDygraphGradientClipByValue(TestDygraphGradientClip):
def setUp(self):
# only clip gradient of linear_0.w_0 (ParamBase)
def filter_func(param):
return param.name == "linear_0.w_0"
self.max = 0.2
self.min = 0.1
self.clip = fluid.clip.GradientClipByValue(
max=self.max, min=self.min, need_clip=filter_func)
def check_clip_result(self, loss, optimizer):
# if grad is None
x = fluid.dygraph.to_variable(np.array([2, 3]).astype("float32"))
assert len(self.clip([(x, None)])) == 0
# get params and grads from network
params_grads = optimizer.backward(loss)
_, grads = zip(*params_grads)
params_grads = self.clip(params_grads)
_, grads_clip = zip(*params_grads)
for u, v in zip(grads, grads_clip):
u = np.clip(u.numpy(), self.min, self.max)
v = v.numpy()
self.assertTrue(
np.allclose(
a=u, b=v, rtol=1e-6, atol=1e-8),
"gradient clip by value has wrong results!")
if __name__ == '__main__':
unittest.main()
......@@ -331,7 +331,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
model = MyLayer(size, vocab_size, size)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters())
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
indices = fluid.dygraph.to_variable(indices)
embed = fluid.dygraph.to_variable(embed)
......@@ -350,7 +350,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
model = MyLayer2(size, vocab_size, size)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters())
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
indices = fluid.dygraph.to_variable(indices)
emebd = fluid.dygraph.to_variable(embed)
......
......@@ -49,7 +49,7 @@ class TestSimpleNet(unittest.TestCase):
with fluid.dygraph.guard(place):
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = sort_sum_gradient
# grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0)
# grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64')
input = to_variable(input_word)
......@@ -83,8 +83,7 @@ class TestSimpleNet(unittest.TestCase):
with fluid.dygraph.guard(place):
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = sort_sum_gradient
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
5.0)
grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64')
input = to_variable(input_word)
......