diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index 3f41ebaa96d07aa10de6e38bf2c80791f2c4b24d..5c97fe90a2e1771dd8edb47d54df54d3eaa51e99 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -20,11 +20,11 @@ __all__ = [] import paddle from paddle.common_ops_import import LayerHelper -from paddle.fluid.clip import GradientClipByNorm, append_gradient_clip_ops from paddle.fluid.dygraph import base as imperative_base from paddle.fluid.framework import in_dygraph_mode from paddle.fluid.optimizer import Momentum, Optimizer from paddle.framework import core +from paddle.nn.clip import ClipGradByNorm, append_gradient_clip_ops from paddle.static import create_global_var @@ -76,9 +76,9 @@ class DGCMomentumOptimizer(Optimizer): self._dgc_clip_norm = None if grad_clip is not None: - if not isinstance(grad_clip, GradientClipByNorm): + if not isinstance(grad_clip, ClipGradByNorm): raise TypeError( - "The type of grad_clip should be 'GradientClipByNorm', because DGCMomentumOptimizer only support GradientClipByNorm" + "The type of grad_clip should be 'ClipGradByNorm', because DGCMomentumOptimizer only support ClipGradByNorm" ) assert isinstance(num_trainers, int), ( "The type of num_trainers should be 'int', but received %s" diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index 75f0061b2ca20be4c7f4f7dc10bf3c48a8374366..9eca2e667a8fd8c81aa3a4b1083ada9204cbecb6 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -15,9 +15,8 @@ import paddle from paddle import framework from paddle.autograd import no_grad -from paddle.fluid import layers -from paddle.fluid.clip import ClipGradByGlobalNorm from paddle.framework import core +from paddle.nn import ClipGradByGlobalNorm, clip from ...base.topology import ParallelMode from ...utils.hybrid_parallel_util import ( @@ -62,8 +61,8 @@ class HybridParallelClipGrad: continue merge_grad = g if g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(g) - merge_grad = layers.get_tensor_from_selected_rows(merge_grad) + merge_grad = clip.merge_selected_rows(g) + merge_grad = clip.get_tensor_from_selected_rows(merge_grad) square = paddle.square(merge_grad) sum_square = paddle.sum(square) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py index b1a572d4edfc30d9fdccc45b1b056ef7411cf44d..9a25d7c4912bacc49c727c09958c1daaaf5c7c0c 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py @@ -30,7 +30,7 @@ import paddle import paddle.distributed as dist from paddle.distributed import ParallelMode, fleet from paddle.fluid import core -from paddle.fluid.clip import ClipGradByGlobalNorm +from paddle.nn import ClipGradByGlobalNorm from paddle.optimizer import Optimizer HybridParallelClipGrad = ( diff --git 
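# Illustrative sketch (not part of this patch): the hunks above switch the DGC
# and hybrid-parallel optimizers from the removed paddle.fluid.clip classes to
# their paddle.nn counterparts; a minimal dygraph usage of the new import path:
import paddle
from paddle.nn import ClipGradByNorm

linear = paddle.nn.Linear(in_features=10, out_features=10)
x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
loss = paddle.mean(linear(x))
loss.backward()

clip = ClipGradByNorm(clip_norm=1.0)  # DGCMomentumOptimizer now expects this type
sgd = paddle.optimizer.SGD(
    learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip
)
sgd.step()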
a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index 3d3debb252d400ddf3962f064682cf1b829af131..d99683d481450309d95d13dfb26b0bc3471ea5e3 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -25,8 +25,8 @@ import paddle.fluid.framework as framework from paddle import nn from paddle.autograd import PyLayer from paddle.distributed import collective -from paddle.fluid.clip import ClipGradByGlobalNorm from paddle.fluid.framework import EagerParamBase +from paddle.nn import ClipGradByGlobalNorm from .group_sharded_storage import GradStorage from .group_sharded_utils import GroupShardedClipGrad, Type, device_guard diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py index 620540fea58761f8930b33bd8d65f6bafc7ff369..f8c86e02b7b52490dde4ad3c69068b9709c39250 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -23,6 +23,7 @@ from paddle import _legacy_C_ops from paddle.fluid import core, layers from paddle.fluid.dygraph import to_variable from paddle.fluid.framework import dygraph_only +from paddle.nn import clip class Taskflow: @@ -65,8 +66,8 @@ class GroupShardedClipGrad: merge_grad = g if g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.get_tensor_from_selected_rows( - layers.merge_selected_rows(g) + merge_grad = clip.get_tensor_from_selected_rows( + clip.merge_selected_rows(g) ) square = paddle.square(merge_grad) sum_square = paddle.sum(square) diff --git a/python/paddle/distributed/fleet/metrics/metric.py b/python/paddle/distributed/fleet/metrics/metric.py index 8c15e47307381d862b15518cf860e34d4f9c4280..39284fa9f5a3f151747547b42409385d470571cd 100644 --- a/python/paddle/distributed/fleet/metrics/metric.py +++ b/python/paddle/distributed/fleet/metrics/metric.py @@ -159,7 +159,7 @@ def auc(stat_pos, stat_neg, scope=None, util=None): .. code-block:: python # in model.py - similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(output, min=-15.0, max=15.0)) + similarity_norm = fluid.layers.sigmoid(paddle.clip(output, min=-15.0, max=15.0)) binary_predict = fluid.layers.concat( input=[paddle.subtract(fluid.layers.ceil(similarity_norm), similarity_norm), similarity_norm], axis=1) self.auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg] = diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 8045c8cb5a62c49ff32c2e758c3985cecb568a51..eaf64e6dc6c0bd3c4bd0f3642f32953e52a81ea3 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -90,7 +90,6 @@ from .transpiler import ( DistributeTranspilerConfig, ) from .lod_tensor import create_lod_tensor, create_random_int_lodtensor -from . import clip from . import profiler from . import unique_name from . 
import parallel_executor @@ -164,7 +163,6 @@ __all__ = ( 'ParamAttr', 'WeightNormParamAttr', 'DataFeeder', - 'clip', 'profiler', 'unique_name', 'Scope', diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py deleted file mode 100644 index ffaa84ed3e53c5aadbb6dc3e8d51a48bc00a9fb6..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/clip.py +++ /dev/null @@ -1,944 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import warnings - -import functools -import paddle -from . import layers -from . import framework -from . import core -from . import name_scope -from .dygraph import base as imperative_base -from .data_feeder import check_variable_and_dtype -from .framework import in_dygraph_mode -from .layer_helper import LayerHelper -from .framework import default_main_program -from paddle import _C_ops, _legacy_C_ops - -__all__ = [ - 'set_gradient_clip', - 'ErrorClipByValue', - 'ClipGradByValue', - 'ClipGradByNorm', - 'ClipGradByGlobalNorm', -] - -_clip_by_global_norm_using_mp_type_flag = False - - -def _clip_by_global_norm_using_mp_type(*args): - global _clip_by_global_norm_using_mp_type_flag - assert len(args) <= 1 - if len(args) == 1: - assert isinstance(args[0], bool) - old_value = _clip_by_global_norm_using_mp_type_flag - _clip_by_global_norm_using_mp_type_flag = args[0] - return old_value - else: - return _clip_by_global_norm_using_mp_type_flag - - -def _cast_to_mp_type_if_enabled(x): - if ( - x.dtype == core.VarDesc.VarType.FP16 - or x.dtype == core.VarDesc.VarType.BF16 - ) and _clip_by_global_norm_using_mp_type(): - return x.astype(core.VarDesc.VarType.FP32) - else: - return x - - -def _squared_l2_norm(x): - r""" - This OP returns the squared L2 norm of a tensor. - """ - - x = _cast_to_mp_type_if_enabled(x) - if ( - core.is_compiled_with_xpu() - or x.dtype == core.VarDesc.VarType.FP16 - or x.dtype == core.VarDesc.VarType.BF16 - ): - square = paddle.square(x) - sum_square = paddle.sum(square) - return sum_square - - if in_dygraph_mode(): - return _C_ops.squared_l2_norm(x) - else: - op_type = 'squared_l2_norm' - check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type) - helper = LayerHelper(op_type, **locals()) - out = helper.create_variable_for_type_inference(x.dtype) - - inputs = {"X": x} - outputs = {'Out': out} - helper.append_op(type=op_type, inputs=inputs, outputs=outputs) - return out - - -class BaseErrorClipAttr: - def __str__(self): - raise NotImplementedError() - - def _append_clip_op(self, block, grad_name): - raise NotImplementedError() - - -class ErrorClipByValue(BaseErrorClipAttr): - r""" - Clips tensor values to the range [min, max]. - - Given a tensor ``t`` (see Examples below), this operation clips its value \ - to ``min`` and ``max`` inplace. - - - Any values less than min are set to min. - - Any values greater than max are set to max. - - Args: - max (float): The maximum value to clip by. - min (float, optional): The minimum value to clip by. 
if not set by user, \ - will be set to ``-max`` by framework. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - BATCH_SIZE = 128 - CLIP_MAX = 2e-6 - CLIP_MIN = -1e-6 - prog = fluid.framework.Program() - with fluid.program_guard(main_program=prog): - image = fluid.layers.data( - name='x', shape=[784], dtype='float32') - hidden1 = fluid.layers.fc(input=image, size=128, act='relu') - hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') - predict = fluid.layers.fc( - input=hidden2, size=10, act='softmax') - label = fluid.layers.data(name='y', shape=[1], dtype='int64') - cost = paddle.nn.functional.cross_entropy(input=predict, label=label, reduction='none', use_softmax=False) - avg_cost = paddle.mean(cost) - prog_clip = prog.clone() - prog_clip.block(0).var(hidden1.name)._set_error_clip( - fluid.clip.ErrorClipByValue( - max=CLIP_MAX, min=CLIP_MIN - ) - ) - """ - - def __init__(self, max, min=None): - max = float(max) - if min is None: - min = -max - else: - min = float(min) - self.max = max - self.min = min - - def __str__(self): - return "ByValue, min=%f, max=%f" % (self.min, self.max) - - def _append_clip_op(self, block, grad_name): - clip_op_desc = block.desc.append_op() - clip_op_desc.set_type("clip") - clip_op_desc.set_input("X", [grad_name]) - clip_op_desc.set_output("Out", [grad_name]) - clip_op_desc._set_attr("min", self.min) - clip_op_desc._set_attr("max", self.max) - - -def error_clip_callback(block, context): - # the context is a grad_to_var map - grad_to_var = context - op_desc = block.desc.op(block.desc.op_size() - 1) - for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]: - fwd_var = block._var_recursive(grad_to_var[grad_n]) - error_clip = getattr(fwd_var, "error_clip", None) - if not ( - error_clip is None or isinstance(error_clip, BaseErrorClipAttr) - ): - raise TypeError( - "Variable's error_clip should be an instance of BaseErrorClipAttr or None." - ) - if error_clip is not None: - error_clip._append_clip_op(block, grad_n) - - -class ClipGradBase: - def __init__(self): - super().__init__() - - def __str__(self): - raise NotImplementedError() - - @imperative_base.no_grad - def _dygraph_clip(self, params_grads): - raise NotImplementedError - - def _static_clip(self, params_grads): - raise NotImplementedError - - def __call__(self, params_grads): - if in_dygraph_mode(): - return self._dygraph_clip(params_grads) - else: - for p, g in params_grads: - if getattr(p, 'gradient_clip_attr', None) is not None: - warnings.warn( - "'set_gradient_clip' will be ineffective, because you have " - "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' " - "is redundant and you can remove it." - ) - break - return self._static_clip(params_grads) - - def _process_context(self, context, param, grad): - raise NotImplementedError() - - def _create_operators(self, param, grad): - raise NotImplementedError() - - -class ClipGradByValue(ClipGradBase): - """ - Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max]. - - - Any values less than min are set to ``min``. - - - Any values greater than max are set to ``max``. - - The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. - If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. 
- - Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` - (for example: :ref:`api_paddle_optimizer_SGD`). - - Note: - ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0. - Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. - - Args: - max (float): The maximum value to clip by. - min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max`` - automatically. In this case, ``max`` must be greater than 0. - - Examples: - .. code-block:: python - - import paddle - - x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(in_features=10, out_features=10, - weight_attr=paddle.ParamAttr(need_clip=True), - bias_attr=paddle.ParamAttr(need_clip=False)) - out = linear(x) - loss = paddle.mean(out) - loss.backward() - - clip = paddle.nn.ClipGradByValue(min=-1, max=1) - sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) - sdg.step() - """ - - def __init__(self, max, min=None): - super().__init__() - if min is None: - assert max > 0.0 - min = -max - self.max = float(max) - self.min = float(min) - - def __str__(self): - return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max) - - @imperative_base.no_grad - def _dygraph_clip(self, params_grads): - params_and_grads = [] - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - new_grad = paddle.clip(x=g, min=self.min, max=self.max) - params_and_grads.append((p, new_grad)) - return params_and_grads - - def _static_clip(self, params_grads): - params_and_grads = [] - param_new_grad_name_dict = dict() - with framework.name_scope('gradient_clip'): - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - - with p.block.program._optimized_guard([p, g]): - new_grad = layers.clip(x=g, min=self.min, max=self.max) - params_and_grads.append((p, new_grad)) - param_new_grad_name_dict[p.name] = new_grad.name - _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) - return params_and_grads - - def _process_context(self, context, param, grad): - pass - - def _create_operators(self, param, grad): - new_grad = layers.clip(x=grad, min=self.min, max=self.max) - return param, new_grad - - -class ClipGradByNorm(ClipGradBase): - r""" - Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` . - - - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio. - - - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done. - - The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. - If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. - - Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` - (for example: :ref:`api_paddle_optimizer_SGD`). - - The clipping formula is: - - .. math:: - Out = - \left\{ - \begin{array}{ccl} - X & & if (norm(X) \leq clip\_norm) \\ - \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\ - \end{array} - \right. - - - where :math:`norm(X)` represents the L2 norm of :math:`X`. - - .. 
math:: - norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}} - - Note: - ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0. - Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. - - Args: - clip_norm(float): The maximum norm value. - - Examples: - .. code-block:: python - - import paddle - - x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(in_features=10, out_features=10, - weight_attr=paddle.ParamAttr(need_clip=True), - bias_attr=paddle.ParamAttr(need_clip=False)) - out = linear(x) - loss = paddle.mean(out) - loss.backward() - - clip = paddle.nn.ClipGradByNorm(clip_norm=1.0) - sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) - sdg.step() - """ - - def __init__(self, clip_norm): - super().__init__() - self.clip_norm = float(clip_norm) - - def __str__(self): - return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm - - @imperative_base.no_grad - def _dygraph_clip(self, params_grads): - params_and_grads = [] - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm) - params_and_grads.append((p, new_grad)) - return params_and_grads - - def _static_clip(self, params_grads): - params_and_grads = [] - with framework.name_scope('gradient_clip'): - param_new_grad_name_dict = dict() - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - - with p.block.program._optimized_guard([p, g]): - new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm) - param_new_grad_name_dict[p.name] = new_grad.name - params_and_grads.append((p, new_grad)) - _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) - return params_and_grads - - def _process_context(self, context, param, grad): - pass - - def _create_operators(self, param, grad): - new_grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm) - return param, new_grad - - -_allow_pure_fp16_global_norm_clip_flag = False - - -def _allow_pure_fp16_global_norm_clip(*args): - global _allow_pure_fp16_global_norm_clip_flag - if len(args) == 0: - return _allow_pure_fp16_global_norm_clip_flag - else: - assert len(args) == 1 and isinstance(args[0], bool) - old_value = _allow_pure_fp16_global_norm_clip_flag - _allow_pure_fp16_global_norm_clip_flag = args[0] - return old_value - - -class ClipGradByGlobalNorm(ClipGradBase): - r""" - Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in - :math:`t\_list` , and limit it to ``clip_norm`` . - - - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio. - - - If the global norm is less than or equal to ``clip_norm`` , nothing will be done. - - The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``. - If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. - - Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` - (for example: :ref:`api_paddle_optimizer_SGD`). - - The clipping formula is: - - .. math:: - - t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)} - - where: - - .. 
math:: - - global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2} - - Note: - ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0. - Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. - - Args: - clip_norm (float): The maximum norm value. - group_name (str, optional): The group name for this clip. Default value is ``default_group``. - - Examples: - .. code-block:: python - - import paddle - - x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(in_features=10, out_features=10, - weight_attr=paddle.ParamAttr(need_clip=True), - bias_attr=paddle.ParamAttr(need_clip=False)) - out = linear(x) - loss = paddle.mean(out) - loss.backward() - - clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) - sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) - sdg.step() - """ - - def __init__( - self, clip_norm, group_name="default_group", auto_skip_clip=False - ): - super().__init__() - self.clip_norm = float(clip_norm) - self.group_name = group_name - assert isinstance(auto_skip_clip, bool) - self.auto_skip_clip = auto_skip_clip - - def __str__(self): - return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm) - - @imperative_base.no_grad - def _dygraph_clip(self, params_grads): - params_and_grads = [] - sum_square_list = [] - sum_square_list_fp16 = [] - sum_square_list_fp32 = [] - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - continue - merge_grad = g - - if in_dygraph_mode() and g.is_selected_rows(): - merge_grad = layers.merge_selected_rows(g) - merge_grad = merge_grad._get_tensor_from_selected_rows() - - elif g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(g) - merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - - sum_square = _squared_l2_norm(merge_grad) - if ( - sum_square.dtype == core.VarDesc.VarType.FP16 - or sum_square.dtype == core.VarDesc.VarType.BF16 - ): - sum_square_list_fp16.append(sum_square) - elif sum_square.dtype == core.VarDesc.VarType.FP32: - sum_square_list_fp32.append(sum_square) - else: - sum_square_list.append(sum_square) - - # all parameters have been filterd out - if ( - len(sum_square_list) - + len(sum_square_list_fp16) - + len(sum_square_list_fp32) - == 0 - ): - return params_grads - - sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32" - global_norm_var = [] - if len(sum_square_list_fp16) > 0: - global_norm_var_fp16 = paddle.add_n(sum_square_list_fp16) - global_norm_var.append(global_norm_var_fp16.astype(sum_dtype)) - if len(sum_square_list_fp32) > 0: - global_norm_var_fp32 = paddle.add_n(sum_square_list_fp32) - if sum_dtype == 'float32': - global_norm_var.append(global_norm_var_fp32) - else: - global_norm_var.append(global_norm_var_fp32.astype(sum_dtype)) - if len(sum_square_list) > 0: - global_norm_var_fp64 = paddle.add_n(sum_square_list) - global_norm_var.append(global_norm_var_fp64) - global_norm_var = paddle.add_n(global_norm_var) - global_norm_var = paddle.sqrt(global_norm_var) - max_global_norm = layers.fill_constant( - shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm - ) - - need_clip = False - if not self.auto_skip_clip: # always apply clip - need_clip = True - clip_var = paddle.divide( - x=max_global_norm, - y=paddle.maximum(x=global_norm_var, y=max_global_norm), - ) - elif global_norm_var > max_global_norm: - # only when global_norm_var > max_global_norm, grad need clip - need_clip = True - 
clip_var = paddle.divide(x=max_global_norm, y=global_norm_var) - - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - # TODO(wangxi): use inplace elementwise_mul - if need_clip: - clip_input = ( - clip_var.astype(g.dtype) - if clip_var.dtype != g.dtype - else clip_var - ) - new_grad = paddle.multiply(g, clip_input) - params_and_grads.append((p, new_grad)) - else: - params_and_grads.append((p, g)) - - return params_and_grads - - def _static_clip(self, params_grads): - params_and_grads = [] - sum_square_list = [] - sum_square_list_fp16 = [] - sum_square_list_fp32 = [] - with framework.name_scope('gradient_clip'): - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - continue - merge_grad = g - with p.block.program._optimized_guard([p, g]): - if g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(g) - merge_grad = layers.get_tensor_from_selected_rows( - merge_grad - ) - sum_square = _squared_l2_norm(merge_grad) - if sum_square.dtype == core.VarDesc.VarType.FP16: - sum_square_list_fp16.append(sum_square) - elif sum_square.dtype == core.VarDesc.VarType.FP32: - sum_square_list_fp32.append(sum_square) - else: - sum_square_list.append(sum_square) - - # all parameters have been filterd out - if ( - len(sum_square_list) - + len(sum_square_list_fp16) - + len(sum_square_list_fp32) - == 0 - ): - return params_grads - - with p.block.program._optimized_guard([p, g]): - sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32" - - global_norm_var = [] - if len(sum_square_list_fp16) > 0: - global_norm_var_fp16 = layers.sums(sum_square_list_fp16) - if ( - sum_square_list_fp32 - or sum_square_list - or not _allow_pure_fp16_global_norm_clip() - ): - global_norm_var.append( - global_norm_var_fp16.astype(sum_dtype) - ) - else: - global_norm_var.append(global_norm_var_fp16) - if len(sum_square_list_fp32) > 0: - global_norm_var_fp32 = layers.sums(sum_square_list_fp32) - if sum_dtype == 'float32': - global_norm_var.append(global_norm_var_fp32) - else: - global_norm_var.append( - global_norm_var_fp32.astype(sum_dtype) - ) - if len(sum_square_list) > 0: - # fp64 - global_norm_var_other_dtype = layers.sums(sum_square_list) - global_norm_var.append(global_norm_var_other_dtype) - - global_norm_var = ( - layers.sums(global_norm_var) - if len(global_norm_var) > 1 - else global_norm_var[0] - ) - global_norm_var = paddle.sqrt(x=global_norm_var) - max_global_norm = layers.fill_constant( - shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm - ) - scale_var = paddle.divide( - x=max_global_norm, - y=paddle.maximum(x=max_global_norm, y=global_norm_var), - ) - param_new_grad_name_dict = dict() - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - - with p.block.program._optimized_guard([p, g]): - new_g = _cast_to_mp_type_if_enabled(g) - # inplace - scale_input = ( - scale_var.astype('float16') - if new_g.dtype == core.VarDesc.VarType.FP16 - and scale_var.dtype != core.VarDesc.VarType.FP16 - else scale_var - ) - # NOTE(Yuang Liu): For pure dp with gradient merge, the p and g - # will be in different blocks with the gradient clip related ops. - # We need to handle the correct block, otherwise will encounter - # a 'NotFoundError' during compile time. 
- block = default_main_program().current_block() - block.append_op( - type='elementwise_mul', - inputs={'X': new_g, 'Y': scale_input}, - outputs={'Out': new_g}, - ) - if new_g is not g: - block.append_op( - type='cast', - inputs={'X': new_g}, - outputs={'Out': g}, - attrs={ - 'in_dtype': new_g.dtype, - 'out_dtype': g.dtype, - }, - ) - - param_new_grad_name_dict[p.name] = g.name - params_and_grads.append((p, g)) - - _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) - return params_and_grads - - def _process_context(self, context, param, grad): - if self.group_name not in context: - context[self.group_name] = [] - context[self.group_name + "_clip_value"] = self.clip_norm - context[self.group_name + "_clip"] = layers.fill_constant( - shape=[1], dtype=grad.dtype, value=self.clip_norm - ) - else: - if not self.clip_norm == context[self.group_name + "_clip_value"]: - raise ValueError( - "All parameters' 'clip_norm' of a same group should be the same" - ) - - merge_grad = grad - if grad.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(grad) - merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - - local_norm_var = _squared_l2_norm(merge_grad) - context[self.group_name].append(local_norm_var) - - self.context = context - - def _create_operators(self, param, grad): - group_scale_name = self.group_name + "_scale" - if group_scale_name not in self.context: - group_norm_var = layers.sums(input=self.context[self.group_name]) - group_norm_var = paddle.sqrt(x=group_norm_var) - clip_var = self.context[self.group_name + "_clip"] - group_scale_var = paddle.divide( - x=clip_var, - y=paddle.maximum(x=clip_var, y=group_norm_var), - ) - assert group_scale_var.shape == (1,) - self.context[group_scale_name] = group_scale_var - - # inplace - param.block.append_op( - type='elementwise_mul', - inputs={'X': grad, 'Y': self.context[group_scale_name]}, - outputs={'Out': grad}, - ) - - return param, grad - - -@framework.dygraph_not_support -def set_gradient_clip(clip, param_list=None, program=None): - """ - :api_attr: Static Graph - - Warning: - - This API must be used after building network, and before ``minimize`` , - and it may be removed in future releases, so it is not recommended. - It is recommended to set ``grad_clip`` when initializing the ``optimizer`` , - this is a better method to clip gradient. There are three clipping strategies: - :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` . - - To specify parameters that require gradient clip. - - Args: - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no - gradient clipping. - param_list (list(Variable), optional): Parameters that require gradient clip. - It can be a list of parameter or a list of parameter's name. - Default None, meaning that all parameters in the program will be included. - program (Program, optional): The program where parameters are located. - Default None, meaning that using :ref:`api_fluid_default_main_program` . - - Returns: - None - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - - def network(): - image = fluid.data(name='image', shape=[ - None, 28], dtype='float32') - param_attr1 = fluid.ParamAttr("fc1_param") - fc1 = fluid.layers.fc(image, size=10, param_attr=param_attr1) - param_attr2 = fluid.ParamAttr("fc2_param") - fc2 = fluid.layers.fc(fc1, size=10, param_attr=param_attr2) - loss = fluid.layers.reduce_mean(fc2) - return loss - - - # network 1: clip all parameter gradient - with fluid.program_guard(fluid.Program(), fluid.Program()): - loss = network() - fluid.clip.set_gradient_clip( - fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0)) - sgd = fluid.optimizer.SGD(learning_rate=1e-3) - sgd.minimize(loss) - - # network 2: clip parameter gradient by name - with fluid.program_guard(fluid.Program(), fluid.Program()): - loss = network() - fluid.clip.set_gradient_clip( - fluid.clip.GradientClipByValue(min=-1.0, max=1.0), - param_list=["fc1_param", "fc2_param"]) - sgd = fluid.optimizer.SGD(learning_rate=1e-3) - sgd.minimize(loss) - - # network 3: clip parameter gradient by value - with fluid.program_guard(fluid.Program(), fluid.Program()): - loss = network() - param_var1 = fluid.default_main_program().global_block().var("fc1_param") - param_var2 = fluid.default_main_program().global_block().var("fc2_param") - fluid.clip.set_gradient_clip( - fluid.clip.GradientClipByValue(min=-1.0, max=1.0), - param_list=[param_var1, param_var2]) - sgd = fluid.optimizer.SGD(learning_rate=1e-3) - sgd.minimize(loss) - - # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together - with fluid.program_guard(fluid.Program(), fluid.Program()): - loss = network() - clip1 = fluid.clip.GradientClipByValue(min=-1.0, max=1.0) - clip2 = fluid.clip.GradientClipByNorm(clip_norm=1.0) - # Set the gradient clipping strategy: clip1 - fluid.clip.set_gradient_clip(clip1) - # Set the gradient clipping strategy: clip2 - sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2) - sgd.minimize(loss) - # 'set_gradient_clip' will not take effect when setting has a conflict, - # and the gradient clipping strategy will be 'clip2' - - - """ - warnings.warn( - "Caution! 'set_gradient_clip' is not recommended " - "and may be deprecated in future! " - "We recommend a new strategy: set 'grad_clip' " - "when initializing the 'optimizer'. " - "This method can reduce the mistakes, please " - "refer to documention of 'optimizer'." - ) - - if not isinstance(clip, ClipGradBase): - raise TypeError( - "'clip' should be an instance of ClipGradBase's derived class" - ) - if program is None: - program = framework.default_main_program() - - for op in program.block(0).ops: - if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr( - "op_namescope" - ): - warnings.warn( - "'minimize' has been invoked before, this will make 'set_gradient_clip' " - "be ineffective! Please invoke 'set_gradient_clip' before 'minimize'." - ) - break - - if param_list is None: - param_list = program.block(0).all_parameters() - if all(isinstance(elem, str) for elem in param_list): - param_list = [program.block(0).var(elem) for elem in param_list] - if not all(isinstance(elem, framework.Parameter) for elem in param_list): - raise TypeError( - "'param_list' should be a list of Parameter or basestring(parameter's name)." 
- ) - - for param in param_list: - param.gradient_clip_attr = copy.deepcopy(clip) - - -def append_gradient_clip_ops(param_grads): - context = dict() - for p, g in param_grads: - if g is None: - continue - with p.block.program._optimized_guard([p, g]), framework.name_scope( - 'gradient_clip' - ): - clip_attr = getattr(p, 'gradient_clip_attr', None) - if clip_attr is None: - return param_grads - if not isinstance(clip_attr, ClipGradBase): - raise TypeError( - "clip attribute should be an instance of GradientClipBase" - ) - - clip_attr._process_context(context=context, param=p, grad=g) - - res = [] - param_new_grad_name_dict = dict() - for p, g in param_grads: - if g is None: - continue - with p.block.program._optimized_guard([p, g]), framework.name_scope( - 'gradient_clip' - ): - param, new_grad = clip_attr._create_operators(param=p, grad=g) - param_new_grad_name_dict[param.name] = new_grad.name - res.append([param, new_grad]) - - _correct_clip_op_role_var(res, param_new_grad_name_dict) - return res - - -# change wrong mapping relation between param & grad in clip op -# Note: This function is sensitive to the time cost of the network with gradient clipping -# and should not be changed easily. If you must change, please test the time cost. -def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict): - block_id_list = [] - if len(param_new_grad_name_dict) == 0: - return - for param, grad in params_grads: - if grad is None: - continue - block_id = param.block.idx - if block_id in block_id_list: - continue - block_id_list.append(block_id) - for op in param.block.program.global_block().ops: - if ( - op.has_attr("op_namescope") - and "gradient_clip" in op.attr("op_namescope") - and op.attr('op_role_var') - ): - param_name = op.attr('op_role_var')[0] - if param_name in param_new_grad_name_dict: - correct_p_g = [ - param_name, - param_new_grad_name_dict[param_name], - ] - op._set_attr('op_role_var', correct_p_g) - - -GradientClipBase = ClipGradBase -GradientClipByValue = ClipGradByValue -GradientClipByNorm = ClipGradByNorm -GradientClipByGlobalNorm = ClipGradByGlobalNorm diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py index df198931199f59520368faee025a77b42b5bdcd7..4ec3c1d16e077ea00672c664bac3b1b4ea5e491c 100644 --- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py +++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py @@ -185,7 +185,7 @@ class FleetUtil: # below is part of model emb = my_slot_net(slots, label) # emb can be fc layer of size 1 - similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\ + similarity_norm = fluid.layers.sigmoid(paddle.clip(\ emb, min=-15.0, max=15.0), name="similarity_norm")\ binary_predict = fluid.layers.concat(input=[\ paddle.subtract(\ @@ -1374,7 +1374,7 @@ class FleetUtil: label = fluid.layers.data(name="click", shape=[-1, 1],\ dtype="int64", lod_level=0, append_batch_size=False) emb = my_slot_net(slots, label) # emb can be fc layer of size 1 - similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\ + similarity_norm = fluid.layers.sigmoid(paddle.clip(\ emb, min=-15.0, max=15.0), name="similarity_norm")\ binary_predict = fluid.layers.concat(input=[\ paddle.subtract(\ @@ -1574,7 +1574,7 @@ class FleetUtil: label = fluid.layers.data(name="click", shape=[-1, 1],\ dtype="int64", lod_level=0, append_batch_size=False) emb = my_slot_net(slots, label) # emb can be fc layer of size 1 - similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\ + similarity_norm = 
fluid.layers.sigmoid(paddle.clip(\ emb, min=-15.0, max=15.0), name="similarity_norm")\ binary_predict = fluid.layers.concat(input=[\ paddle.subtract(\ diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 264c8ce6da94e8af7fb1d3b27c429880983f8bf8..c11a541df5326794a72390086442664aee26a142 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -63,10 +63,6 @@ __all__ = [ 'fc', 'embedding', 'autoincreased_step_counter', - 'clip', - 'clip_by_norm', - 'merge_selected_rows', - 'get_tensor_from_selected_rows', ] OP_NAMEMAPPING = { @@ -997,199 +993,3 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): ) return out - - -@templatedoc() -def clip(x, min, max, name=None): - """ - :old_api: paddle.fluid.layers.clip - - ${comment} - - Args: - x(${x_type}): ${x_comment} - min(float): ${min_comment} - max(float): ${max_comment} - name(str, optional): The default value is None. - Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` - - Returns: - ${out_comment} - - Return Type: - ${out_type} - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - input = fluid.data( - name='data', shape=[1], dtype='float32') - reward = fluid.layers.clip(x=input, min=-1.0, max=1.0) - """ - - helper = LayerHelper("clip", **locals()) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'clip') - - if name is None: - name = unique_name.generate_with_ignorable_key( - ".".join([helper.name, 'tmp']) - ) - - out = helper.create_variable( - type=x.type, name=name, dtype=x.dtype, persistable=False - ) - - helper.append_op( - type="clip", - inputs={"X": x}, - attrs={"min": min, "max": max}, - outputs={"Out": out}, - ) - - return out - - -@templatedoc() -def clip_by_norm(x, max_norm, name=None): - """ - ${comment} - - Args: - x(${x_type}): ${x_comment} - max_norm(${max_norm_type}): ${max_norm_comment} - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: - Tensor: - - out(${out_type}): ${out_comment} - - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - - input = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32') - reward = fluid.layers.clip_by_norm(x=input, max_norm=1.0) - # [[0.5, 0.5], [0.5, 0.5]] - """ - - if in_dygraph_mode(): - return _C_ops.clip_by_norm(x, max_norm) - else: - helper = LayerHelper("clip_by_norm", **locals()) - check_variable_and_dtype(x, 'X', ['float32', 'float16'], 'clip_by_norm') - check_type(max_norm, 'max_norm', (float), 'clip_by_norm') - - if name is None: - name = unique_name.generate_with_ignorable_key( - ".".join([helper.name, 'tmp']) - ) - - out = helper.create_variable( - type=x.type, name=name, dtype=x.dtype, persistable=False - ) - - helper.append_op( - type="clip_by_norm", - inputs={"X": x}, - attrs={"max_norm": max_norm}, - outputs={"Out": out}, - ) - - return out - - -@templatedoc() -def merge_selected_rows(x, name=None): - """ - ${comment} - - Args: - x(${x_type}): ${x_comment} - name(basestring|None): Name of the output. - - Returns: - out(${out_type}): ${out_comment} - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - b = fluid.default_main_program().global_block() - var = b.create_var( - name="X", dtype="float32", persistable=True, - type=fluid.core.VarDesc.VarType.SELECTED_ROWS) - y = fluid.layers.merge_selected_rows(var) - """ - if in_dygraph_mode(): - return _C_ops.merge_selected_rows(x) - else: - helper = LayerHelper("merge_selected_rows", **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type="merge_selected_rows", - inputs={"X": x}, - attrs={}, - outputs={"Out": out}, - ) - return out - - -@templatedoc() -def get_tensor_from_selected_rows(x, name=None): - """ - This operator gets tensor data from input with SelectedRows type, and outputs a LoDTensor. - - .. code-block:: text - - input x is SelectedRows: - x.rows = [0, 5, 5, 4, 19] - x.height = 20 - x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]] - - Output is LoDTensor: - out.shape = [5, 2] - out.data = [[1, 1], - [2, 2], - [2, 2], - [3, 3], - [6, 6]] - - Args: - x(SelectedRows): Input with SelectedRows type. The data type is float32, float64, int32 or int64. - name(str, optional): The default value is None. Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Variable: LoDTensor transformed from SelectedRows. The data type is same with input. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - b = fluid.default_main_program().global_block() - input = b.create_var(name="X", dtype="float32", persistable=True, type=fluid.core.VarDesc.VarType.SELECTED_ROWS) - out = fluid.layers.get_tensor_from_selected_rows(input) - """ - - check_type(x, 'x', Variable, 'get_tensor_from_selected_rows') - if x.type != core.VarDesc.VarType.SELECTED_ROWS: - raise TypeError( - "The type of 'x' in get_tensor_from_selected_rows must be SELECTED_ROWS." - ) - helper = LayerHelper('get_tensor_from_selected_rows', **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='get_tensor_from_selected_rows', - inputs={'X': x}, - outputs={'Out': out}, - attrs={}, - ) - return out diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 3e59ca2472ab4decea0dc6f93cbb5fa1492ec0c4..cbbe8dbadef12f6584371c7fcd500b2c3b0b7c5a 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -38,13 +38,6 @@ from .backward import ( _append_grad_suffix_, _get_no_grad_set_name, ) -from .clip import ( - GradientClipBase, - GradientClipByNorm, - error_clip_callback, - append_gradient_clip_ops, - ClipGradByGlobalNorm, -) from .framework import program_guard from .initializer import Constant from .layer_helper import LayerHelper @@ -160,7 +153,7 @@ class Optimizer: ) if grad_clip is not None: - if not isinstance(grad_clip, GradientClipBase): + if not isinstance(grad_clip, paddle.nn.clip.GradientClipBase): raise TypeError( "'grad_clip' should be an instance of GradientClipBase's derived class" ) @@ -1030,7 +1023,7 @@ class Optimizer: params_grads.append((param, grad_var)) else: if callbacks is None: - callbacks = [error_clip_callback] + callbacks = [paddle.nn.clip.error_clip_callback] else: assert isinstance(callbacks, list) program = loss.block.program @@ -1260,7 +1253,7 @@ class Optimizer: # NOTE(zhiqiu): currently, only support ClipGradByGlobalNorm and without regularization. 
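# Illustrative sketch (not part of this patch): merge_selected_rows and
# get_tensor_from_selected_rows, deleted from fluid.layers above, are assumed
# to be reachable via paddle.nn.clip, matching the call sites updated elsewhere
# in this patch (group_sharded_utils, test_len). Static-graph usage:
import paddle
import paddle.fluid as fluid
from paddle.nn import clip

paddle.enable_static()
block = fluid.default_main_program().global_block()
var = block.create_var(
    name="X",
    dtype="float32",
    persistable=True,
    type=fluid.core.VarDesc.VarType.SELECTED_ROWS,
)
y = clip.merge_selected_rows(var)          # still a SelectedRows variable
z = clip.get_tensor_from_selected_rows(y)  # dense tensor built from the rows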
if self._flatten_param_grads and self.regularization is None: if self._grad_clip is None or isinstance( - self._grad_clip, ClipGradByGlobalNorm + self._grad_clip, paddle.nn.ClipGradByGlobalNorm ): params_grads = self.flatten_param_grads(params_grads) @@ -1268,7 +1261,7 @@ class Optimizer: if self._grad_clip is not None: params_grads = self._grad_clip(params_grads) else: - params_grads = append_gradient_clip_ops(params_grads) + params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads) # Add regularization if any params_grads = self.append_regularization_ops( diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py index dcfe477a76b3e24ec8df2e02d3fe07121f16d9cf..65483d1c6adf68dba55e43180e9993d712193811 100644 --- a/python/paddle/fluid/tests/test_error_clip.py +++ b/python/paddle/fluid/tests/test_error_clip.py @@ -38,13 +38,13 @@ with fluid.program_guard(main_program=prog): prog_clip = prog.clone() prog_clip.block(0).var(hidden1.name)._set_error_clip( - fluid.clip.ErrorClipByValue(max=CLIP_MAX, min=CLIP_MIN) + paddle.nn.clip.ErrorClipByValue(max=CLIP_MAX, min=CLIP_MIN) ) avg_cost_clip = prog_clip.block(0).var(avg_cost.name) fluid.backward.append_backward(loss=avg_cost) fluid.backward.append_backward( - loss=avg_cost_clip, callbacks=[fluid.clip.error_clip_callback] + loss=avg_cost_clip, callbacks=[paddle.nn.clip.error_clip_callback] ) hidden1_grad = prog.block(0).var(hidden1.name + "@GRAD") diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py index f974709ce87abec8679b3846746bbe087e495778..f97faed1d584fce94d8715323e525fea7ac57d49 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py @@ -122,7 +122,7 @@ class TestDistMnist2x2(TestDistRunnerBase): opt = paddle.optimizer.AdamW( learning_rate=lr_val, - grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0), + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), ) acc_steps = 2 # accumulated steps for pipeline diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py index ecc71abe6252cd864f997a0059837efc73a66990..170243fc962839f063a0aafc39adef62fc0d4737 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py @@ -122,7 +122,7 @@ class TestDistMnist2x2(TestDistRunnerBase): opt = fluid.optimizer.Momentum( learning_rate=lr_val, momentum=0.9, - grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0), + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), ) acc_steps = 2 # accumulated steps for pipeline diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py index c7b44fe305d25aa2cac4fd5f4f8ffda56b479940..0d499393f12155aa1d0b73af9f45e2f98a0d2f56 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py @@ -15,10 +15,10 @@ import unittest import paddle -import paddle.fluid.clip as clip import paddle.fluid.framework as framework import paddle.fluid.optimizer as optimizer import paddle.fluid.regularizer as 
regularizer +import paddle.nn.clip as clip paddle.enable_static() @@ -76,7 +76,7 @@ class TestDGCMomentumOptimizer(unittest.TestCase): rampup_begin_step=0, num_trainers=2, regularization=regularization, - grad_clip=clip.GradientClipByNorm(1.0), + grad_clip=clip.ClipGradByNorm(1.0), ) if use_recompute: @@ -144,14 +144,14 @@ class TestDGCMomentumOptimizer(unittest.TestCase): print("dgc regular_coeff=" + str(coeff)) def test_tpyeError(self): - # the type of DGCMomentumOptimizer(grad_clip=) must be 'GradientClipByNorm' + # the type of DGCMomentumOptimizer(grad_clip=) must be 'ClipGradByNorm' with self.assertRaises(TypeError): dgc_momentum_optimizer = self.MockDGCMomentum( learning_rate=0.01, momentum=0.2, rampup_begin_step=0, num_trainers=2, - grad_clip=clip.GradientClipByGlobalNorm(1.0), + grad_clip=clip.ClipGradByGlobalNorm(1.0), ) def test_momentum_without_dgc(self): diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py index eee1235670805f8d66b8206bbdd954129adfba97..0982ab86117c9f1302bb604737ec143902963725 100755 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py @@ -354,7 +354,7 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer): } strategy.fuse_all_reduce_ops = True strategy.fuse_grad_size_in_MB = 32 - clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0) + clip = paddle.nn.ClipGradByGlobalNorm(1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip @@ -552,7 +552,7 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer): strategy.fuse_all_reduce_ops = True strategy.fuse_grad_size_in_MB = 32 strategy.fuse_grad_merge = True - clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0) + clip = paddle.nn.ClipGradByGlobalNorm(1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip @@ -940,7 +940,7 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer): } strategy.fuse_all_reduce_ops = True strategy.fuse_grad_size_in_MB = 32 - clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0) + clip = paddle.nn.ClipGradByGlobalNorm(1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip @@ -1044,7 +1044,7 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer): } strategy.fuse_all_reduce_ops = True strategy.fuse_grad_size_in_MB = 32 - clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0) + clip = paddle.nn.ClipGradByGlobalNorm(1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py index d59c074c03f11dd5ce9acc635216a417e7437f07..46b5fe9ed4b6a641d21d42a0cf1d730314f5a964 100755 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py @@ -640,7 +640,7 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): ) avg_cost, strategy = self.net(train_prog, startup_prog) self.set_strategy(strategy, 'sharding') - clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) self.optimizer( avg_cost, strategy, train_prog, 
startup_prog, grad_clip=clip ) @@ -1309,7 +1309,7 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): "micro_batch_size": 2, "accumulate_steps": 4, } - clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip ) @@ -1547,7 +1547,7 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): "micro_batch_size": 2, "accumulate_steps": 4, } - clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) self.optimizer( avg_cost, strategy, diff --git a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py index ff9122b1191b64e36ddb40c93f9770d0d5135646..3fa9c12529272c495644508e947d63c6a3f973b2 100644 --- a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py @@ -22,8 +22,8 @@ import paddle import paddle.distributed.fleet as fleet import paddle.fluid.core as core from paddle.distributed.fleet.meta_optimizers.common import CollectiveHelper -from paddle.fluid.clip import ClipGradBase, _clip_by_global_norm_using_mp_type from paddle.incubate import DistributedFusedLamb +from paddle.nn.clip import ClipGradBase, _clip_by_global_norm_using_mp_type from paddle.vision.models import resnet18 as resnet diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py index de3508afcbe2bca43c4a5e762f4519a7e2e4c714..218e3ed4326ad5c0e9282b4dc0026464304ab363 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py @@ -19,6 +19,7 @@ import numpy as np import paddle import paddle.fluid as fluid from paddle.jit.dy2static import Call +from paddle.nn import clip SEED = 2020 np.random.seed(SEED) @@ -89,11 +90,11 @@ def len_with_selected_rows(place): type=fluid.core.VarDesc.VarType.SELECTED_ROWS, ) # y is Variable(SelectedRows) - y = fluid.layers.merge_selected_rows(var) + y = clip.merge_selected_rows(var) y_len = Call(len)(y) # z is inner tensor with shape [4, 2] - z = fluid.layers.get_tensor_from_selected_rows(y) + z = clip.get_tensor_from_selected_rows(y) z_len = Call(len)(z) # set data for selected_rows diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py index a0a45ddbde2bea07d67b7d40299e695b7ce11ff5..64d0d816ba0a5bf0a2e54d5096aeafb2f900f999 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py @@ -22,8 +22,8 @@ from seq2seq_dygraph_model import AttentionModel, BaseModel from seq2seq_utils import Seq2SeqModelHyperParams, get_data_iter import paddle.fluid as fluid -from paddle.fluid.clip import GradientClipByGlobalNorm from paddle.jit import ProgramTranslator +from paddle.nn import ClipGradByGlobalNorm place = ( fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace() @@ -71,7 +71,7 @@ def train(args, attn_model=False): dropout=args.dropout, ) - gloabl_norm_clip = GradientClipByGlobalNorm(args.max_grad_norm) + gloabl_norm_clip = ClipGradByGlobalNorm(args.max_grad_norm) optimizer = fluid.optimizer.SGD( args.learning_rate, 
parameter_list=model.parameters(), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py index b1890ea95ab9782187c66d58027422e7481b0602..24a63751cfec431d4335baa793543da3ba48d83d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py @@ -127,7 +127,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Clip( ): def set_params(self): self.operand = paddle.add - self.act = fluid.layers.clip + self.act = paddle.clip self.act_alpha = 0.0 self.act_beta = 10.0 @@ -219,7 +219,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Clip( ): def set_params(self): self.operand = paddle.subtract - self.act = fluid.layers.clip + self.act = paddle.clip self.act_alpha = 0.0 self.act_beta = 10.0 @@ -319,7 +319,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Clip( ): def set_params(self): self.operand = paddle.multiply - self.act = fluid.layers.clip + self.act = paddle.clip self.act_alpha = 0.0 self.act_beta = 10.0 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py index 3a39c84141ced2c0f0538350b3d70c7d9bcaf9c3..0c205fbee7c87079035221e457663c24b0234ced 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py @@ -106,7 +106,7 @@ class TensorRTSubgraphPassHardSwishPluginTest( class TensorRTSubgraphPassClipTest(TensorRTSubgraphPassActivationTest): def append_act(self, x): - return fluid.layers.clip(x, 0, 1) + return paddle.clip(x, 0, 1) class TensorRTSubgraphPassTanhTest(TensorRTSubgraphPassActivationTest): diff --git a/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py index 8b13546d9a2852009dfba4744b5bdfaaac07d3d0..122429a7f8454cd687b90b3e503b531727d478f0 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py @@ -117,13 +117,13 @@ class TestClipOpError(unittest.TestCase): input_data = np.random.random((2, 4)).astype("float32") def test_Variable(): - fluid.layers.clip(x=input_data, min=-1.0, max=1.0) + paddle.clip(x=input_data, min=-1.0, max=1.0) self.assertRaises(TypeError, test_Variable) def test_dtype(): x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32') - fluid.layers.clip(x=x2, min=-1.0, max=1.0) + paddle.clip(x=x2, min=-1.0, max=1.0) self.assertRaises(TypeError, test_dtype) paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 912f52969d712e1a03da97b9a9d119ab99161b22..d0e6c98e25a422c8eeeccb1feb1544b144152316 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -686,7 +686,7 @@ class TestAdamOpV2(unittest.TestCase): value = np.arange(26).reshape(2, 13).astype("float32") a = fluid.dygraph.to_variable(value) linear = paddle.nn.Linear(13, 5) - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) adam = paddle.optimizer.Adam( 0.1, parameters=linear.parameters(), grad_clip=clip ) diff --git 
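# Illustrative sketch (not part of this patch): the inference and NPU tests
# above now call the public paddle.clip API in place of the removed
# fluid.layers.clip; equivalent dygraph usage:
import paddle

x = paddle.to_tensor([[2.0, -3.5], [0.5, 7.0]], dtype='float32')
out = paddle.clip(x, min=-1.0, max=1.0)
# out -> [[1.0, -1.0], [0.5, 1.0]]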
a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py index a9d79f81bf310b9a1d94202c655571c948857909..ce3dd7509ce1d8cfddfc06af95a7f2d2358c8b5c 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py @@ -20,12 +20,13 @@ from op_test import OpTest import paddle import paddle.fluid as fluid import paddle.fluid.core as core +from paddle.nn import clip class TestClipByNormOp(OpTest): def setUp(self): self.max_relative_error = 0.006 - self.python_api = fluid.layers.clip_by_norm + self.python_api = clip.clip_by_norm self.init_dtype() self.initTestCase() input = np.random.random(self.shape).astype(self.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py index a25edccb97a4edca00c4f24e4cd020c11062c449..359220a7a601f131f89e68c6da8b424d20070c3d 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_op.py @@ -128,15 +128,9 @@ class TestClipOpError(unittest.TestCase): input_data = np.random.random((2, 4)).astype("float32") def test_Variable(): - fluid.layers.clip(x=input_data, min=-1.0, max=1.0) + paddle.clip(x=input_data, min=-1.0, max=1.0) self.assertRaises(TypeError, test_Variable) - - def test_dtype(): - x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32') - fluid.layers.clip(x=x2, min=-1.0, max=1.0) - - self.assertRaises(TypeError, test_dtype) paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index d5ad18fc434cbe9075604b9bef0798afeaa0c8a6..c6bdd59d496634744da2673d7f2ca8b103346376 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -584,7 +584,7 @@ class TestL2Decay(TranspilerTest): def filter(param): return param.name == "fc_w" - clip = fluid.clip.GradientClipByValue(0.1, need_clip=filter) + clip = paddle.nn.ClipGradByValue(0.1, need_clip=filter) sgd_optimizer.minimize(avg_cost, grad_clip=clip) def transpiler_test_impl(self): diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py index d0256b5dfb8994c3ee27fb0c2c29ab3bd136d4ac..80bc977f091bac9e57c5e4774e5236a96115c22c 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py @@ -504,8 +504,8 @@ class PaddingRNNTestBase(unittest.TestCase): self.feed_order, ) = res_vars - fluid.clip.set_gradient_clip( - clip=fluid.clip.GradientClipByGlobalNorm( + paddle.nn.clip.set_gradient_clip( + clip=paddle.nn.ClipGradByGlobalNorm( clip_norm=config.max_grad_norm ) ) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_executor.py index e81fc34ea2ca0fb8eea864a51791bf7b13a5abc0..400009f820de3c59cafb87582ca43c77dc7ae176 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor.py @@ -64,7 +64,7 @@ class TestFleetExecutor(unittest.TestCase): ) opt = paddle.optimizer.AdamW( learning_rate=lr_val, - grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0), + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), ) opt.minimize(loss) # 
TODO: section_program will be removed in the future diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py index 0de28e9839efa344244eeba0e60ad93afdca0291..d24348b7d77b58234f2dbc1ef9d7ae7d563a19d3 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py @@ -64,7 +64,7 @@ class TestFleetExecutor(unittest.TestCase): ) opt = paddle.optimizer.AdamW( learning_rate=lr_val, - grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0), + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), ) opt.minimize(loss) # TODO: section_program will be removed in the future diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py index 36a85e2d74fc7af46061dc3ccef0e1255cdaa056..46eb0dc6f0bf8428ca0b5b6989fb6444ca5b2495 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py @@ -47,7 +47,7 @@ class TestFleetExecutor(unittest.TestCase): ) opt = paddle.optimizer.AdamW( learning_rate=lr_val, - grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0), + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), ) opt.minimize(loss) # TODO: section_program will be removed in the future diff --git a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py index ab5b9096dcc8ad1c3df4485e3c805abbb3a05eec..d1e3e6df335b002a64d4dc33e5de001dab8c5546 100644 --- a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py +++ b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py @@ -20,6 +20,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid import Program, program_guard from paddle.fluid.op import Operator +from paddle.nn import clip class TestGetTensorFromSelectedRowsError(unittest.TestCase): @@ -31,12 +32,12 @@ class TestGetTensorFromSelectedRowsError(unittest.TestCase): x_data = np.random.random((2, 4)).astype("float32") def test_Variable(): - fluid.layers.get_tensor_from_selected_rows(x=x_data) + clip.get_tensor_from_selected_rows(x=x_data) self.assertRaises(TypeError, test_Variable) def test_SELECTED_ROWS(): - fluid.layers.get_tensor_from_selected_rows(x=x_var) + clip.get_tensor_from_selected_rows(x=x_var) self.assertRaises(TypeError, test_SELECTED_ROWS) diff --git a/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py b/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py index db34123d3bdd8f3faf27f5a8ba51ddb881fcbe87..4cb4b5d773b48ded81187c29993ec9912cb56457 100644 --- a/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py +++ b/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py @@ -17,12 +17,8 @@ import unittest import numpy as np import paddle.fluid as fluid -from paddle.fluid.clip import ( - GradientClipByGlobalNorm, - GradientClipByNorm, - GradientClipByValue, -) from paddle.fluid.dygraph.base import to_variable +from paddle.nn import ClipGradByGlobalNorm, ClipGradByNorm, ClipGradByValue class TestGradClipByGlobalNorm(unittest.TestCase): @@ -67,7 +63,7 @@ class TestGradClipByGlobalNorm(unittest.TestCase): def get_dygrap_global_norm_result(self): with 
fluid.dygraph.guard(): - gloabl_norm_clip = GradientClipByGlobalNorm(self.max_global_norm) + gloabl_norm_clip = ClipGradByGlobalNorm(self.max_global_norm) p_g_var = [] for p, g in self.para_and_grad: new_p = to_variable(p) @@ -142,7 +138,7 @@ class TestGradClipByNorm(unittest.TestCase): def get_dygrap_norm_result(self): with fluid.dygraph.guard(): - norm_clip = GradientClipByNorm(self.max_norm) + norm_clip = ClipGradByNorm(self.max_norm) p_g_var = [] for p, g in self.para_and_grad: new_p = to_variable(p) @@ -212,9 +208,7 @@ class TestGradClipByValue(unittest.TestCase): def get_dygrap_clip_result(self): with fluid.dygraph.guard(): - value_clip = GradientClipByValue( - max=self.max_value, min=self.min_value - ) + value_clip = ClipGradByValue(max=self.max_value, min=self.min_value) p_g_var = [] for p, g in self.para_and_grad: new_p = to_variable(p) diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index 2243ae8c45602a694e1ce79e72cbc033abaf1636..b5b0b20c6f48bc841bd0dfb5f9a61449cadc93bf 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -20,7 +20,7 @@ from fake_reader import fake_imdb_reader import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.fluid.clip import _allow_pure_fp16_global_norm_clip +from paddle.nn.clip import _allow_pure_fp16_global_norm_clip paddle.enable_static() @@ -173,9 +173,9 @@ class TestGradientClipByGlobalNorm(TestGradientClip): # test whether the output is right when use 'set_gradient_clip' def test_old_gradient_clip(self): def func(params_grads): - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) - fluid.clip.set_gradient_clip(clip) - return fluid.clip.append_gradient_clip_ops(params_grads) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm) + paddle.nn.clip.set_gradient_clip(clip) + return paddle.nn.clip.append_gradient_clip_ops(params_grads) self.clip_gradient = func self.check_gradient_clip(fluid.CPUPlace()) @@ -183,7 +183,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip): # test whether the output is right when use grad_clip def test_new_gradient_clip(self): def func(params_grads): - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm) return clip(params_grads) self.clip_gradient = func @@ -192,7 +192,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip): # test whether the output is right when use grad_clip under float64 def test_new_gradient_clip_fp64(self): def func(params_grads): - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm) return clip(params_grads) self.clip_gradient = func @@ -201,15 +201,15 @@ class TestGradientClipByGlobalNorm(TestGradientClip): # invoke 'set_gradient_clip' in a wrong order def test_wrong_API_order(self): def backward_func(cost): - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0) - fluid.clip.set_gradient_clip(clip) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=5.0) + paddle.nn.clip.set_gradient_clip(clip) sgd_optimizer = fluid.optimizer.SGD( learning_rate=0.01, grad_clip=clip ) # if 'set_gradient_clip' and 'optimize(grad_clip)' together, 'set_gradient_clip' will be ineffective sgd_optimizer.minimize(cost) # 'set_gradient_clip' must before 'minimize', otherwise, 'set_gradient_clip' will be ineffective - 
fluid.clip.set_gradient_clip(clip) + paddle.nn.clip.set_gradient_clip(clip) self.backward_and_optimize = backward_func for place in self.get_places(): @@ -269,7 +269,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip): with fluid.program_guard( main_program=prog, startup_program=startup_program ): - clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm) + clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) x = ( fluid.default_main_program() .global_block() @@ -313,7 +313,7 @@ class TestGradientClipByNorm(TestGradientClip): # test whether the output is right when use grad_clip def test_gradient_clip(self): def func(params_grads): - clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm) + clip = paddle.nn.ClipGradByNorm(clip_norm=self.clip_norm) return clip(params_grads) self.clip_gradient = func @@ -321,7 +321,7 @@ class TestGradientClipByNorm(TestGradientClip): # if grad is None or not need clip def test_none_grad(self): - clip = fluid.clip.GradientClipByNorm(self.clip_norm) + clip = paddle.nn.ClipGradByNorm(self.clip_norm) x = ( fluid.default_main_program() .global_block() @@ -371,7 +371,7 @@ class TestGradientClipByValue(TestGradientClip): # test whether the output is right when use grad_clip def test_gradient_clip(self): def func(params_grads): - clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min) + clip = paddle.nn.ClipGradByValue(max=self.max, min=self.min) return clip(params_grads) self.clip_gradient = func @@ -379,7 +379,7 @@ class TestGradientClipByValue(TestGradientClip): # if grad is None or not need clip def test_none_grad(self): - clip = fluid.clip.GradientClipByValue(self.max, self.min) + clip = paddle.nn.ClipGradByValue(self.max, self.min) x = ( fluid.default_main_program() .global_block() @@ -419,7 +419,7 @@ class TestDygraphGradientClip(unittest.TestCase): sgd_optimizer = fluid.optimizer.SGD( learning_rate=0.0, parameter_list=linear.parameters(), - grad_clip=fluid.clip.GradientClipByGlobalNorm(0.1), + grad_clip=paddle.nn.ClipGradByGlobalNorm(0.1), ) self.check_clip_result(loss, sgd_optimizer) @@ -430,12 +430,8 @@ class TestDygraphGradientClip(unittest.TestCase): class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip): def setUp(self): self.clip_norm = 0.8 - self.clip1 = fluid.clip.GradientClipByGlobalNorm( - clip_norm=self.clip_norm - ) - self.clip2 = fluid.clip.GradientClipByGlobalNorm( - clip_norm=self.clip_norm - ) + self.clip1 = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm) + self.clip2 = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm) def check_clip_result(self, loss, optimizer): # if grad is None @@ -476,7 +472,7 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip): class TestDygraphGradientClipByNorm(TestDygraphGradientClip): def setUp(self): self.clip_norm = 0.8 - self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm) + self.clip = paddle.nn.ClipGradByNorm(clip_norm=self.clip_norm) def check_clip_result(self, loss, optimizer): # if grad is None @@ -506,7 +502,7 @@ class TestDygraphGradientClipByValue(TestDygraphGradientClip): def setUp(self): self.max = 0.2 self.min = 0.1 - self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min) + self.clip = paddle.nn.ClipGradByValue(max=self.max, min=self.min) def check_clip_result(self, loss, optimizer): # if grad is None @@ -572,7 +568,7 @@ class TestDygraphGradientClipFP16(unittest.TestCase): params_grads.append((param, param._grad_ivar())) _, grads = zip(*params_grads) # clip grads - clip = 
fluid.clip.GradientClipByGlobalNorm(clip_norm=0.8) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=0.8) params_grads = clip(params_grads) _, grads_clip = zip(*params_grads) # param update @@ -616,7 +612,7 @@ class TestDygraphGradientClipFP64(unittest.TestCase): params_grads.append((param, param._grad_ivar())) _, grads = zip(*params_grads) # clip grads - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.1) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=0.1) params_grads = clip(params_grads) _, grads_clip = zip(*params_grads) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py index ecb35e8eaf950cb3f88bea4fecf70c42d1f45363..54cba6eb800295e6a69c9e64be53d7798743383a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -361,7 +361,7 @@ class TestImperativeAutoPrune(unittest.TestCase): place = fluid.CPUPlace() with fluid.dygraph.guard(place): model = MyLayer(size, vocab_size, size) - grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001) + grad_clip = paddle.nn.ClipGradByGlobalNorm(0.001) optimizer = fluid.optimizer.AdamOptimizer( 0.001, parameter_list=model.parameters(), grad_clip=grad_clip ) @@ -380,7 +380,7 @@ class TestImperativeAutoPrune(unittest.TestCase): with fluid.dygraph.guard(place): model = MyLayer2(size, vocab_size, size) - grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001) + grad_clip = paddle.nn.ClipGradByGlobalNorm(0.001) optimizer = fluid.optimizer.AdamOptimizer( 0.001, parameter_list=model.parameters(), grad_clip=grad_clip ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py index cea97398d17159aa0756d5e985b77de0db772ddc..5cc7f63eb7883b1dc260445dcd4f9f1a98c28b99 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py @@ -52,7 +52,7 @@ class TestSimpleNet(unittest.TestCase): fluid.set_flags( {'FLAGS_sort_sum_gradient': sort_sum_gradient} ) - # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0) + # grad_clip = paddle.nn.ClipGradByGlobalNorm(5.0) input_word = np.array([[1, 2], [2, 1]]).astype('int64') input = paddle.to_tensor(input_word) @@ -91,7 +91,7 @@ class TestSimpleNet(unittest.TestCase): fluid.set_flags( {'FLAGS_sort_sum_gradient': sort_sum_gradient} ) - grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0) + grad_clip = paddle.nn.ClipGradByGlobalNorm(5.0) input_word = np.array([[1, 2], [2, 1]]).astype('int64') input = to_variable(input_word) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py index 9efb334ac7dd5e0618491c98aee1ae0e2c5a83e7..e4982c42e4e100a3008c9431621c505a042d237e 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py @@ -131,13 +131,13 @@ class TestClipOpError(unittest.TestCase): input_data = np.random.random((2, 4)).astype("float32") def test_Variable(): - fluid.layers.clip(x=input_data, min=-1.0, max=1.0) + paddle.clip(x=input_data, min=-1.0, max=1.0) self.assertRaises(TypeError, test_Variable) def test_dtype(): x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32') - fluid.layers.clip(x=x2, min=-1.0, max=1.0) + paddle.clip(x=x2, min=-1.0, max=1.0) self.assertRaises(TypeError, 
test_dtype) paddle.disable_static() diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 86cab526398dd4412227a1b99dd562bd4fcb1cbb..52a0f8b4b3c4f4790008ed3224a3696da1f41cda 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -1535,7 +1535,7 @@ class Model: assert isinstance( self._optimizer._grad_clip, (paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm), - ), "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently." + ), "Only ClipGradByNorm and ClipGradByGlobalNorm are supported in amp training with level=O2 currently." self._adapter._amp_custom_lists = {} self._adapter._amp_configs = {} diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py index ca4922700b8f49f4b8a3a9222ce0afcdb9228b1f..6bee79b871cd5e721be31545c4037afa6a5668ea 100644 --- a/python/paddle/incubate/distributed/models/moe/grad_clip.py +++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py @@ -15,13 +15,14 @@ import paddle import paddle.distributed as dist from paddle.fluid import core, layers -from paddle.fluid.clip import ClipGradBase, _squared_l2_norm from paddle.fluid.dygraph import base as imperative_base +from paddle.nn import clip +from paddle.nn.clip import ClipGradBase, _squared_l2_norm class ClipGradForMOEByGlobalNorm(ClipGradBase): r""" - The Algrithm is the same as paddle.fluid.clip.ClipGradByGlobalNorm + The Algrithm is the same as paddle.nn.ClipGradByGlobalNorm Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in :math:`t\_list` , and limit it to ``clip_norm`` . @@ -113,8 +114,8 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase): continue merge_grad = g if g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(g) - merge_grad = layers.get_tensor_from_selected_rows(merge_grad) + merge_grad = clip.merge_selected_rows(g) + merge_grad = clip.get_tensor_from_selected_rows(merge_grad) sum_square = _squared_l2_norm(merge_grad) if sum_square.dtype == core.VarDesc.VarType.FP16: sum_square_list_fp16.append(sum_square) diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index bc2837fa2fe58f8b2e5dcaddf59e806471823b29..9aa51cd8122e68114e610714672980ba132f9629 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -16,11 +16,11 @@ import os import paddle from paddle.fluid import core, framework, unique_name -from paddle.fluid.clip import ClipGradByGlobalNorm from paddle.fluid.executor import global_scope from paddle.fluid.framework import Variable, name_scope from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.optimizer import Optimizer +from paddle.nn import ClipGradByGlobalNorm def init_communicator(block, rank, ranks, ring_id): diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py index 61143175fd4af5070ab72036de7c0cc47778aa43..10eeb6319063c1468b20bc2b03c0528e82b77bf6 100644 --- a/python/paddle/nn/clip.py +++ b/python/paddle/nn/clip.py @@ -12,9 +12,1074 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# TODO: define the functions to clip gradient of parameter
-from ..fluid.clip import ClipGradByGlobalNorm  # noqa: F401
-from ..fluid.clip import ClipGradByNorm  # noqa: F401
-from ..fluid.clip import ClipGradByValue  # noqa: F401
+import copy
+import warnings
+
+import paddle
+import paddle.autograd as imperative_base
+from paddle import _C_ops, _legacy_C_ops
+from paddle.common_ops_import import Variable, check_type, default_main_program
+from paddle.fluid import core, framework, layers, unique_name
+from paddle.fluid.data_feeder import check_variable_and_dtype
+from paddle.framework import LayerHelper, _non_static_mode, in_dygraph_mode
+from paddle.tensor.layer_function_generator import templatedoc
 
 __all__ = []
+
+
+@templatedoc()
+def clip_by_norm(x, max_norm, name=None):
+    """
+    ${comment}
+
+    Args:
+        x(${x_type}): ${x_comment}
+        max_norm(${max_norm_type}): ${max_norm_comment}
+        name(str, optional): For detailed information, please refer
+            to :ref:`api_guide_Name`. Usually name does not need to be set and
+            is None by default.
+
+    Returns:
+        Tensor:
+
+        out(${out_type}): ${out_comment}
+
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.nn import clip
+
+            input = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32')
+            reward = clip.clip_by_norm(x=input, max_norm=1.0)
+            # [[0.5, 0.5], [0.5, 0.5]]
+    """
+
+    if in_dygraph_mode():
+        return _C_ops.clip_by_norm(x, max_norm)
+    if _non_static_mode():
+        return _legacy_C_ops.clip_by_norm(x, 'max_norm', max_norm)
+
+    helper = LayerHelper("clip_by_norm", **locals())
+    check_variable_and_dtype(x, 'X', ['float32', 'float16'], 'clip_by_norm')
+    check_type(max_norm, 'max_norm', (float), 'clip_by_norm')
+
+    if name is None:
+        name = unique_name.generate_with_ignorable_key(
+            ".".join([helper.name, 'tmp'])
+        )
+
+    out = helper.create_variable(
+        type=x.type, name=name, dtype=x.dtype, persistable=False
+    )
+
+    helper.append_op(
+        type="clip_by_norm",
+        inputs={"X": x},
+        attrs={"max_norm": max_norm},
+        outputs={"Out": out},
+    )
+
+    return out
+
+
+@templatedoc()
+def merge_selected_rows(x, name=None):
+    """
+    ${comment}
+
+    Args:
+        x(${x_type}): ${x_comment}
+        name(str, optional): Name of the output.
+
+    Returns:
+        out(${out_type}): ${out_comment}
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            from paddle.nn import clip
+
+            b = fluid.default_main_program().global_block()
+            var = b.create_var(name="X", dtype="float32", persistable=True,
+                               type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
+            y = clip.merge_selected_rows(var)
+    """
+    if in_dygraph_mode():
+        return _C_ops.merge_selected_rows(x)
+
+    if _non_static_mode():
+        return _legacy_C_ops.merge_selected_rows(x)
+
+    helper = LayerHelper("merge_selected_rows", **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type="merge_selected_rows",
+        inputs={"X": x},
+        attrs={},
+        outputs={"Out": out},
+    )
+    return out
+
+
+@templatedoc()
+def get_tensor_from_selected_rows(x, name=None):
+    """
+    Get tensor data from input with SelectedRows type, and outputs a Tensor.
+
+    .. code-block:: text
+
+        input x is SelectedRows:
+            x.rows = [0, 5, 5, 4, 19]
+            x.height = 20
+            x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]]
+
+        Output is LoDTensor:
+            out.shape = [5, 2]
+            out.data = [[1, 1],
+                        [2, 2],
+                        [2, 2],
+                        [3, 3],
+                        [6, 6]]
+
+    Args:
+        x(SelectedRows): Input with SelectedRows type. The data type is float32, float64, int32 or int64.
+        name(str, optional): The default value is None. Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name` .
+
+    Returns:
+        Variable: LoDTensor transformed from SelectedRows. The data type is the same as the input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            from paddle.nn import clip
+
+            b = fluid.default_main_program().global_block()
+            input = b.create_var(name="X", dtype="float32", persistable=True, type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
+            out = clip.get_tensor_from_selected_rows(input)
+    """
+
+    check_type(x, 'x', Variable, 'get_tensor_from_selected_rows')
+    if x.type != core.VarDesc.VarType.SELECTED_ROWS:
+        raise TypeError(
+            "The type of 'x' in get_tensor_from_selected_rows must be SELECTED_ROWS."
+        )
+    helper = LayerHelper('get_tensor_from_selected_rows', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='get_tensor_from_selected_rows',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={},
+    )
+    return out
+
+
+_clip_by_global_norm_using_mp_type_flag = False
+
+
+def _clip_by_global_norm_using_mp_type(*args):
+    global _clip_by_global_norm_using_mp_type_flag
+    assert len(args) <= 1
+    if len(args) == 1:
+        assert isinstance(args[0], bool)
+        old_value = _clip_by_global_norm_using_mp_type_flag
+        _clip_by_global_norm_using_mp_type_flag = args[0]
+        return old_value
+    else:
+        return _clip_by_global_norm_using_mp_type_flag
+
+
+def _cast_to_mp_type_if_enabled(x):
+    if (
+        x.dtype == core.VarDesc.VarType.FP16
+        or x.dtype == core.VarDesc.VarType.BF16
+    ) and _clip_by_global_norm_using_mp_type():
+        return x.astype(core.VarDesc.VarType.FP32)
+    else:
+        return x
+
+
+def _squared_l2_norm(x):
+    r"""
+    Return the squared L2 norm of a tensor.
+    """
+
+    x = _cast_to_mp_type_if_enabled(x)
+    if (
+        core.is_compiled_with_xpu()
+        or x.dtype == core.VarDesc.VarType.FP16
+        or x.dtype == core.VarDesc.VarType.BF16
+    ):
+        square = paddle.square(x)
+        sum_square = paddle.sum(square)
+        return sum_square
+
+    if in_dygraph_mode():
+        return _C_ops.squared_l2_norm(x)
+
+    op_type = 'squared_l2_norm'
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type)
+    helper = LayerHelper(op_type, **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+
+    inputs = {"X": x}
+    outputs = {'Out': out}
+    helper.append_op(type=op_type, inputs=inputs, outputs=outputs)
+    return out
+
+
+class BaseErrorClipAttr:
+    def __str__(self):
+        raise NotImplementedError()
+
+    def _append_clip_op(self, block, grad_name):
+        raise NotImplementedError()
+
+
+class ErrorClipByValue(BaseErrorClipAttr):
+    r"""
+    Clip tensor values to the range [min, max].
+
+    Given a tensor ``t`` (see Examples below), this operation clips its value \
+    to ``min`` and ``max`` inplace.
+
+    - Any values less than min are set to min.
+    - Any values greater than max are set to max.
+
+    Args:
+        max (float): The maximum value to clip by.
+        min (float, optional): The minimum value to clip by. If not set by user, \
+            it will be set to ``-max`` by the framework.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import paddle
+            paddle.enable_static()
+            BATCH_SIZE = 128
+            CLIP_MAX = 2e-6
+            CLIP_MIN = -1e-6
+            prog = fluid.framework.Program()
+            with fluid.program_guard(main_program=prog):
+                image = fluid.layers.data(
+                    name='x', shape=[784], dtype='float32')
+                hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
+                hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
+                predict = fluid.layers.fc(
+                    input=hidden2, size=10, act='softmax')
+                label = fluid.layers.data(name='y', shape=[1], dtype='int64')
+                cost = paddle.nn.functional.cross_entropy(input=predict, label=label)
+                avg_cost = paddle.mean(cost)
+            prog_clip = prog.clone()
+            prog_clip.block(0).var(hidden1.name)._set_error_clip(
+                paddle.nn.clip.ErrorClipByValue(
+                    max=CLIP_MAX, min=CLIP_MIN)
+            )
+    """
+
+    def __init__(self, max, min=None):
+        max = float(max)
+        if min is None:
+            min = -max
+        else:
+            min = float(min)
+        self.max = max
+        self.min = min
+
+    def __str__(self):
+        return "ByValue, min=%f, max=%f" % (self.min, self.max)
+
+    def _append_clip_op(self, block, grad_name):
+        clip_op_desc = block.desc.append_op()
+        clip_op_desc.set_type("clip")
+        clip_op_desc.set_input("X", [grad_name])
+        clip_op_desc.set_output("Out", [grad_name])
+        clip_op_desc._set_attr("min", self.min)
+        clip_op_desc._set_attr("max", self.max)
+
+
+def error_clip_callback(block, context):
+    # the context is a grad_to_var map
+    grad_to_var = context
+    op_desc = block.desc.op(block.desc.op_size() - 1)
+    for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
+        fwd_var = block._var_recursive(grad_to_var[grad_n])
+        error_clip = getattr(fwd_var, "error_clip", None)
+        if not (
+            error_clip is None or isinstance(error_clip, BaseErrorClipAttr)
+        ):
+            raise TypeError(
+                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
+            )
+        if error_clip is not None:
+            error_clip._append_clip_op(block, grad_n)
+
+
+class ClipGradBase:
+    def __init__(self):
+        super().__init__()
+
+    def __str__(self):
+        raise NotImplementedError()
+
+    @imperative_base.no_grad()
+    def _dygraph_clip(self, params_grads):
+        raise NotImplementedError
+
+    def _static_clip(self, params_grads):
+        raise NotImplementedError
+
+    def __call__(self, params_grads):
+        if _non_static_mode():
+            return self._dygraph_clip(params_grads)
+        else:
+            for p, g in params_grads:
+                if getattr(p, 'gradient_clip_attr', None) is not None:
+                    warnings.warn(
+                        "'set_gradient_clip' will be ineffective, because you have "
+                        "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' "
+                        "is redundant and you can remove it."
+                    )
+                    break
+            return self._static_clip(params_grads)
+
+    def _process_context(self, context, param, grad):
+        raise NotImplementedError()
+
+    def _create_operators(self, param, grad):
+        raise NotImplementedError()
+
+
+class ClipGradByValue(ClipGradBase):
+    """
+    Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].
+
+    - Any values less than min are set to ``min``.
+
+    - Any values greater than max are set to ``max``.
+
+    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
+    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
+
+    Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer``
+    (for example: :ref:`api_paddle_optimizer_SGD`).
+
+    Note:
+        ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
+
+    Args:
+        max (float): The maximum value to clip by.
+        min (float, optional): The minimum value to clip by. If not set by user, it will be set to ``-max``
+            automatically. In this case, ``max`` must be greater than 0.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
+            linear = paddle.nn.Linear(in_features=10, out_features=10,
+                                      weight_attr=paddle.ParamAttr(need_clip=True),
+                                      bias_attr=paddle.ParamAttr(need_clip=False))
+            out = linear(x)
+            loss = paddle.mean(out)
+            loss.backward()
+
+            clip = paddle.nn.ClipGradByValue(min=-1, max=1)
+            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
+            sdg.step()
+    """
+
+    def __init__(self, max, min=None):
+        super().__init__()
+        if min is None:
+            assert max > 0.0
+            min = -max
+        self.max = float(max)
+        self.min = float(min)
+
+    def __str__(self):
+        return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max)
+
+    @imperative_base.no_grad()
+    def _dygraph_clip(self, params_grads):
+        params_and_grads = []
+        for p, g in params_grads:
+            if g is None:
+                continue
+            if getattr(p, 'need_clip', True) is False:
+                params_and_grads.append((p, g))
+                continue
+            new_grad = paddle.clip(x=g, min=self.min, max=self.max)
+            params_and_grads.append((p, new_grad))
+        return params_and_grads
+
+    def _static_clip(self, params_grads):
+        params_and_grads = []
+        param_new_grad_name_dict = dict()
+        with framework.name_scope('gradient_clip'):
+            for p, g in params_grads:
+                if g is None:
+                    continue
+                if getattr(p, 'need_clip', True) is False:
+                    params_and_grads.append((p, g))
+                    continue
+
+                with p.block.program._optimized_guard([p, g]):
+                    new_grad = paddle.clip(x=g, min=self.min, max=self.max)
+                params_and_grads.append((p, new_grad))
+                param_new_grad_name_dict[p.name] = new_grad.name
+        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
+        return params_and_grads
+
+    def _process_context(self, context, param, grad):
+        pass
+
+    def _create_operators(self, param, grad):
+        new_grad = paddle.clip(x=grad, min=self.min, max=self.max)
+        return param, new_grad
+
+
+class ClipGradByNorm(ClipGradBase):
+    r"""
+    Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .
+
+    - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio.
+
+    - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.
+
+    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
+    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
+
+    Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer``
+    (for example: :ref:`api_paddle_optimizer_SGD`).
+
+    The clipping formula is:
+
+    .. math::
+        Out =
+        \left\{
+            \begin{array}{ccl}
+                X & & if (norm(X) \leq clip\_norm) \\
+                \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\
+        \end{array}
+        \right.
+
+
+    where :math:`norm(X)` represents the L2 norm of :math:`X`.
+
+    .. math::
+        norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}}
+
+    Note:
+        ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
+
+    Args:
+        clip_norm(float): The maximum norm value.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
+            linear = paddle.nn.Linear(in_features=10, out_features=10,
+                                      weight_attr=paddle.ParamAttr(need_clip=True),
+                                      bias_attr=paddle.ParamAttr(need_clip=False))
+            out = linear(x)
+            loss = paddle.mean(out)
+            loss.backward()
+
+            clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
+            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
+            sdg.step()
+    """
+
+    def __init__(self, clip_norm):
+        super().__init__()
+        self.clip_norm = float(clip_norm)
+
+    def __str__(self):
+        return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm
+
+    @imperative_base.no_grad()
+    def _dygraph_clip(self, params_grads):
+        params_and_grads = []
+        for p, g in params_grads:
+            if g is None:
+                continue
+            if getattr(p, 'need_clip', True) is False:
+                params_and_grads.append((p, g))
+                continue
+            new_grad = clip_by_norm(x=g, max_norm=self.clip_norm)
+            params_and_grads.append((p, new_grad))
+        return params_and_grads
+
+    def _static_clip(self, params_grads):
+        params_and_grads = []
+        with framework.name_scope('gradient_clip'):
+            param_new_grad_name_dict = dict()
+            for p, g in params_grads:
+                if g is None:
+                    continue
+                if getattr(p, 'need_clip', True) is False:
+                    params_and_grads.append((p, g))
+                    continue
+
+                with p.block.program._optimized_guard([p, g]):
+                    new_grad = clip_by_norm(x=g, max_norm=self.clip_norm)
+                param_new_grad_name_dict[p.name] = new_grad.name
+                params_and_grads.append((p, new_grad))
+        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
+        return params_and_grads
+
+    def _process_context(self, context, param, grad):
+        pass
+
+    def _create_operators(self, param, grad):
+        new_grad = clip_by_norm(x=grad, max_norm=self.clip_norm)
+        return param, new_grad
+
+
+_allow_pure_fp16_global_norm_clip_flag = False
+
+
+def _allow_pure_fp16_global_norm_clip(*args):
+    global _allow_pure_fp16_global_norm_clip_flag
+    if len(args) == 0:
+        return _allow_pure_fp16_global_norm_clip_flag
+    else:
+        assert len(args) == 1 and isinstance(args[0], bool)
+        old_value = _allow_pure_fp16_global_norm_clip_flag
+        _allow_pure_fp16_global_norm_clip_flag = args[0]
+        return old_value
+
+
+class ClipGradByGlobalNorm(ClipGradBase):
+    r"""
+    Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
+    :math:`t\_list` , and limit it to ``clip_norm`` .
+
+    - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio.
+
+    - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.
+
+    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
+    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
+
+    Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer``
+    (for example: :ref:`api_paddle_optimizer_SGD`).
+
+    The clipping formula is:
+
+    .. math::
+
+        t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)}
+
+    where:
+
+    .. math::
+
+        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
+
+    Note:
+        ``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
+
+    Args:
+        clip_norm (float): The maximum norm value.
+        group_name (str, optional): The group name for this clip.
Default value is ``default_group``. + auto_skip_clip (bool, optional): skip clipping gradient. Default value is ``False``. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), + bias_attr=paddle.ParamAttr(need_clip=False)) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) + sdg.step() + """ + + def __init__( + self, clip_norm, group_name="default_group", auto_skip_clip=False + ): + super().__init__() + self.clip_norm = float(clip_norm) + self.group_name = group_name + assert isinstance(auto_skip_clip, bool) + self.auto_skip_clip = auto_skip_clip + + def __str__(self): + return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm) + + @imperative_base.no_grad() + def _dygraph_clip(self, params_grads): + params_and_grads = [] + sum_square_list = [] + sum_square_list_fp16 = [] + sum_square_list_fp32 = [] + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + continue + merge_grad = g + + if in_dygraph_mode() and g.is_selected_rows(): + merge_grad = merge_selected_rows(g) + merge_grad = merge_grad._get_tensor_from_selected_rows() + + elif g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = merge_selected_rows(g) + merge_grad = get_tensor_from_selected_rows(merge_grad) + + sum_square = _squared_l2_norm(merge_grad) + if ( + sum_square.dtype == core.VarDesc.VarType.FP16 + or sum_square.dtype == core.VarDesc.VarType.BF16 + ): + sum_square_list_fp16.append(sum_square) + elif sum_square.dtype == core.VarDesc.VarType.FP32: + sum_square_list_fp32.append(sum_square) + else: + sum_square_list.append(sum_square) + + # all parameters have been filterd out + if ( + len(sum_square_list) + + len(sum_square_list_fp16) + + len(sum_square_list_fp32) + == 0 + ): + return params_grads + + sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32" + global_norm_var = [] + if len(sum_square_list_fp16) > 0: + global_norm_var_fp16 = paddle.add_n(sum_square_list_fp16) + global_norm_var.append(global_norm_var_fp16.astype(sum_dtype)) + if len(sum_square_list_fp32) > 0: + global_norm_var_fp32 = paddle.add_n(sum_square_list_fp32) + if sum_dtype == 'float32': + global_norm_var.append(global_norm_var_fp32) + else: + global_norm_var.append(global_norm_var_fp32.astype(sum_dtype)) + if len(sum_square_list) > 0: + global_norm_var_fp64 = paddle.add_n(sum_square_list) + global_norm_var.append(global_norm_var_fp64) + global_norm_var = paddle.add_n(global_norm_var) + global_norm_var = paddle.sqrt(global_norm_var) + max_global_norm = paddle.full( + shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm + ) + + need_clip = False + if not self.auto_skip_clip: # always apply clip + need_clip = True + clip_var = paddle.divide( + x=max_global_norm, + y=paddle.maximum(x=global_norm_var, y=max_global_norm), + ) + elif global_norm_var > max_global_norm: + # only when global_norm_var > max_global_norm, grad need clip + need_clip = True + clip_var = paddle.divide(x=max_global_norm, y=global_norm_var) + + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + # TODO(wangxi): use inplace elementwise_mul + if need_clip: + clip_input = 
( + clip_var.astype(g.dtype) + if clip_var.dtype != g.dtype + else clip_var + ) + new_grad = paddle.multiply(g, clip_input) + params_and_grads.append((p, new_grad)) + else: + params_and_grads.append((p, g)) + + return params_and_grads + + def _static_clip(self, params_grads): + params_and_grads = [] + sum_square_list = [] + sum_square_list_fp16 = [] + sum_square_list_fp32 = [] + with framework.name_scope('gradient_clip'): + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + continue + merge_grad = g + with p.block.program._optimized_guard([p, g]): + if g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = merge_selected_rows(g) + merge_grad = get_tensor_from_selected_rows(merge_grad) + sum_square = _squared_l2_norm(merge_grad) + if sum_square.dtype == core.VarDesc.VarType.FP16: + sum_square_list_fp16.append(sum_square) + elif sum_square.dtype == core.VarDesc.VarType.FP32: + sum_square_list_fp32.append(sum_square) + else: + sum_square_list.append(sum_square) + + # all parameters have been filterd out + if ( + len(sum_square_list) + + len(sum_square_list_fp16) + + len(sum_square_list_fp32) + == 0 + ): + return params_grads + + with p.block.program._optimized_guard([p, g]): + sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32" + + global_norm_var = [] + if len(sum_square_list_fp16) > 0: + global_norm_var_fp16 = layers.sums(sum_square_list_fp16) + if ( + sum_square_list_fp32 + or sum_square_list + or not _allow_pure_fp16_global_norm_clip() + ): + global_norm_var.append( + global_norm_var_fp16.astype(sum_dtype) + ) + else: + global_norm_var.append(global_norm_var_fp16) + if len(sum_square_list_fp32) > 0: + global_norm_var_fp32 = layers.sums(sum_square_list_fp32) + if sum_dtype == 'float32': + global_norm_var.append(global_norm_var_fp32) + else: + global_norm_var.append( + global_norm_var_fp32.astype(sum_dtype) + ) + if len(sum_square_list) > 0: + # fp64 + global_norm_var_other_dtype = layers.sums(sum_square_list) + global_norm_var.append(global_norm_var_other_dtype) + + global_norm_var = ( + layers.sums(global_norm_var) + if len(global_norm_var) > 1 + else global_norm_var[0] + ) + global_norm_var = paddle.sqrt(x=global_norm_var) + max_global_norm = paddle.full( + shape=[1], + dtype=global_norm_var.dtype, + fill_value=self.clip_norm, + ) + scale_var = paddle.divide( + x=max_global_norm, + y=paddle.maximum(x=max_global_norm, y=global_norm_var), + ) + param_new_grad_name_dict = dict() + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + + with p.block.program._optimized_guard([p, g]): + new_g = _cast_to_mp_type_if_enabled(g) + # inplace + scale_input = ( + scale_var.astype('float16') + if new_g.dtype == core.VarDesc.VarType.FP16 + and scale_var.dtype != core.VarDesc.VarType.FP16 + else scale_var + ) + # NOTE(Yuang Liu): For pure dp with gradient merge, the p and g + # will be in different blocks with the gradient clip related ops. + # We need to handle the correct block, otherwise will encounter + # a 'NotFoundError' during compile time. 
+ block = default_main_program().current_block() + block.append_op( + type='elementwise_mul', + inputs={'X': new_g, 'Y': scale_input}, + outputs={'Out': new_g}, + ) + if new_g is not g: + block.append_op( + type='cast', + inputs={'X': new_g}, + outputs={'Out': g}, + attrs={ + 'in_dtype': new_g.dtype, + 'out_dtype': g.dtype, + }, + ) + + param_new_grad_name_dict[p.name] = g.name + params_and_grads.append((p, g)) + + _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) + return params_and_grads + + def _process_context(self, context, param, grad): + if self.group_name not in context: + context[self.group_name] = [] + context[self.group_name + "_clip_value"] = self.clip_norm + context[self.group_name + "_clip"] = paddle.full( + shape=[1], dtype=grad.dtype, fill_value=self.clip_norm + ) + else: + if not self.clip_norm == context[self.group_name + "_clip_value"]: + raise ValueError( + "All parameters' 'clip_norm' of a same group should be the same" + ) + + merge_grad = grad + if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = merge_selected_rows(grad) + merge_grad = get_tensor_from_selected_rows(merge_grad) + + local_norm_var = _squared_l2_norm(merge_grad) + context[self.group_name].append(local_norm_var) + + self.context = context + + def _create_operators(self, param, grad): + group_scale_name = self.group_name + "_scale" + if group_scale_name not in self.context: + group_norm_var = layers.sums(input=self.context[self.group_name]) + group_norm_var = paddle.sqrt(x=group_norm_var) + clip_var = self.context[self.group_name + "_clip"] + group_scale_var = paddle.divide( + x=clip_var, + y=paddle.maximum(x=clip_var, y=group_norm_var), + ) + assert group_scale_var.shape == (1,) + self.context[group_scale_name] = group_scale_var + + # inplace + param.block.append_op( + type='elementwise_mul', + inputs={'X': grad, 'Y': self.context[group_scale_name]}, + outputs={'Out': grad}, + ) + + return param, grad + + +@framework.dygraph_not_support +def set_gradient_clip(clip, param_list=None, program=None): + """ + Warning: + + This API must be used after building network, and before ``minimize`` , + and it may be removed in future releases, so it is not recommended. + It is recommended to set ``grad_clip`` when initializing the ``optimizer`` , + this is a better method to clip gradient. There are three clipping strategies: + :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` . + + To specify parameters that require gradient clip. + + Args: + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no + gradient clipping. + param_list (list(Variable), optional): Parameters that require gradient clip. + It can be a list of parameter or a list of parameter's name. + Default None, meaning that all parameters in the program will be included. + program (Program, optional): The program where parameters are located. + Default None, meaning that using :ref:`api_fluid_default_main_program` . + + Returns: + None + + Examples: + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + + paddle.enable_static() + + def network(): + image = fluid.data(name='image', shape=[ + None, 28], dtype='float32') + param_attr1 = fluid.ParamAttr("fc1_param") + fc1 = fluid.layers.fc(image, size=10, param_attr=param_attr1) + param_attr2 = fluid.ParamAttr("fc2_param") + fc2 = fluid.layers.fc(fc1, size=10, param_attr=param_attr2) + loss = paddle.mean(fc2) + return loss + + + # network 1: clip all parameter gradient + with fluid.program_guard(fluid.Program(), fluid.Program()): + loss = network() + paddle.nn.clip.set_gradient_clip( + paddle.nn.ClipGradByGlobalNorm(clip_norm=2.0)) + sgd = fluid.optimizer.SGD(learning_rate=1e-3) + sgd.minimize(loss) + + # network 2: clip parameter gradient by name + with fluid.program_guard(fluid.Program(), fluid.Program()): + loss = network() + paddle.nn.clip.set_gradient_clip( + paddle.nn.ClipGradByValue(min=-1.0, max=1.0), + param_list=["fc1_param", "fc2_param"]) + sgd = fluid.optimizer.SGD(learning_rate=1e-3) + sgd.minimize(loss) + + # network 3: clip parameter gradient by value + with fluid.program_guard(fluid.Program(), fluid.Program()): + loss = network() + param_var1 = fluid.default_main_program().global_block().var("fc1_param") + param_var2 = fluid.default_main_program().global_block().var("fc2_param") + paddle.nn.clip.set_gradient_clip( + paddle.nn.ClipGradByValue(min=-1.0, max=1.0), + param_list=[param_var1, param_var2]) + sgd = fluid.optimizer.SGD(learning_rate=1e-3) + sgd.minimize(loss) + + # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together + with fluid.program_guard(fluid.Program(), fluid.Program()): + loss = network() + clip1 = paddle.nn.ClipGradByValue(min=-1.0, max=1.0) + clip2 = paddle.nn.ClipGradByNorm(clip_norm=1.0) + # Set the gradient clipping strategy: clip1 + paddle.nn.clip.set_gradient_clip(clip1) + # Set the gradient clipping strategy: clip2 + sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2) + sgd.minimize(loss) + # 'set_gradient_clip' will not take effect when setting has a conflict, + # and the gradient clipping strategy will be 'clip2' + + + """ + warnings.warn( + "Caution! 'set_gradient_clip' is not recommended " + "and may be deprecated in future! " + "We recommend a new strategy: set 'grad_clip' " + "when initializing the 'optimizer'. " + "This method can reduce the mistakes, please " + "refer to documention of 'optimizer'." + ) + + if not isinstance(clip, ClipGradBase): + raise TypeError( + "'clip' should be an instance of ClipGradBase's derived class" + ) + if program is None: + program = framework.default_main_program() + + for op in program.block(0).ops: + if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr( + "op_namescope" + ): + warnings.warn( + "'minimize' has been invoked before, this will make 'set_gradient_clip' " + "be ineffective! Please invoke 'set_gradient_clip' before 'minimize'." + ) + break + + if param_list is None: + param_list = program.block(0).all_parameters() + if all(isinstance(elem, str) for elem in param_list): + param_list = [program.block(0).var(elem) for elem in param_list] + if not all(isinstance(elem, framework.Parameter) for elem in param_list): + raise TypeError( + "'param_list' should be a list of Parameter or basestring(parameter's name)." 
+ ) + + for param in param_list: + param.gradient_clip_attr = copy.deepcopy(clip) + + +def append_gradient_clip_ops(param_grads): + context = dict() + for p, g in param_grads: + if g is None: + continue + with p.block.program._optimized_guard([p, g]), framework.name_scope( + 'gradient_clip' + ): + clip_attr = getattr(p, 'gradient_clip_attr', None) + if clip_attr is None: + return param_grads + if not isinstance(clip_attr, ClipGradBase): + raise TypeError( + "clip attribute should be an instance of GradientClipBase" + ) + + clip_attr._process_context(context=context, param=p, grad=g) + + res = [] + param_new_grad_name_dict = dict() + for p, g in param_grads: + if g is None: + continue + with p.block.program._optimized_guard([p, g]), framework.name_scope( + 'gradient_clip' + ): + param, new_grad = clip_attr._create_operators(param=p, grad=g) + param_new_grad_name_dict[param.name] = new_grad.name + res.append([param, new_grad]) + + _correct_clip_op_role_var(res, param_new_grad_name_dict) + return res + + +# change wrong mapping relation between param & grad in clip op +# Note: This function is sensitive to the time cost of the network with gradient clipping +# and should not be changed easily. If you must change, please test the time cost. +def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict): + block_id_list = [] + if len(param_new_grad_name_dict) == 0: + return + for param, grad in params_grads: + if grad is None: + continue + block_id = param.block.idx + if block_id in block_id_list: + continue + block_id_list.append(block_id) + for op in param.block.program.global_block().ops: + if ( + op.has_attr("op_namescope") + and "gradient_clip" in op.attr("op_namescope") + and op.attr('op_role_var') + ): + param_name = op.attr('op_role_var')[0] + if param_name in param_new_grad_name_dict: + correct_p_g = [ + param_name, + param_new_grad_name_dict[param_name], + ] + op._set_attr('op_role_var', correct_p_g) + + +GradientClipBase = ClipGradBase +GradientClipByValue = ClipGradByValue +GradientClipByNorm = ClipGradByNorm +GradientClipByGlobalNorm = ClipGradByGlobalNorm diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index f03b3af2df97084e1bc2e5bd9d67b1442a19d3ee..a4d304b451e7b3cad3fdab97bf05e7854146a260 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -20,10 +20,10 @@ import paddle from .. 
import _C_ops from ..fluid import core, framework, unique_name -from ..fluid.clip import GradientClipBase from ..fluid.dygraph import base as imperative_base from ..fluid.framework import Parameter, Variable from ..fluid.layer_helper import LayerHelper +from ..nn.clip import GradientClipBase from .lr import LRScheduler from .optimizer import Optimizer diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index d5f18130a4c63e0883638773cf015872d2b22288..1799461254ced546eb35ac119d0cf893169c854e 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -18,6 +18,7 @@ from collections import defaultdict import numpy as np import paddle +import paddle.autograd as imperative_base from paddle import _C_ops from paddle.fluid import core from paddle.fluid.framework import ( @@ -32,12 +33,6 @@ from paddle.fluid.framework import ( from ..fluid import framework, unique_name from ..fluid.backward import _get_no_grad_set_name, append_backward -from ..fluid.clip import ( - GradientClipBase, - append_gradient_clip_ops, - error_clip_callback, -) -from ..fluid.dygraph import base as imperative_base from ..fluid.framework import Parameter, program_guard from ..fluid.initializer import Constant from ..fluid.layer_helper import LayerHelper @@ -168,7 +163,7 @@ class Optimizer: """ - @imperative_base.no_grad + @imperative_base.no_grad() def __init__( self, learning_rate, @@ -225,7 +220,7 @@ class Optimizer: % type(learning_rate) ) if grad_clip is not None: - if not isinstance(grad_clip, GradientClipBase): + if not isinstance(grad_clip, paddle.nn.clip.GradientClipBase): raise TypeError( "'grad_clip' should be an instance of GradientClipBase's derived class" ) @@ -1042,7 +1037,7 @@ class Optimizer: params_grads.append((parameter_list[index], grad)) else: if callbacks is None: - callbacks = [error_clip_callback] + callbacks = [paddle.nn.clip.error_clip_callback] else: assert isinstance(callbacks, list) program = loss.block.program @@ -1103,7 +1098,7 @@ class Optimizer: params_grads = self._grad_clip(params_grads) else: - params_grads = append_gradient_clip_ops(params_grads) + params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads) # Add regularization if any params_grads = self.append_regularization_ops( @@ -1317,7 +1312,7 @@ class Optimizer: else: core.clear_gradients(param_list, set_to_zero) - @imperative_base.no_grad + @imperative_base.no_grad() def minimize( self, loss, startup_program=None, parameters=None, no_grad_set=None ): @@ -1380,7 +1375,7 @@ class Optimizer: return optimize_ops, params_grads - @imperative_base.no_grad + @imperative_base.no_grad() @framework.dygraph_only def step(self): """