diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py
index 6669e4f4c70aaf820a994dafc9b1946f52104a07..8bc7b11368680e7f714e2b15a0033b0a21068fe2 100644
--- a/python/paddle/autograd/__init__.py
+++ b/python/paddle/autograd/__init__.py
@@ -26,8 +26,6 @@ else:
     from .py_layer import LegacyPyLayerContext as PyLayerContext  # noqa: F401
 from ..framework import set_grad_enabled, is_grad_enabled  # noqa: F401
 from ..fluid.dygraph.base import no_grad_ as no_grad  # noqa: F401
-from .functional import vjp, jvp, Jacobian, Hessian  # noqa: F401
-from .functional import jacobian, hessian, batch_jacobian, batch_hessian, vhp  # noqa: F401
 
 __all__ = [  # noqa
     'backward',
diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py
deleted file mode 100644
index aa3e99978b72a0412e8199a6d4a5b51506a9ee3d..0000000000000000000000000000000000000000
--- a/python/paddle/autograd/functional.py
+++ /dev/null
@@ -1,1362 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import functools
-import typing
-
-import paddle
-from paddle.fluid import framework
-from paddle.autograd.utils import as_tensors
-
-
-def vjp(func, xs, v=None):
-    r"""Computes the Vector-Jacobian product, a functional form of
-    reverse mode automatic differentiation.
-
-    Warning:
-        This API is in beta, the signatures could be changed in future version.
-
-    Args:
-        func(Callable): A function that takes ``xs`` as its input and
-            returns a sequence of Tensors or a Tensor.
-        xs(Tensor|Sequence[Tensor]): Used as positional arguments to evaluate
-            ``func``. ``xs`` is accepted as one Tensor or a sequence of Tensors.
-        v(Tensor|Sequence[Tensor]|None, optional): The cotangent vector involved
-            in the VJP computation. ``v`` matches the size and shape of
-            ``func`` 's output. Defaults to None, which is equivalent to all
-            ones with the same size as ``func`` 's output.
-
-    Returns:
-        output(tuple):
-
-            - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` .
-            - vjp(Tensor|tuple[Tensor]): The vjp result.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle
-
-            def func(x):
-                return paddle.matmul(x, x)
-
-            x = paddle.ones(shape=[2, 2], dtype='float32')
-            _, vjp_result = paddle.incubate.autograd.vjp(func, x)
-            print(vjp_result)
-            # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False,
-            #        [[4., 4.],
-            #         [4., 4.]])
-
-            v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]])
-            _, vjp_result = paddle.incubate.autograd.vjp(func, x, v)
-            print(vjp_result)
-            # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False,
-            #        [[2., 1.],
-            #         [1., 0.]])
-    """
-    _check_inputs(func, xs, v)
-
-    # ``_separate`` breaks the dependencies between ``xs`` and other
-    # variables. See ``_separate`` for more details.
- xs, v = _separate(xs), _separate(v) - ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) - _check_v_shape(v, ys) - - return ys, _grad(ys, xs, v) - - -def jvp(func, xs, v=None): - r""" - Computes the Jacobian-Vector product for a function at the given - inputs and a vector in the tangent space induced by the inputs. - - Warning: - This API is in beta, the signatures could be changed in future version. - - Args: - func(Callable): The ``func`` takes as input a Tensor or a Sequence - of Tensors and returns a Tensor or a Sequence of Tensors. - xs(Tensor|Sequence[Tensor]): Used as positional arguments to - evaluate ``func``. The ``xs`` is accepted as one Tensor or a - Sequence of Tensors. - v(Tensor|Sequence[Tensor]|None, Optional): The tangent vector invovled - in the JVP computation. The ``v`` matches the size and shape of - ``xs`` . Default value is None and in this case is equivalent to - all ones the same size of ``xs`` . - - Returns: - output(tuple): - - - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` . - - jvp(Tensor|tuple[Tensor]): The jvp result. - - Examples: - - .. code-block:: python - - import paddle - - - def func(x): - return paddle.matmul(x, x) - - - x = paddle.ones(shape=[2, 2], dtype='float32') - _, jvp_result = paddle.incubate.autograd.jvp(func, x) - print(jvp_result) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[4., 4.], - # [4., 4.]]) - v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) - _, jvp_result = paddle.incubate.autograd.jvp(func, x, v) - print(jvp_result) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[2., 1.], - # [1., 0.]]) - - """ - _check_inputs(func, xs, v) - # ``_seprate`` breaks the dependencies between ``xs`` and other - # variables. See more ``_seprate`` . - xs, v = _separate(xs), _separate(v) - ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) - _check_v_shape(v, xs) - return ys, _double_backward_trick(ys, xs, v) - - -def _double_backward_trick(ys, xs, v): - """Double backward trick for computing ``jvp`` by ``vjp`` - see details: https://j-towns.github.io/2017/06/12/A-new-trick.html - """ - # The value of ys_grad is not important, it can be any random value in - # theory, but it's required to set stop_gradient=False. - ys_grad = _zeros_like_with_grad(ys) - xs_grad = _grad(ys, xs, ys_grad) - return _grad(xs_grad, ys_grad, v) - - -def _zeros_like_with_grad(xs): - """Create a zero or zeros sequence Tensor like ``xs`` with a flag - ``stop_graident=False`` . - """ - if not isinstance(xs, typing.Sequence): - ys = paddle.zeros_like(xs) - ys.stop_gradient = False - else: - ys = [] - for x in xs: - y = paddle.zeros_like(x) - y.stop_gradient = False - ys.append(y) - return ys - - -class Jacobian(object): - r""" - Computes the Jacobian matrix of a given function. - - If the function has multiple inputs and multiple outputs, during internal - implementation, all input tensors are concatenated after being flatten, - the batch dimension is retained, and the output is subject to the same - processing rules. - - Once the Jacobian ``J`` is constructed, you can use a multidimensional index - to retrieve the submatrix of ``J``, as same as slicing a Tensor. The - submatrix is lazily evaluated along row axis, and will be cached once - evaluated. - - For examples, supposing ``is_batched=True``, you can retrieve the submatrix - by following methods: - - * J[:], retrieving the full matrix. - * J[:, :, j], retrieving the partial derivatives w.r.t. 
the j'th input - variable. - * J[:, i, :], retrieving the partial derivatives w.r.t. the i'th output - variable. - * J[:, i, j], retrieving the partial derivatives w.r.t. the i'th output - variable and the j'th input variable. - - Notes: - - Eclipsis index is not supported currently. - - Warning: - This API is in beta, the signatures could be changed in future version. - - Args: - - func (Callable): A python function that takes a Tensor or a sequence of - Tensors as inputs(the first dimension is batch size) and - returns a Tensor a sequence of Tensors. - xs (Tensor|Sequence[Tensor]): The input to the function ``func`` . - is_batched (bool): If true, the first axis is batch axis. Defaults to - False. - - Returns: - - Jacobian (Object): A python object retains the Jacobian matrix. - - Examples: - - .. code-block:: python - - import paddle - - - def func(x, y): - return paddle.matmul(x, y) - - - x = paddle.to_tensor([[1., 2.], [3., 4.]]) - J = paddle.incubate.autograd.Jacobian(func, [x, x]) - print(J[:, :]) - # Tensor(shape=[4, 8], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[1., 3., 0., 0., 1., 0., 2., 0.], - # [2., 4., 0., 0., 0., 1., 0., 2.], - # [0., 0., 1., 3., 3., 0., 4., 0.], - # [0., 0., 2., 4., 0., 3., 0., 4.]]) - - print(J[0, :]) - # Tensor(shape=[8], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [1., 3., 0., 0., 1., 0., 2., 0.]) - print(J[:, 0]) - # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [1., 2., 0., 0.]) - - """ - - def __init__(self, func, xs, is_batched=False): - if not is_batched: - self._jacobian = _JacobianNoBatch(func, xs) - else: - self._jacobian = _JacobianBatchFirst(func, xs) - - def __getitem__(self, indexes): - return self._jacobian[indexes] - - @property - def shape(self): - """The shape of flattened Jacobian matrix. - """ - return self._jacobian.shape - - -class Hessian(object): - """ - Computes the Hessian matrix with a given ``func`` with respect to ``xs`` . - - If the function has multiple inputs, during internal implementation, - all input tensors are concatenated after being flatten, the batch dimension - is retained. - - The Hessian submatrix is lazily evaluated, and can be retrieved with a - multidimensional indexes. See details ``Jacobian`` . - - Warning: - This API is in beta, the signatures could be changed in future version. - - Args: - func (Callable): A python function that takes a Tensor or a Tensor - sequence as inputs and returns a Tensor with shape - ``[batch_size, 1]`` with batch or ``[1]`` without batch. - xs (Tensor|Sequence(Tensor)): The input Tensor or Tensor sequence of - the function ``func``. - is_batched (bool): If true, the first axis is batch axis. Defaults to - False. - - Returns: - - Hessian (Object): A python object retains the Hessian matrix. - - - Examples: - - .. code-block:: python - - import paddle - - - def reducer(x): - return paddle.sum(x * x) - - - x = paddle.rand([2, 2]) - h = paddle.incubate.autograd.Hessian(reducer, x) - print(h[:]) - # Tensor(shape=[4, 4], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[2., 0., 0., 0.], - # [0., 2., 0., 0.], - # [0., 0., 2., 0.], - # [0., 0., 0., 2.]]) - """ - - def __init__(self, func, xs, is_batched=False): - - def _jac_func(*xs): - jac = Jacobian(func, xs, is_batched=is_batched) - if (is_batched and jac.shape[1] != 1) or (not is_batched - and jac.shape[0] != 1): - raise RuntimeError( - "The function given to Hessian shoud return as single element Tensor or batched single element Tensor." 
- ) - return jac[:, 0, :] if is_batched else jac[0, :] - - self.symbolic = Jacobian(_jac_func, xs, is_batched=is_batched) - - def __getitem__(self, indexes): - return self.symbolic[indexes] - - @property - def shape(self): - """The shape of flattened Hessian matrix. - """ - return self.symbolic.shape - - -class _Jacobian(object): - """The base class for computing Jacobian matrix. - - ``_Jacobian`` implementes the core logic of multidimensional index and lazy - evaluation for Jacobian matrix, subclass only need to overwrite following - methods: - - * ``_lazy_axis()``, return the axis along which will be lazy - evaluating. - * ``_flatten(xs)``, flattens the inputs ``xs``. - * ``_evaluate(index)``, evaluates one slice along ``_lazy_axis`` . - - Notes: - - Because currently PaddlePaddle only support reverse differentiation by - ``paddle.grad``, so lazy evaluation is only supported along the row of - Jacobian matrix, which means that slicing along row will get better - performance. - - """ - - def __init__(self, func, xs): - # Skip separating in prim mode temporarily, as detach and clone are not - # primitive operators. - if not paddle.fluid._non_static_mode( - ) and paddle.incubate.autograd.prim_enabled(): - self._xs = xs - else: - self._xs = _separate(xs) - self._ys = func(*as_tensors(self._xs)) - self._flatten_xs = self._flatten(as_tensors(self._xs)) - self._flatten_ys = self._flatten(as_tensors(self._ys)) - self._cache = {} - - @property - def shape(self): - raise NotImplementedError - - @property - def _lazy_axis(self): - """"The axis of lazily evaluated.""" - raise NotImplementedError - - def _lazy_indexes(self, indexes): - idx = indexes[self._lazy_axis] - return (idx, ) if isinstance(idx, int) else tuple( - range(idx.start, idx.stop, idx.step)) - - def _flatten(self, xs): - raise NotImplementedError - - def _shifted_indexes(self, indexes, lazy_axis_size=0): - idx = indexes[self._lazy_axis] - shifted_lazy_axis_idx = 0 if isinstance(idx, int) else slice( - 0, lazy_axis_size, 1) - return indexes[:self._lazy_axis] + ( - shifted_lazy_axis_idx, ) + indexes[self._lazy_axis + 1:] - - def __getitem__(self, indexes): - indexes = _multi_index(indexes, self.shape) - - if isinstance(indexes[self._lazy_axis], int): - other_indexes = indexes[:self._lazy_axis] + \ - indexes[self._lazy_axis+1:] - return self._cached_evaluate( - indexes[self._lazy_axis])[other_indexes] - lazy_indexes = self._lazy_indexes(indexes) - # Using concat and reshape to replace stack operator temporarily, as - # it is not a primitive operator. - shape = list(self.shape) - shape[self._lazy_axis] = len(lazy_indexes) - part_jac = paddle.concat( - [self._cached_evaluate(i) for i in lazy_indexes], - axis=self._lazy_axis).reshape(shape) - return part_jac[self._shifted_indexes(indexes, len(lazy_indexes))] - - def _cached_evaluate(self, k): - v = self._cache.get(k) - if v is None: - v = self._evaluate(k) - self._cache[k] = v - return v - - def _evaluate(self, index): - """Evaluate one slice at along lazy axis.""" - raise NotImplementedError - - -class _JacobianNoBatch(_Jacobian): - """Compute Jacobian matrix without batch dimension. - Suppose the mapping is :math:`f: R^M \to R^N`, the output shape is - ``(N, M)`` . 
- """ - - def __init__(self, func, xs): - super(_JacobianNoBatch, self).__init__(func, xs) - - @property - def shape(self): - return (self._flatten_ys.shape[0], self._flatten_xs.shape[0]) - - @property - def _lazy_axis(self): - return 0 - - def _flatten(self, xs): - return paddle.concat(tuple(x.reshape((-1, )) for x in xs)) - - def _evaluate(self, row_index): - return self._flatten(_grad( - self._flatten_ys[row_index], - self._xs, - )) - - -class _JacobianBatchLast(_Jacobian): - """Compute Jacobian matrix with batch at last axis. - Suppose the mapping is :math:`f: R^{M,B} \to R^{N,B}`, the output shape is - ``(N, M, B)`` . - """ - - def __init__(self, func, xs): - super(_JacobianBatchLast, self).__init__(func, xs) - - @property - def shape(self): - return (self._flatten_ys.shape[0], self._flatten_xs.shape[0], - self._flatten_xs.shape[1]) - - @property - def _lazy_axis(self): - return 0 - - def _flatten(self, xs): - return paddle.concat( - tuple(x.reshape((-1, x.shape[-1])) for x in as_tensors(xs)), 0) - - def _evaluate(self, row): - return self._flatten(_grad(self._flatten_ys[row, :], self._xs)) - - -class _JacobianBatchFirst(_Jacobian): - """Compute Jacobian matrix with batch at first axis. - Suppose the mapping is :math:`f: R^{B,M} \to R^{B,N}`, the output shape is - ``(B, N, M)`` . - """ - - def __init__(self, func, xs): - super(_JacobianBatchFirst, self).__init__(func, xs) - - @property - def shape(self): - return (self._flatten_xs.shape[0], self._flatten_ys.shape[1], - self._flatten_xs.shape[1]) - - @property - def _lazy_axis(self): - return 1 - - def _flatten(self, xs): - return paddle.concat( - tuple(x.reshape((x.shape[0], -1)) for x in as_tensors(xs)), 1) - - def _evaluate(self, row_index): - return self._flatten(_grad(self._flatten_ys[:, row_index], self._xs)) - - -def _multi_index(indexes, shape): - """A tool for parsing N-dimensional index into a standard format. - - Currently supporting following input format: - * ([positive|negative|slice], ...), the right-most elements can be - omited. - - The standard format after converted is slice tuple which contains N elements: - * ([positive|slice], ..., [positive|slice]) - - Notes: - Ellipsis indexes such as ``(..., i), (i, ...)`` is not supported. - - Args: - indexes (tuple): The input indexes. - shape (tuple): The input shape. - - Returns: - tuple: The standard format index as the above description. - """ - indexes = indexes if isinstance(indexes, typing.Sequence) else (indexes, ) - if any(isinstance(i, type(Ellipsis)) for i in indexes): - raise IndexError('Ellipsis index currently is not supported.') - # Fill the right-most elements. - indexes = indexes + (slice(0, None, None), ) * (len(shape) - len(indexes)) - # Convert to positive index. - positive_indexes = [] - for i, index in enumerate(indexes): - if isinstance(index, slice): - index = slice(index.start or 0, index.stop or shape[i], index.step - or 1) - positive_indexes.append( - slice( - index.start + shape[i] if index.start < 0 else index.start, - index.stop + shape[i] if index.stop < 0 else index.stop, - # Negative step means index backward, no need to convert to - # positive interger. 
-                    index.step))
-        elif isinstance(index, int):
-            positive_indexes.append(index + shape[i] if index < 0 else index)
-        else:
-            raise TypeError(f'Not supported index type {index}.')
-    return tuple(positive_indexes)
-
-
-def _stack_tensor_or_return_none(origin_list):
-    assert len(origin_list) > 0, "Cannot stack an empty list"
-    return paddle.stack(origin_list, axis=0) if isinstance(
-        origin_list[0], paddle.fluid.framework.Variable) else None
-
-
-def _replace_none_with_zero_tensor(xs, refs):
-    if xs is None:
-        xs = paddle.zeros_like(refs)
-        xs.stop_gradient = refs.stop_gradient
-        return xs
-    elif isinstance(xs, typing.Sequence):
-        return tuple(
-            _replace_none_with_zero_tensor(x, refs[i])
-            for i, x in enumerate(xs))
-    else:
-        return xs
-
-
-def _grad(ys, xs, v=None):
-    """A gradient function that can be used in dynamic graph and static graph.
-
-    The ``grad`` combines ``paddle.grad`` used in dynamic graph and
-    ``paddle.static.gradients`` used in static graph, and makes the following changes:
-
-    * The ``allow_unused`` flag is removed and defaults to True internally;
-      None in the outputs will be replaced by a zero tensor.
-    * The ``create_graph`` flag is removed and defaults to True internally;
-      it only makes sense in dynamic graph mode.
-    * When xs is a single Tensor, ``paddle.grad`` returns a list which only
-      contains one Tensor. It may confuse users, so in this case ``_grad``
-      returns a single Tensor instead.
-
-    Args:
-        ys (Tensor|Sequence[Tensor]): The output tensor or tensor sequence of
-            the graph to compute gradients.
-        xs (Tensor|Sequence[Tensor]): The input tensor or tensor sequence of the graph to
-            compute gradients. The returned values of this API are the
-            gradients of inputs.
-        v (Tensor|Sequence[Tensor]|None, optional): The initial gradient values
-            of outputs. If grad_outputs is None, the initial gradient values of
-            outputs would be Tensors filled with 1; if grad_outputs is not None,
-            it must have the same length as outputs, and in this case, the
-            initial gradient value of the i-th outputs would be: (1) a Tensor
-            filled with 1 when the i-th element of grad_outputs is None;
-            (2) the i-th element of grad_outputs when the i-th element of
-            grad_outputs is a Tensor. Default None.
-
-    Returns:
-        Tensor|tuple[Tensor]: Tensor or a tuple of Tensors, whose length is the
-            same as the Tensor number inside inputs, and the i-th returned
-            Tensor is the sum of gradients of outputs with respect to the i-th
-            inputs.
-    """
-    if paddle.fluid._non_static_mode():
-        xs_grad = paddle.grad(ys, xs, v, create_graph=True, allow_unused=True)
-    else:
-        xs_grad = paddle.static.gradients(ys, xs, v)
-
-    if isinstance(xs, paddle.fluid.framework.Variable):
-        xs_grad = xs_grad[0]
-
-    return _replace_none_with_zero_tensor(xs_grad, xs)
-
-
-def _separate(xs):
-    """
-    ``_separate`` separates ``xs`` from the computation graph through ``clone``
-    or ``detach`` .
-
-    Internally, ``paddle.grad(xs, ys)`` is a stateful API implemented on top of
-    the computational graph, which will reduce gradients along all paths from ys to xs.
-
-    However, functional autograd APIs such as ``vjp`` and ``jvp`` are stateless,
-    and only compute gradients for a given ``func`` .
-
-    For example, given a ``func`` :math:`y0=f(x0)`, supposing the forward paths are
-    ``x0 -> y0`` and ``x0 -> x1 -> y0`` ,
-    ``paddle.grad(y0, x0)`` will reduce gradients along ``y0->x0`` and
-    ``y0->x1->x0``, while ``vjp`` only needs to reduce along ``y0->x0``.
-
-    So, ``xs`` needs to be cloned or detached to break its dependencies on
-    other variables.
- - Examples: - - .. code-block:: python - - import paddle - from paddle.autograd.functional import _separate - - - def func(x, y): - return x * y - - - x = paddle.ones((1,)) - x.stop_gradient = False - - y = func(x, x) - print(paddle.grad(y, x)) - # [Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [2.])] - - x1, x2 = _separate((x, x)) - y = func(x1, x2) - print(paddle.grad(y, x1)) - # [Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [1.])] - - """ - if isinstance(xs, typing.Sequence): - return tuple(_single_separate(x) for x in xs) - else: - return _single_separate(xs) - - -def _single_separate(x): - if x is None: # x maybe none because grad input's v defaults to none. - return x - if not x.stop_gradient: - return paddle.clone(x) - else: # use detach to share memory when no need gradients. - x = x.detach() - x.stop_gradient = False - return x - return x - - -def _check_inputs(func, xs, v=None): - if not callable(func): - raise TypeError(f"Expected 'fun' is Callable, but got {type(func)}.") - - if not isinstance(xs, (framework.Variable, typing.Sequence)): - raise TypeError(f"Expected 'xs' is a Tensor|Sequence[Tensor]," - f"but got {type(xs)}.") - if isinstance(xs, typing.Sequence) and not all( - isinstance(x, framework.Variable) for x in xs): - raise TypeError("All elements of 'xs' shoule be Tensor.") - - if not isinstance(v, (framework.Variable, typing.Sequence, type(None))): - raise TypeError( - f"Expected 'v' is Tensor|Sequence[Tensor]|None, but got {type(v)}.") - - if isinstance(v, typing.Sequence) and not all( - isinstance(e, framework.Variable) for e in v): - raise TypeError("All elements of 'xs' shoule be Tensor.") - - -def _check_v_shape(v, refs): - if v is None: - return - - v, refs = as_tensors(v), as_tensors(refs) - if len(refs) != len(v): - raise RuntimeError(f"The argument v is a tuple of invalid length:" - f"should be {len(refs)} but got {len(v)}.") - - for index, (element_v, element_ref) in enumerate(zip(v, refs)): - if element_v.shape != element_ref.shape: - raise RuntimeError( - f"The v[{index}] has invalid shape: should " - f"be {element_ref.shape} but got {element_v.shape}.") - - -@framework.dygraph_only -def jacobian(func, inputs, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the Jacobian matrix of `func` with respect to `inputs`. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs and returns a Tensor or a Tensor tuple. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``. - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - Jacobian (Tensor or nested tuple of Tensors): if function ``func`` - takes a Tensor as inputs and returns a Tensor as outputs, Jacobian - will be a single Tensor containing the Jacobian matrix for the - linearized inputs and outputs. 
If one of the inputs and outputs is - a Tensor, and another is a Tensor list/tuple, then the Jacobian will - be a tuple of Tensors. If both of inputs and outputs are Tensor - list/tuple, then the Jacobian will be a tuple of tuple of Tensors - where ``Jacobian[i][j]`` will contain the Jacobian matrix of the - linearized ``i``th output and ``j``th input and will have same - dtype and device as the corresponding input. ``Jacobian[i][j]`` will - have as size ``m * n``, where ``m`` and ``n`` denote the numbers of - elements of ``i``th output and ``j``th input respectively. - - - Examples 1: - .. code-block:: python - - import paddle - - def func(x): - return paddle.matmul(x, x) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, x) - print(jacobian) - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 1., 1., 0.], - # [1., 2., 0., 1.], - # [1., 0., 2., 1.], - # [0., 1., 1., 2.]]) - - Examples 2: - .. code-block:: python - - import paddle - - def func(x, y): - return paddle.matmul(x, y) - - x = paddle.ones(shape=[2, 2], dtype='float32') - y = paddle.ones(shape=[2, 2], dtype='float32') * 2 - x.stop_gradient = False - y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [x, y], create_graph=True) - print(jacobian) - # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [[2., 2., 0., 0.], - # [2., 2., 0., 0.], - # [0., 0., 2., 2.], - # [0., 0., 2., 2.]]), - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [[1., 0., 1., 0.], - # [0., 1., 0., 1.], - # [1., 0., 1., 0.], - # [0., 1., 0., 1.]])) - - Examples 3: - .. code-block:: python - - import paddle - - def func(x, y): - return paddle.matmul(x, y), x * x - - x = paddle.ones(shape=[2, 2], dtype='float32') - y = paddle.ones(shape=[2, 2], dtype='float32') * 2 - x.stop_gradient = False - y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [x, y], allow_unused=True) - print(jacobian) - # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 2., 0., 0.], - # [2., 2., 0., 0.], - # [0., 0., 2., 2.], - # [0., 0., 2., 2.]]), - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[1., 0., 1., 0.], - # [0., 1., 0., 1.], - # [1., 0., 1., 0.], - # [0., 1., 0., 1.]])), - # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 0., 0.], - # [0., 2., 0., 0.], - # [0., 0., 2., 0.], - # [0., 0., 0., 2.]]), None)) - - ''' - inputs = as_tensors(inputs) - outputs = as_tensors(func(*inputs)) - fin_size = len(inputs) - fout_size = len(outputs) - flat_outputs = tuple( - paddle.reshape(output, shape=[-1]) for output in outputs) - jacobian = tuple() - for i, flat_output in enumerate(flat_outputs): - jac_i = list([] for _ in range(fin_size)) - for k in range(len(flat_output)): - row_k = paddle.grad(flat_output[k], - inputs, - create_graph=create_graph, - retain_graph=True, - allow_unused=allow_unused) - for j in range(fin_size): - jac_i[j].append( - paddle.reshape(row_k[j], shape=[-1]) if isinstance( - row_k[j], paddle.Tensor) else None) - jacobian += (tuple( - _stack_tensor_or_return_none(jac_i_j) for jac_i_j in jac_i), ) - if fin_size == 1 and fout_size == 1: - return jacobian[0][0] - elif fin_size == 1 and fout_size != 1: - return tuple(jacobian[i][0] for i in range(fout_size)) - elif fin_size != 1 and fout_size == 1: - return jacobian[0] - else: - return jacobian - - 
-@framework.dygraph_only -def batch_jacobian(func, inputs, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the batch Jacobian matrix of `func` with respect to `inputs`. - Noted that the first dimension of inputs is batch size. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs(the first dimension is batch size) and - returns a Tensor or a Tensor tuple. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``, Noted that - the first dimension of inputs is batch size. - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - Jacobian (Tensor or nested tuple of Tensors): if function ``func`` - takes a Tensor as inputs and returns a Tensor as outputs, Jacobian - will be a single Tensor containing the Jacobian matrix for the - linearized inputs and outputs. If one of the inputs and outputs is - a Tensor, and another is a Tensor list/tuple, then the Jacobian will - be a tuple of Tensors. If both of inputs and outputs are Tensor - list/tuple, then the Jacobian will be a tuple of tuple of Tensors. - Noted that the first dimension of inputs is batch size. - - For example, - the inputs shape and outputs shape of function ``func` is [batch_size, num] - and [batch_size, num] respectively, then the Jacobian will be a Tensor with - a shape of [num, batch_size * num], where ``Jacobian[i][j]`` will contain - the Jacobian matrix of the ``i``th column output and the ``j``th input and - will have same dtype and device as the corresponding input. - Other situations can be deduced by analogy. - - Examples 1: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x): - return paddle.matmul(paddle.matmul(x, weight), y) - - x.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, x) - print(batch_jacobian) - # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[4., 4., 4., 4., 4., 4., 4., 4.], - # [4., 4., 4., 4., 4., 4., 4., 4.]]) - - Examples 2: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x): - return paddle.matmul(paddle.matmul(x, weight), y), x * x - - x.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, x) - print(batch_jacobian) - # (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[4., 4., 4., 4., 4., 4., 4., 4.], - # [4., 4., 4., 4., 4., 4., 4., 4.]]), Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]])) - - Examples 3: - .. 
code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x, y): - return x * y - - x.stop_gradient = False - y.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, [x, y]) - print(batch_jacobian) - # (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[1., 0., 1., 0., 1., 0., 1., 0.], - # [0., 1., 0., 1., 0., 1., 0., 1.]]), Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[1., 0., 1., 0., 1., 0., 1., 0.], - # [0., 1., 0., 1., 0., 1., 0., 1.]])) - - ''' - - inputs = as_tensors(inputs) - outputs = as_tensors(func(*inputs)) - - batch_size = inputs[0].shape[0] - for input in inputs: - assert input.shape[ - 0] == batch_size, "The first dimension of input should equals to the same batch size!" - for output in outputs: - assert output.shape[ - 0] == batch_size, "The first dimension of output should equals to the same batch size!" - fin_size = len(inputs) - fout_size = len(outputs) - flat_outputs = tuple( - paddle.reshape(output, shape=[batch_size, -1]) for output in outputs) - jacobian = tuple() - for i, flat_output in enumerate(flat_outputs): - jac_i = list([] for _ in range(fin_size)) - for k in range(flat_output.shape[1]): - - row_k = paddle.grad(flat_output[:, k], - inputs, - create_graph=create_graph, - retain_graph=True, - allow_unused=allow_unused) - - for j in range(fin_size): - jac_i[j].append( - paddle.reshape(row_k[j], shape=[-1]) if isinstance( - row_k[j], paddle.Tensor) else None) - jacobian += (tuple( - _stack_tensor_or_return_none(jac_i_j) for jac_i_j in jac_i), ) - if fin_size == 1 and fout_size == 1: - return jacobian[0][0] - elif fin_size == 1 and fout_size != 1: - return tuple(jacobian[i][0] for i in range(fout_size)) - elif fin_size != 1 and fout_size == 1: - return jacobian[0] - else: - return jacobian - - -@framework.dygraph_only -def batch_hessian(func, inputs, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the batch Hessian matrix of `func` with respect to `inputs`. - Noted that the first dimension of inputs is batch size. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs(the first dimension is batch size) and - returns a Tensor with shape [batch_size, 1]. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``. - Noted that the first dimension of inputs is batch size. - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - Hessian (Tensor or a tuple of tuple of Tensors): if function ``func`` - takes a Tensor as ``inputs``, Hessian will be a single Tensor containing - the Hessian matrix for the linearized ``inputs`` Tensor. If function - ``func`` takes a Tensor list/tuple as ``inputs``, then the Hessian will - be a tuple of tuple of Tensors. 
Noted that the first dimension of inputs - is batch size and the execution step is to obtain the result of the - first order differentiation, and then differentiate the batch input. - - For example, - the inputs shape and outputs shape of function ``func` is [batch_size, num] - and [batch_size, 1] respectively, then the batched Hessian will be a Tensor with - a shape of [num, batch_size * num]. - - Why the final shape in this case is that? - because batch_hessian will create a inner func(the wrapper of paddle.grad() func) - to computes the sum of gradients of `outputs` with respect to each `inputs`, - this inner func will get the first order differentiation and shape is [batch_size, num], - then call batch_jacobian to compute jacobian between the first order differentiation - and the origin inputs. The final result ``Hessian[i][j]`` will contain the Jacobian - matrix of the ``i``th column output(Noted that this output means the first order - differentiation) and the ``j``th input and will have same dtype and device as the - corresponding input. Other situations can be deduced by analogy. - - - Examples 1: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x): - return paddle.matmul(x * x, weight)[:, 0:1] - - - x.stop_gradient = False - batch_hessian = paddle.autograd.batch_hessian(func, x) - print(batch_hessian) - # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]]) - - Examples 2: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x, y): - return paddle.matmul(x * x * y * y, weight)[:, 0:1] - - x.stop_gradient = False - y.stop_gradient = False - batch_hessian = paddle.autograd.batch_hessian(func, [x, y]) - print(batch_hessian) - # ((Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]]), - # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[4., 0., 4., 0., 4., 0., 4., 0.], - # [0., 4., 0., 4., 0., 4., 0., 4.]])), - # (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[4., 0., 4., 0., 4., 0., 4., 0.], - # [0., 4., 0., 4., 0., 4., 0., 4.]]), - # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]]))) - - - Examples 3: - .. 
code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x, y): - return paddle.matmul(x * x, weight)[:, 0:1] - - x.stop_gradient = False - y.stop_gradient = False - batch_hessian = paddle.autograd.batch_hessian(func, [x, y], allow_unused=True) - print(batch_hessian) - # ((Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]]), None), (None, None)) - - ''' - inputs = as_tensors(inputs) - outputs = func(*inputs) - batch_size = inputs[0].shape[0] - for input in inputs: - assert input.shape[ - 0] == batch_size, "The first dimension of input should equals to the same batch size!" - assert isinstance(outputs, paddle.Tensor) and outputs.shape == [ - batch_size, 1 - ], "The function to compute batched Hessian matrix should return a Tensor of shape [batch_size, 1]" - - def jac_func(*ins): - grad_inputs = paddle.grad(outputs, - ins, - create_graph=True, - retain_graph=True, - allow_unused=allow_unused) - return tuple( - _replace_none_with_zero_tensor(grad_inputs[i], inputs[i]) - for i in range(len(inputs))) - - return batch_jacobian(jac_func, - inputs, - create_graph=create_graph, - allow_unused=allow_unused) - - -@framework.dygraph_only -def hessian(func, inputs, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the Hessian matrix of `func` with respect to `inputs`. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs and returns a Tensor with a single element. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``. - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - Hessian (Tensor or a tuple of tuple of Tensors): if function ``func`` - takes a Tensor as ``inputs``, Hessian will be a single Tensor containing - the Hessian matrix for the linearized ``inputs`` Tensor. If function - ``func`` takes a Tensor list/tuple as ``inputs``, then the Hessian will - be a tuple of tuple of Tensors where ``Hessian[i][j]`` will contain the - Hessian matrix of the ``i``th input and ``j``th input with size ``m * n``. - Here ``m`` and ``n`` denote the number of elements of the ``i`` th input - and the ``j`` th input respectively. - - Examples 1: - .. code-block:: python - - import paddle - - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - hessian = paddle.autograd.hessian(func, x) - print(hessian) - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 1., 1., 0.], - # [1., 0., 2., 1.], - # [1., 2., 0., 1.], - # [0., 1., 1., 2.]]) - - Examples 2: - .. 
code-block:: python - - import paddle - - def func(x, y): - return paddle.sum(paddle.matmul(x, y)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - y = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [x, y]) - print(hessian) - # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]), - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[1., 1., 0., 0.], - # [0., 0., 1., 1.], - # [1., 1., 0., 0.], - # [0., 0., 1., 1.]])), - # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[1., 0., 1., 0.], - # [1., 0., 1., 0.], - # [0., 1., 0., 1.], - # [0., 1., 0., 1.]]), - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]))) - - Examples 3: - .. code-block:: python - - import paddle - - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - y = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [x, y], allow_unused=True) - print(hessian) - # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 1., 1., 0.], - # [1., 0., 2., 1.], - # [1., 2., 0., 1.], - # [0., 1., 1., 2.]]), None), (None, None)) - - ''' - inputs = as_tensors(inputs) - outputs = func(*inputs) - assert isinstance(outputs, paddle.Tensor) and outputs.shape == [ - 1 - ], "The function to compute Hessian matrix should return a Tensor with a single element" - - def jac_func(*ins): - grad_inputs = paddle.grad(outputs, - ins, - create_graph=True, - retain_graph=True, - allow_unused=allow_unused) - return tuple( - _replace_none_with_zero_tensor(grad_inputs[i], inputs[i]) - for i in range(len(inputs))) - - return jacobian(jac_func, - inputs, - create_graph=create_graph, - allow_unused=allow_unused) - - -def vhp(func, inputs, v=None, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the product between a vector ``v`` and the - Hessian matrix of `func` with respect to `inputs`. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs and returns a Tensor with a single element. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``. - v (Tensor|list(Tensor)|tuple(Tensor)|None, optional): the vector used - to compute vector hessian product. ``v`` should have same shape - and dtype with ``inputs``. If ``v`` is None, it will be set as - Tensor|list(Tensor) with all elements 1. Defaults to "None". - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. 
- Returns: - output (tuple): tuple with: - func_output (Tensor): output of ``func(inputs)`` - vhp (list(Tensor)): result of the vector hessian product - with the same shape and dtype as the inputs. - Examples 1: - .. code-block:: python - import paddle - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - vx = paddle.ones(shape=[2, 2], dtype='float32') * 2 - vhp_rslt = paddle.autograd.vhp(func, x, v=vx) - print(vhp_rslt) - # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [8.]), - # Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[8., 8.], - # [8., 8.]])) - - Examples 2: - .. code-block:: python - import paddle - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - vhp_rslt = paddle.autograd.vhp(func, x) - print(vhp_rslt) - # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [8.]), - # Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[4., 4.], - # [4., 4.]])) - - Examples 3: - .. code-block:: python - import paddle - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - y = paddle.ones(shape=[2, 2], dtype='float32') - y.stop_gradient = False - vx = paddle.ones(shape=[2, 2], dtype='float32') * 2 - vy = paddle.ones(shape=[2, 2], dtype='float32') * 3 - vhp_rslt = paddle.autograd.vhp(func, [x, y], v=[vx, vy], allow_unused=True) - print(vhp_rslt) - # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [8.]), - # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[8., 8.], - # [8., 8.]]), None]) - ''' - xs = as_tensors(inputs) - if v is not None: - v = as_tensors(v) - xs, v = _separate(xs), _separate(v) - outputs = func(*xs) - ys = as_tensors(outputs) - assert len(ys) == 1 and isinstance( - ys[0], framework.Variable - ) and ys[0].shape == [ - 1 - ], "The function to compute vhp should return a Tensor with a single element" - jac = _grad(ys, xs) - vhp = _grad(jac, xs, v) - return outputs, vhp diff --git a/python/paddle/autograd/utils.py b/python/paddle/autograd/utils.py deleted file mode 100644 index 6b8865f4d7df012e902e5e24db57a32bec708074..0000000000000000000000000000000000000000 --- a/python/paddle/autograd/utils.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import typing - -from paddle.fluid import framework - - -def as_tensors(xs): - if isinstance(xs, framework.Variable): - return (xs, ) - elif isinstance(xs, typing.Sequence): - return tuple(xs) - else: - return xs diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index c37ac87da71b807e3d0b1c19f87555d7e9d3642e..5ed01a01144215e2cbd2ea5d383f3744f2a12ffa 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -2211,12 +2211,6 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None): check_type(target_gradients, 'target_gradients', (framework.Variable, list, tuple, type(None)), 'paddle.static.gradients') - - from ..incubate.autograd.primx import _gradients - from ..incubate.autograd.utils import prim_enabled - if prim_enabled(): - return _gradients(targets, inputs, target_gradients) - outs = calc_gradient(targets, inputs, target_gradients, no_grad_set) return _as_list(outs) diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt index 832ecc61ee19062194b66dcced271f586f3b4bdb..45c0a08efe828523ee2813653fbd6e4829e63028 100644 --- a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -17,7 +17,7 @@ endforeach() set_tests_properties(test_autograd_functional_dynamic PROPERTIES TIMEOUT 200) set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 160) -set_tests_properties(test_gradients_and_minimize PROPERTIES TIMEOUT 60) +set_tests_properties(test_minimize PROPERTIES TIMEOUT 60) if(NOT WIN32) set_tests_properties(test_autograd_functional_prim PROPERTIES TIMEOUT 60) endif() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py index a98b509f963c7c074427c0ec05a0a52b8b203a1c..6c67b78d6a53924cd23be185d5ec16f24a5d20a2 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py @@ -21,7 +21,7 @@ import paddle import paddle.fluid as fluid import paddle.compat as cpt import paddle.nn.functional as F -from paddle.autograd.utils import as_tensors +from paddle.incubate.autograd.utils import as_tensors from paddle.fluid.framework import _test_eager_guard, _in_legacy_dygraph, _in_eager_without_dygraph_check import config @@ -78,9 +78,9 @@ class TestAutogradFunctional(unittest.TestCase): xs = self.gen_inputs(inputs) if v is not None: v = self.gen_inputs(v) - outputs, inputs_grad = paddle.autograd.vjp(func, xs, v) + outputs, inputs_grad = paddle.incubate.autograd.vjp(func, xs, v) else: - outputs, inputs_grad = paddle.autograd.vjp(func, xs) + outputs, inputs_grad = paddle.incubate.autograd.vjp(func, xs) return outputs, inputs_grad def grad_test(): @@ -116,14 +116,14 @@ class TestAutogradFunctional(unittest.TestCase): xs = self.gen_inputs(inputs) if v is not None: v = self.gen_inputs(v) - outputs, outputs_grad = paddle.autograd.jvp( + outputs, outputs_grad = paddle.incubate.autograd.jvp( func, xs, v, create_graph=create_graph, allow_unused=allow_unused) else: - outputs, outputs_grad = paddle.autograd.jvp( + outputs, outputs_grad = paddle.incubate.autograd.jvp( func, xs, create_graph=create_graph, @@ -233,8 +233,8 @@ class TestVJPException(unittest.TestCase): def func_vjp(self): with self.assertRaises(self.expected_exception): - 
paddle.autograd.vjp(self.fun, paddle.to_tensor(self.xs), - paddle.to_tensor(self.v)) + paddle.incubate.autograd.vjp(self.fun, paddle.to_tensor(self.xs), + paddle.to_tensor(self.v)) def test_all_cases(self): with _test_eager_guard(): @@ -243,8 +243,10 @@ class TestVJPException(unittest.TestCase): def jac(grad_fn, f, inputs): - assert grad_fn in [paddle.autograd.vjp, paddle.autograd.jvp] - if grad_fn is paddle.autograd.jvp: + assert grad_fn in [ + paddle.incubate.autograd.vjp, paddle.incubate.autograd.jvp + ] + if grad_fn is paddle.incubate.autograd.jvp: vs = [paddle.zeros_like(x) for x in inputs] else: outputs = f(*inputs) @@ -265,7 +267,7 @@ def jac(grad_fn, f, inputs): JJ_cols.append(d_outs) # JJ is the fully unrolled jacobian JJ = paddle.stack(JJ_cols) - if grad_fn is paddle.autograd.vjp: + if grad_fn is paddle.incubate.autograd.vjp: JJ = JJ.t() return JJ @@ -279,8 +281,8 @@ class TestJVP(TestAutogradFunctional): ] # noqa for f, inputs in test_cases: inputs = self.gen_inputs(inputs) - forward_jac = jac(paddle.autograd.jvp, f, inputs) - reverse_jac = jac(paddle.autograd.vjp, f, inputs) + forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) + reverse_jac = jac(paddle.incubate.autograd.vjp, f, inputs) self.check_results(forward_jac, reverse_jac) def func_jvp_i2o1(self): @@ -289,8 +291,8 @@ class TestJVP(TestAutogradFunctional): ] # noqa for f, inputs in test_cases: inputs = self.gen_inputs(inputs) - forward_jac = jac(paddle.autograd.jvp, f, inputs) - reverse_jac = jac(paddle.autograd.vjp, f, inputs) + forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) + reverse_jac = jac(paddle.incubate.autograd.vjp, f, inputs) self.check_results(forward_jac, reverse_jac) def func_jvp_i2o2(self): @@ -299,8 +301,8 @@ class TestJVP(TestAutogradFunctional): ] # noqa for f, inputs in test_cases: inputs = self.gen_inputs(inputs) - forward_jac = jac(paddle.autograd.jvp, f, inputs) - reverse_jac = jac(paddle.autograd.vjp, f, inputs) + forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) + reverse_jac = jac(paddle.incubate.autograd.vjp, f, inputs) self.check_results(forward_jac, reverse_jac) def func_jvp_i2o2_omitting_v(self): @@ -309,9 +311,9 @@ class TestJVP(TestAutogradFunctional): ] # noqa for f, inputs in test_cases: inputs = self.gen_inputs(inputs) - results_omitting_v = paddle.autograd.jvp(f, inputs) + results_omitting_v = paddle.incubate.autograd.jvp(f, inputs) v = [paddle.ones_like(x) for x in inputs] - results_with_v = paddle.autograd.jvp(f, inputs, v) + results_with_v = paddle.incubate.autograd.jvp(f, inputs, v) self.check_results(results_omitting_v, results_with_v) def test_all_cases(self): @@ -334,7 +336,7 @@ class TestJVP(TestAutogradFunctional): ('multi_in_single_out', paddle.matmul, (np.random.rand(2, 2), np.random.rand(2, 2))), )) -class TestJacobianClassNoBatch(unittest.TestCase): +class TestJacobianNoBatch(unittest.TestCase): def setUp(self): self._dtype = self.xs[0].dtype if isinstance( @@ -349,7 +351,7 @@ class TestJacobianClassNoBatch(unittest.TestCase): def func_jacobian(self): xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( self.xs, typing.Sequence) else paddle.to_tensor(self.xs) - self._actual = paddle.autograd.Jacobian(self.func, xs, False) + self._actual = paddle.incubate.autograd.Jacobian(self.func, xs, False) self._expected = self._get_expected() Index = collections.namedtuple('Index', ('type', 'value')) @@ -387,7 +389,7 @@ class TestJacobianClassNoBatch(unittest.TestCase): ('3d_in_3d_out', utils.square, np.random.rand(2, 3, 4)), ('multi_in_single_out', 
utils.square, np.random.rand(2, 3)), )) -class TestJacobianClassBatchFirst(unittest.TestCase): +class TestJacobianBatchFirst(unittest.TestCase): def setUp(self): self._dtype = self.xs[0].dtype if isinstance( @@ -402,7 +404,7 @@ class TestJacobianClassBatchFirst(unittest.TestCase): def func_jacobian(self): xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( self.xs, typing.Sequence) else paddle.to_tensor(self.xs) - self._actual = paddle.autograd.Jacobian(self.func, xs, True) + self._actual = paddle.incubate.autograd.Jacobian(self.func, xs, True) self._expected = self._get_expected() Index = collections.namedtuple('Index', ('type', 'value')) @@ -444,7 +446,7 @@ class TestJacobianClassBatchFirst(unittest.TestCase): self.func_jacobian() -class TestHessianClassNoBatch(unittest.TestCase): +class TestHessianNoBatch(unittest.TestCase): @classmethod def setUpClass(self): @@ -470,7 +472,7 @@ class TestHessianClassNoBatch(unittest.TestCase): numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False - hessian = paddle.autograd.Hessian(func, self.x) + hessian = paddle.incubate.autograd.Hessian(func, self.x) np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian, self.rtol, self.atol) @@ -484,7 +486,7 @@ class TestHessianClassNoBatch(unittest.TestCase): numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False self.y.stop_gradient = False - hessian = paddle.autograd.Hessian(func, [self.x, self.y]) + hessian = paddle.incubate.autograd.Hessian(func, [self.x, self.y]) np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian, rtol=self.rtol, @@ -500,7 +502,7 @@ class TestHessianClassNoBatch(unittest.TestCase): numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False self.y.stop_gradient = False - hessian = paddle.autograd.Hessian(func, [self.x, self.y]) + hessian = paddle.incubate.autograd.Hessian(func, [self.x, self.y]) np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian, self.rtol, self.atol) @@ -514,7 +516,7 @@ class TestHessianClassNoBatch(unittest.TestCase): func, self.x, self.numerical_delta, self.np_dtype) numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False - hessian = paddle.autograd.Hessian(func, self.x) + hessian = paddle.incubate.autograd.Hessian(func, self.x) assert hessian[:].stop_gradient == False np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian, self.rtol, self.atol) @@ -526,7 +528,7 @@ class TestHessianClassNoBatch(unittest.TestCase): return x * x with self.assertRaises(RuntimeError): - paddle.autograd.Hessian(func, paddle.ones([3])) + paddle.incubate.autograd.Hessian(func, paddle.ones([3])) def test_all_cases(self): with _test_eager_guard(): @@ -544,7 +546,7 @@ class TestHessianClassNoBatch(unittest.TestCase): self.func_out_not_single() -class TestHessianClassBatchFirst(unittest.TestCase): +class TestHessianBatchFirst(unittest.TestCase): @classmethod def setUpClass(self): @@ -572,7 +574,7 @@ class TestHessianClassBatchFirst(unittest.TestCase): expected = utils._compute_numerical_batch_hessian( func, self.x, self.numerical_delta, self.np_dtype) - H = paddle.autograd.Hessian(func, self.x, is_batched=True) + H = paddle.incubate.autograd.Hessian(func, self.x, is_batched=True) actual = utils._np_transpose_matrix_format(H[:].numpy(), utils.MatrixFormat.BNM, utils.MatrixFormat.NBM) @@ -596,7 +598,8 @@ class TestHessianClassBatchFirst(unittest.TestCase): 
self.x.stop_gradient = False self.y.stop_gradient = False - H = paddle.autograd.Hessian(func, [self.x, self.y], is_batched=True) + H = paddle.incubate.autograd.Hessian(func, [self.x, self.y], + is_batched=True) actual = utils._np_transpose_matrix_format(H[:].numpy(), utils.MatrixFormat.BNM, utils.MatrixFormat.NBM) @@ -620,8 +623,8 @@ class TestHessianClassBatchFirst(unittest.TestCase): utils.MatrixFormat.NBM, utils.MatrixFormat.BNM) - actual = paddle.autograd.Hessian(func, [self.x, self.y], - is_batched=True)[:] + actual = paddle.incubate.autograd.Hessian(func, [self.x, self.y], + is_batched=True)[:] np.testing.assert_allclose(actual, expected, @@ -638,7 +641,7 @@ class TestHessianClassBatchFirst(unittest.TestCase): x = self.x.clone() x.stop_gradient = True - H = paddle.autograd.Hessian(func, self.x, is_batched=True)[:] + H = paddle.incubate.autograd.Hessian(func, self.x, is_batched=True)[:] actual = utils._np_transpose_matrix_format(H[:].numpy(), utils.MatrixFormat.BNM, utils.MatrixFormat.NBM) @@ -652,7 +655,9 @@ class TestHessianClassBatchFirst(unittest.TestCase): return (x * x) with self.assertRaises(RuntimeError): - paddle.autograd.Hessian(func, paddle.ones((3, 3)), is_batched=True) + paddle.incubate.autograd.Hessian(func, + paddle.ones((3, 3)), + is_batched=True) def test_all_cases(self): with _test_eager_guard(): @@ -670,829 +675,5 @@ class TestHessianClassBatchFirst(unittest.TestCase): self.func_out_not_single() -class TestHessian(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.shape = (2, 2) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - - def func_single_input(self): - - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - numerical_hessian = _compute_numerical_hessian(func, self.x, - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - hessian = paddle.autograd.hessian(func, self.x) - np.testing.assert_allclose(hessian.numpy(), numerical_hessian[0][0], - self.rtol, self.atol) - - def func_multi_input(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, y)) - - numerical_hessian = _compute_numerical_hessian(func, [self.x, self.y], - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [self.x, self.y]) - for i in range(len(hessian)): - for j in range(len(hessian[0])): - np.testing.assert_allclose(hessian[i][j].numpy(), - numerical_hessian[i][j], self.rtol, - self.atol) - - def func_allow_unused_false(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - try: - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [self.x, self.y]) - except ValueError as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("allow_unused") > 0 - - def func_allow_unused_true(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - numerical_hessian = _compute_numerical_hessian(func, [self.x, self.y], - self.numerical_delta, - self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [self.x, self.y], - 
allow_unused=True) - for i in range(len(hessian)): - for j in range(len(hessian[0])): - if i == j == 0: - np.testing.assert_allclose(hessian[i][j].numpy(), - numerical_hessian[i][j], - self.rtol, self.atol) - else: - assert hessian[i][j] is None - - def func_create_graph_false(self): - - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - numerical_hessian = _compute_numerical_hessian(func, self.x, - self.numerical_delta, - self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.hessian(func, self.x) - assert hessian.stop_gradient == True - np.testing.assert_allclose(hessian.numpy(), numerical_hessian[0][0], - self.rtol, self.atol) - try: - paddle.grad(hessian, self.x) - except Exception as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 or error_msg.find( - "does not appear") > 0 - - def func_create_graph_true(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - - def func(x): - return paddle.sum(F.sigmoid(x)) - - numerical_hessian = _compute_numerical_hessian(func, self.x, - self.numerical_delta, - self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.hessian(func, self.x, create_graph=True) - assert hessian.stop_gradient == False - np.testing.assert_allclose(hessian.numpy(), numerical_hessian[0][0], - self.rtol, self.atol) - triple_grad = paddle.grad(hessian, self.x) - assert triple_grad is not None - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - - -class TestHessianFloat64(TestHessian): - - @classmethod - def setUpClass(self): - self.shape = (2, 2) - self.dtype = 'float64' - self.np_dtype = np.float64 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - - -class TestBatchHessian(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.x_shape = (5, 2) - self.weight_shape = (2, 4) - self.y_shape = (5, 2) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) - self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype) - - def func_single_input(self): - - def func(x): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, self.x, create_graph=True) - np.testing.assert_allclose(hessian, 
numerical_hessian, self.rtol, - self.atol) - - def func_multi_input(self): - - def func(x, y): - return paddle.matmul(x * x * y * y, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, [self.x, self.y]) - - shape_tensor = paddle.to_tensor(numerical_hessian).astype("float64") - hessian_reshape = np.reshape(hessian, (shape_tensor.shape)) - np.testing.assert_allclose(hessian_reshape, numerical_hessian, - self.rtol, self.atol) - - def func_allow_unused_false(self): - - def func(x, y): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - try: - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, [self.x, self.y]) - except ValueError as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("allow_unused") > 0 - - def func_allow_unused_true(self): - - def func(x, y): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, [self.x, self.y], - allow_unused=True) - - for i in range(len(hessian)): - for j in range(len(hessian[0])): - if i == j == 0: - numerical_hessian = np.stack( - (numerical_hessian[i][j], numerical_hessian[i][j + 1]), - axis=0) - np.testing.assert_allclose(hessian[i][j], numerical_hessian, - self.rtol, self.atol) - else: - assert hessian[i][j] is None - - def func_create_graph_false(self): - - def func(x): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, self.x) - assert hessian.stop_gradient == True - np.testing.assert_allclose(hessian.numpy(), numerical_hessian, - self.rtol, self.atol) - try: - paddle.grad(hessian, self.x) - except Exception as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 or error_msg.find( - "does not appear") > 0 - - def func_create_graph_true(self): - - def func(x): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, self.x, create_graph=True) - assert hessian.stop_gradient == False - np.testing.assert_allclose(hessian.numpy(), numerical_hessian, - self.rtol, self.atol) - triple_grad = paddle.grad(hessian, self.x) - assert triple_grad is not None - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - - -class TestBatchHessianFloat64(TestBatchHessian): - - @classmethod - def setUpClass(self): - self.x_shape = (5, 2) - self.weight_shape = (2, 4) - self.y_shape = (5, 2) - self.dtype = 'float64' - self.np_dtype = np.float64 - self.numerical_delta 
= config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) - self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype) - - -class TestVHP(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.shape = (2, 2) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - self.vx = paddle.rand(shape=self.shape, dtype=self.dtype) - self.vy = paddle.rand(shape=self.shape, dtype=self.dtype) - - def func_single_input(self): - - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - numerical_func_output = func(self.x).numpy() - numerical_vhp = _compute_numerical_vhp(func, self.x, self.vx, - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, self.x, self.vx) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - np.testing.assert_allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, - self.atol) - - def func_multi_input(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, y)) - - numerical_func_output = func(self.x, self.y).numpy() - numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y], - [self.vx, self.vy], - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y], - [self.vx, self.vy]) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - for i in range(len(vhp)): - np.testing.assert_allclose(vhp[i].numpy(), numerical_vhp[i], - self.rtol, self.atol) - - def func_v_default(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, y)) - - numerical_func_output = func(self.x, self.y).numpy() - vx = paddle.ones(self.vx.shape, dtype=self.vx.dtype) - vy = paddle.ones(self.vy.shape, dtype=self.vy.dtype) - numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y], [vx, vy], - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y]) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - for i in range(len(vhp)): - np.testing.assert_allclose(vhp[i].numpy(), numerical_vhp[i], - self.rtol, self.atol) - - def func_allow_unused_true(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - numerical_func_output = func(self.x, self.y).numpy() - numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y], - [self.vx, self.vy], - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y], - [self.vx, self.vy]) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - 
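The TestVHP cases deleted here checked a vector-Hessian product against finite differences. With the class-based Hessian that remains in the incubate namespace, the same product can be formed explicitly; the quadratic reducer and the shape [3] below are assumptions used only for illustration.

import paddle

def reducer(x):
    return paddle.sum(x * x)

x = paddle.rand([3])
x.stop_gradient = False
v = paddle.rand([3])

# Full Hessian of a scalar-valued function via the retained class API.
H = paddle.incubate.autograd.Hessian(reducer, x)[:]
# v^T H reproduces what the removed vhp tests verified numerically
# (for this reducer, H = 2 * I, so the product is simply 2 * v).
vhp = paddle.matmul(v, H)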
np.testing.assert_allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, - self.atol) - - def func_create_graph_true(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - - def func(x): - return paddle.sum(F.sigmoid(x)) - - numerical_func_output = func(self.x).numpy() - numerical_vhp = _compute_numerical_vhp(func, self.x, self.vx, - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, self.x, self.vx) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - assert vhp[0].stop_gradient == False - np.testing.assert_allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, - self.atol) - triple_grad = paddle.grad(vhp, self.x) - assert triple_grad is not None - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_v_default() - self.func_multi_input() - self.func_single_input() - self.func_allow_unused_true() - self.func_create_graph_true() - self.setUpClass() - self.func_v_default() - self.func_multi_input() - self.func_single_input() - self.func_allow_unused_true() - self.func_create_graph_true() - - -class TestJacobian(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.shape = (4, 4) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = 1e-4 - self.rtol = 1e-3 - self.atol = 1e-3 - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - - def func_single_input_and_single_output(self): - - def func(x): - return paddle.matmul(x, x) - - numerical_jacobian = _compute_numerical_jacobian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, self.x) - np.testing.assert_allclose(jacobian.numpy(), numerical_jacobian[0][0], - self.rtol, self.atol) - - def func_single_input_and_multi_output(self): - - def func(x): - return paddle.matmul(x, x), x * x - - numerical_jacobian = _compute_numerical_jacobian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, self.x) - for i in range(len(jacobian)): - np.testing.assert_allclose(jacobian[i].numpy(), - numerical_jacobian[i][0], self.rtol, - self.atol) - - def func_multi_input_and_single_output(self): - - def func(x, y): - return paddle.matmul(x, y) - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) - for j in range(len(jacobian)): - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - - def func_multi_input_and_multi_output(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - - def func(x, y): - return paddle.matmul(x, y), x * y - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) - for i in range(len(jacobian)): - for j in range(len(jacobian[0])): - np.testing.assert_allclose(jacobian[i][j].numpy(), - numerical_jacobian[i][j], self.rtol, - self.atol) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) - - def func_allow_unused_false(self): - - def 
func(x, y): - return paddle.matmul(x, x) - - try: - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) - except ValueError as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("allow_unused") > 0 - - def func_allow_unused_true(self): - - def func(x, y): - return paddle.matmul(x, x) - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y], - allow_unused=True) - np.testing.assert_allclose(jacobian[0].numpy(), - numerical_jacobian[0][0], self.rtol, - self.atol) - assert jacobian[1] is None - - def func_create_graph_false(self): - - def func(x, y): - return paddle.matmul(x, y) - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) - for j in range(len(jacobian)): - assert jacobian[j].stop_gradient == True - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - try: - paddle.grad(jacobian[0], [self.x, self.y]) - except Exception as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 or error_msg.find( - "does not appear") > 0 - - def func_create_graph_true(self): - - def func(x, y): - return paddle.matmul(x, y) - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y], - create_graph=True) - for j in range(len(jacobian)): - assert jacobian[j].stop_gradient == False - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - double_grad = paddle.grad(jacobian[0], [self.x, self.y]) - assert double_grad is not None - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_multi_input_and_multi_output() - self.func_multi_input_and_single_output() - self.func_single_input_and_multi_output() - self.func_single_input_and_single_output() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - self.setUpClass() - self.func_multi_input_and_multi_output() - self.func_multi_input_and_single_output() - self.func_single_input_and_multi_output() - self.func_single_input_and_single_output() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - - -class TestJacobianFloat64(TestJacobian): - - @classmethod - def setUpClass(self): - self.shape = (4, 4) - self.dtype = 'float64' - self.np_dtype = np.float64 - self.numerical_delta = 1e-7 - self.rtol = 1e-7 - self.atol = 1e-7 - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - - -class TestJacobianBatch(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.x_shape = (4, 2) - self.weight_shape = (2, 4) - self.y_shape = (4, 2) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = 1e-4 - self.rtol = 1e-3 - self.atol = 1e-3 - self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) - self.weight = 
paddle.rand(shape=self.weight_shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype) - - def func_batch_single_input_and_batch_single_output(self): - - def func(x): - return paddle.matmul(paddle.matmul(x, self.weight), self.y) - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian( - func, - self.x, - ) - - self.assertTrue( - np.allclose(batch_jacobian.numpy().all(), - numerical_jacobian[0][0].all())) - - def func_batch_single_input_and_batch_multi_output(self): - - def func(x): - return paddle.matmul(paddle.matmul(x, self.weight), self.y), x * x - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian( - func, - self.x, - ) - - for i in range(len(batch_jacobian)): - np.testing.assert_allclose(batch_jacobian[i].numpy(), - numerical_jacobian[i][0], self.rtol, - self.atol) - - def func_batch_multi_input_and_batch_single_output(self): - - def func(x, y): - return x * y - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y]) - - for j in range(len(batch_jacobian)): - np.testing.assert_allclose(batch_jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - - def func_batch_multi_input_and_batch_multi_output(self): - - def func(x, y): - return x * y, x * y - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y]) - - for i in range(len(batch_jacobian)): - np.testing.assert_allclose(batch_jacobian[i], numerical_jacobian[i], - self.rtol, self.atol) - - def func_allow_unused_false(self): - - def func(x, y): - return x * x - - try: - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y]) - except ValueError as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("allow_unused") > 0 - - def func_allow_unused_true(self): - - def func(x, y): - return x * x - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y], - allow_unused=True) - - np.testing.assert_allclose(jacobian[0].numpy(), - numerical_jacobian[0][0], self.rtol, - self.atol) - assert jacobian[1] is None - - def func_create_graph_false(self): - - def func(x, y): - return x * y - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y]) - for j in range(len(jacobian)): - assert jacobian[j].stop_gradient == True - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - try: - paddle.grad(jacobian[0], [self.x, self.y]) - except Exception as e: - error_msg = cpt.get_exception_message(e) - assert 
error_msg.find("has no gradient") > 0 or error_msg.find( - "does not appear") > 0 - - def func_create_graph_true(self): - - def func(x, y): - return x * y - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y], - create_graph=True) - for j in range(len(jacobian)): - assert jacobian[j].stop_gradient == False - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - double_grad = paddle.grad(jacobian[0], [self.x, self.y]) - assert double_grad is not None - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_batch_single_input_and_batch_single_output() - self.func_batch_single_input_and_batch_multi_output() - self.func_batch_multi_input_and_batch_single_output() - self.func_batch_multi_input_and_batch_multi_output() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - self.setUpClass() - self.func_batch_single_input_and_batch_single_output() - self.func_batch_single_input_and_batch_multi_output() - self.func_batch_multi_input_and_batch_single_output() - self.func_batch_multi_input_and_batch_multi_output() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - - -class TestJacobianBatchFloat64(TestJacobianBatch): - - @classmethod - def setUpClass(self): - self.x_shape = (12, 2) - self.weight_shape = (2, 12) - self.y_shape = (12, 2) - self.dtype = 'float64' - self.np_dtype = np.float64 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get('second_order_grad').get('eps') - self.rtol = config.TOLERANCE.get( - self.dtype).get('second_order_grad').get('rtol') - self.atol = config.TOLERANCE.get( - self.dtype).get('second_order_grad').get('atol') - self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) - self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_prim.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_prim.py index f75460df6b52dcf44e2a426920ebd155d66cc76b..d17420c90454638f4b9fab9273df1d0e7b34ecbd 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_prim.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_prim.py @@ -145,5 +145,130 @@ class TestHessianPrim(unittest.TestCase): atol=self._atol) +@utils.place(config.DEVICES) +@utils.parameterize((utils.TEST_CASE_NAME, 'fun', 'args', 'dtype'), ( + ('unary_float32', paddle.tanh, (np.random.rand(2, 3), ), 'float32'), + ('binary_float32', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), 'float32'), + ('unary_float64', paddle.tanh, (np.random.rand(2, 3), ), 'float64'), + ('binary_float64', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), 'float64'), +)) +class TestJvpPrim(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.args = [arg.astype(cls.dtype) for arg in cls.args] + cls._rtol = config.TOLERANCE.get( + cls.dtype).get('first_order_grad').get('rtol') + cls._atol = config.TOLERANCE.get( + cls.dtype).get('first_order_grad').get('atol') + + def setUp(self): + paddle.enable_static() 
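The batch_jacobian tests removed above have a counterpart in the class-based API exercised earlier in this file: passing is_batched=True keeps the leading batch axis. A small dynamic-graph sketch, with the elementwise function and the (4, 3) shape chosen only as placeholders.

import paddle

def f(x):
    return x * x          # elementwise, so each per-sample Jacobian is diagonal

x = paddle.rand([4, 3])   # a batch of 4 samples with 3 features each
x.stop_gradient = False

J = paddle.incubate.autograd.Jacobian(f, x, is_batched=True)
print(J.shape)            # (4, 3, 3): (batch, flattened outputs, flattened inputs)
full = J[:]               # materialize the whole batched Jacobian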
+ paddle.incubate.autograd.enable_prim() + + def tearDown(self): + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + + def test_jacobian_prim(self): + + def wrapper(fun, args): + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.static.program_guard(mp, sp): + static_args = [ + paddle.static.data(f'arg{i}', arg.shape, self.dtype) + for i, arg in enumerate(args) + ] + for arg in static_args: + arg.stop_gradient = False + _, jvp_res = paddle.incubate.autograd.jvp(fun, static_args) + if paddle.incubate.autograd.prim_enabled(): + paddle.incubate.autograd.prim2orig() + exe = paddle.static.Executor() + exe.run(sp) + jvp_res = exe.run( + mp, + feed={f'arg{i}': arg + for i, arg in enumerate(args)}, + fetch_list=[jvp_res]) + return jvp_res + + paddle.incubate.autograd.enable_prim() + prim_jvp = wrapper(self.fun, self.args) + paddle.incubate.autograd.disable_prim() + orig_jvp = wrapper(self.fun, self.args) + + np.testing.assert_allclose(orig_jvp, + prim_jvp, + rtol=self._rtol, + atol=self._atol) + + +@utils.place(config.DEVICES) +@utils.parameterize((utils.TEST_CASE_NAME, 'fun', 'args', 'dtype'), ( + ('unary_float32', paddle.tanh, (np.random.rand(2, 3), ), 'float32'), + ('binary_float32', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), 'float32'), + ('unary_float64', paddle.tanh, (np.random.rand(2, 3), ), 'float64'), + ('binary_float64', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), 'float64'), +)) +class TestVjpPrim(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.args = [arg.astype(cls.dtype) for arg in cls.args] + cls._rtol = config.TOLERANCE.get( + cls.dtype).get('first_order_grad').get('rtol') + cls._atol = config.TOLERANCE.get( + cls.dtype).get('first_order_grad').get('atol') + + def setUp(self): + paddle.enable_static() + paddle.incubate.autograd.enable_prim() + + def tearDown(self): + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + + def test_jacobian_prim(self): + + def wrapper(fun, args): + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.static.program_guard(mp, sp): + static_args = [ + paddle.static.data(f'arg{i}', arg.shape, self.dtype) + for i, arg in enumerate(args) + ] + for arg in static_args: + arg.stop_gradient = False + _, vjp_res = paddle.incubate.autograd.vjp(fun, static_args) + if paddle.incubate.autograd.prim_enabled(): + paddle.incubate.autograd.prim2orig() + exe = paddle.static.Executor() + exe.run(sp) + vjp_res = exe.run( + mp, + feed={f'arg{i}': arg + for i, arg in enumerate(args)}, + fetch_list=[vjp_res]) + return vjp_res + + paddle.incubate.autograd.enable_prim() + prim_vjp = wrapper(self.fun, self.args) + paddle.incubate.autograd.disable_prim() + orig_vjp = wrapper(self.fun, self.args) + + for orig, prim in zip(orig_vjp, prim_vjp): + np.testing.assert_allclose(orig, + prim, + rtol=self._rtol, + atol=self._atol) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py index 4e01ad5382c91617c1829d0c417bed2af5899843..9b2098d37b8826f5d48a7c837ab498ebf7cdf29d 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py @@ -59,7 +59,8 @@ class TestVJP(unittest.TestCase): with paddle.static.program_guard(mp, sp): feed, static_xs, static_v = 
utils.gen_static_data_and_feed( self.xs, self.v, stop_gradient=self.stop_gradient) - ys, xs_grads = paddle.autograd.vjp(self.fun, static_xs, static_v) + ys, xs_grads = paddle.incubate.autograd.vjp(self.fun, static_xs, + static_v) exe.run(sp) return exe.run(mp, feed=feed, fetch_list=[ys, xs_grads]) @@ -103,7 +104,8 @@ class TestVJPException(unittest.TestCase): with paddle.static.program_guard(mp, sp): feed, static_xs, static_v = utils.gen_static_data_and_feed( self.xs, self.v) - ys, xs_grads = paddle.autograd.vjp(self.fun, static_xs, static_v) + ys, xs_grads = paddle.incubate.autograd.vjp(self.fun, static_xs, + static_v) self.exe.run(sp) return self.exe.run(mp, feed, fetch_list=[ys, xs_grads]) @@ -214,7 +216,7 @@ class TestJacobianFloat32(unittest.TestCase): startup = fluid.Program() with fluid.program_guard(main, startup): xs = make_tensors(inps) - JJ = paddle.autograd.functional.Jacobian(pd_f, xs, is_batched=batch) + JJ = paddle.incubate.autograd.Jacobian(pd_f, xs, is_batched=batch) if batch: _, nrow, ncol = JJ.shape else: @@ -244,7 +246,7 @@ class TestJacobianFloat32(unittest.TestCase): startup = fluid.Program() with fluid.program_guard(main, startup): xs = make_tensors(inps) - JJ = paddle.autograd.functional.Jacobian(pd_f, xs, is_batched=batch) + JJ = paddle.incubate.autograd.Jacobian(pd_f, xs, is_batched=batch) if batch: nbatch, nrow, ncol = JJ.shape rows = [JJ[:, i, :] for i in range(nrow)] @@ -269,7 +271,7 @@ class TestJacobianFloat32(unittest.TestCase): startup = fluid.Program() with fluid.program_guard(main, startup): xs = make_tensors(inps) - JJ = paddle.autograd.functional.Jacobian(pd_f, xs, is_batched=batch) + JJ = paddle.incubate.autograd.Jacobian(pd_f, xs, is_batched=batch) if batch: nbatch, nrow, ncol = JJ.shape entries = [ @@ -390,7 +392,7 @@ class TestHessianFloat32(unittest.TestCase): startup = fluid.Program() with fluid.program_guard(main, startup): xs = make_tensors(inps) - HH = paddle.autograd.functional.Hessian(pd_f, xs, is_batched=batch) + HH = paddle.incubate.autograd.Hessian(pd_f, xs, is_batched=batch) nrow, ncol = HH.shape full_hessian = HH[:] exe = fluid.Executor(self.place) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py b/python/paddle/fluid/tests/unittests/autograd/test_minimize.py similarity index 56% rename from python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py rename to python/paddle/fluid/tests/unittests/autograd/test_minimize.py index 67ebe01d9f0278510affd02fc50f54b391e06dc4..10259802c6933392a318df8a7a5f90d1eac1ae2a 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_minimize.py @@ -13,82 +13,16 @@ # limitations under the License. 
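The static-mode wrappers above all follow one recipe: build a program, take the gradient with the incubate API, lower primitive ops with prim2orig when prim is enabled, then execute. A condensed sketch of that flow, with paddle.tanh and the (2, 3) input shape assumed only for demonstration.

import numpy as np
import paddle

paddle.enable_static()
paddle.incubate.autograd.enable_prim()

main, startup = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main, startup):
    x = paddle.static.data('x', shape=[2, 3], dtype='float32')
    x.stop_gradient = False
    _, x_grad = paddle.incubate.autograd.vjp(paddle.tanh, x)
    if paddle.incubate.autograd.prim_enabled():
        paddle.incubate.autograd.prim2orig()   # lower prim ops before running

exe = paddle.static.Executor()
exe.run(startup)
grad_val, = exe.run(main,
                    feed={'x': np.random.rand(2, 3).astype('float32')},
                    fetch_list=[x_grad])

paddle.incubate.autograd.disable_prim()
paddle.disable_static()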
import unittest -import numpy as np +import numpy as np import paddle from paddle.incubate.autograd.primx import prim2orig -from paddle.incubate.autograd.utils import enable_prim, disable_prim, prim_enabled +from paddle.incubate.autograd.utils import (disable_prim, enable_prim, + prim_enabled) paddle.enable_static() -class TestGradients(unittest.TestCase): - - def test_third_order(self): - enable_prim() - main = paddle.static.Program() - startup = paddle.static.Program() - with paddle.static.program_guard(main, startup): - x = paddle.static.data(name='x', shape=[1], dtype='float32') - x2 = paddle.multiply(x, x) - x3 = paddle.multiply(x2, x) - x4 = paddle.multiply(x3, x) - - grad1, = paddle.static.gradients([x4], [x]) - grad2, = paddle.static.gradients([grad1], [x]) - grad3, = paddle.static.gradients([grad2], [x]) - - prim2orig(main.block(0)) - - feed = {x.name: np.array([2.]).astype('float32')} - fetch_list = [grad3.name] - result = [np.array([48.])] - - place = paddle.CPUPlace() - if paddle.device.is_compiled_with_cuda(): - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - exe.run(startup) - outs = exe.run(main, feed=feed, fetch_list=fetch_list) - np.allclose(outs, result) - disable_prim() - - def test_fourth_order(self): - enable_prim() - main = paddle.static.Program() - startup = paddle.static.Program() - with paddle.static.program_guard(main, startup): - x = paddle.static.data(name='x', shape=[1], dtype='float32') - x2 = paddle.multiply(x, x) - x3 = paddle.multiply(x2, x) - x4 = paddle.multiply(x3, x) - x5 = paddle.multiply(x4, x) - out = paddle.sqrt(x5 + x4) - - grad1, = paddle.static.gradients([out], [x]) - grad2, = paddle.static.gradients([grad1], [x]) - grad3, = paddle.static.gradients([grad2], [x]) - grad4, = paddle.static.gradients([grad3], [x]) - - prim2orig(main.block(0)) - - feed = { - x.name: np.array([2.]).astype('float32'), - } - fetch_list = [grad4.name] - # (3*(-5*x^2-16*x-16))/(16*(x+1)^3.5) - result = [np.array([-0.27263762711])] - - place = paddle.CPUPlace() - if paddle.device.is_compiled_with_cuda(): - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - exe.run(startup) - outs = exe.run(main, feed=feed, fetch_list=fetch_list) - np.allclose(outs, result) - disable_prim() - - class TestMinimize(unittest.TestCase): def model(self, x, w, bias, opt): diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py index 0137f4103fbb30c16607440486da7f5e861d2c99..dc52c5bc33b480401b7689ab4a2f7a89510e50b5 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py @@ -37,7 +37,7 @@ import utils ('input_gradients_not_none', paddle.matmul, (np.random.rand(3, 3), np.random.rand(3, 3)), (np.random.rand(3, 3), np.random.rand(3, 3)), 'float64'))) -class TestForwardGradients(unittest.TestCase): +class TestForwardGrad(unittest.TestCase): @classmethod def setUpClass(cls): @@ -55,7 +55,7 @@ class TestForwardGradients(unittest.TestCase): paddle.incubate.autograd.disable_prim() paddle.disable_static() - def test_forward_gradients(self): + def test_forward_grad(self): def expected(): paddle.incubate.autograd.disable_prim() @@ -64,7 +64,8 @@ class TestForwardGradients(unittest.TestCase): with paddle.static.program_guard(mp, sp): feed, static_xs, static_v = utils.gen_static_data_and_feed( self.xs, self.v, stop_gradient=False) - _, ys_grad = paddle.autograd.jvp(self.fun, static_xs, static_v) + _, ys_grad = 
paddle.incubate.autograd.jvp( + self.fun, static_xs, static_v) exe = paddle.static.Executor() exe.run(sp) out = exe.run(mp, feed=feed, fetch_list=ys_grad) @@ -80,7 +81,8 @@ class TestForwardGradients(unittest.TestCase): self.xs, self.v, stop_gradient=False) ys = self.fun(*static_xs) if isinstance( static_xs, typing.Sequence) else self.fun(static_xs) - ys_grad = primapi.forward_gradients(ys, static_xs, static_v) + ys_grad = paddle.incubate.autograd.forward_grad( + ys, static_xs, static_v) paddle.incubate.autograd.prim2orig(mp.block(0)) exe = paddle.static.Executor() exe.run(sp) @@ -106,7 +108,7 @@ class TestForwardGradients(unittest.TestCase): self.xs, self.v, stop_gradient=False) ys = self.fun(*static_xs) if isinstance( static_xs, typing.Sequence) else self.fun(static_xs) - ys_grad = primapi.forward_gradients(ys, static_xs, static_v) + ys_grad = primapi.forward_grad(ys, static_xs, static_v) paddle.incubate.autograd.prim2orig(mp.block(0)) exe = paddle.static.Executor() exe.run(sp) @@ -116,14 +118,125 @@ class TestForwardGradients(unittest.TestCase): def test_illegal_param(self): paddle.incubate.autograd.enable_prim() with self.assertRaises(TypeError): - primapi.forward_gradients(1, paddle.static.data('inputs', - shape=[1])) + primapi.forward_grad(1, paddle.static.data('inputs', shape=[1])) with self.assertRaises(TypeError): - primapi.forward_gradients(paddle.static.data('targets', shape=[1]), - 1) + primapi.forward_grad(paddle.static.data('targets', shape=[1]), 1) paddle.incubate.autograd.disable_prim() +class TestGrad(unittest.TestCase): + + def setUp(self): + paddle.enable_static() + paddle.incubate.autograd.enable_prim() + + def tearDown(self): + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + + def test_third_order(self): + paddle.incubate.autograd.enable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + x = paddle.static.data(name='x', shape=[1], dtype='float32') + x2 = paddle.multiply(x, x) + x3 = paddle.multiply(x2, x) + x4 = paddle.multiply(x3, x) + + grad1, = paddle.incubate.autograd.grad([x4], [x]) + grad2, = paddle.incubate.autograd.grad([grad1], [x]) + grad3, = paddle.incubate.autograd.grad([grad2], [x]) + + paddle.incubate.autograd.prim2orig(main.block(0)) + + feed = {x.name: np.array([2.]).astype('float32')} + fetch_list = [grad3.name] + result = [np.array([48.])] + + place = paddle.CPUPlace() + if paddle.device.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(startup) + outs = exe.run(main, feed=feed, fetch_list=fetch_list) + np.allclose(outs, result) + paddle.incubate.autograd.disable_prim() + + def test_fourth_order(self): + paddle.incubate.autograd.enable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + x = paddle.static.data(name='x', shape=[1], dtype='float32') + x2 = paddle.multiply(x, x) + x3 = paddle.multiply(x2, x) + x4 = paddle.multiply(x3, x) + x5 = paddle.multiply(x4, x) + out = paddle.sqrt(x5 + x4) + + grad1, = paddle.incubate.autograd.grad([out], [x]) + grad2, = paddle.incubate.autograd.grad([grad1], [x]) + grad3, = paddle.incubate.autograd.grad([grad2], [x]) + grad4, = paddle.incubate.autograd.grad([grad3], [x]) + + paddle.incubate.autograd.prim2orig(main.block(0)) + + feed = { + x.name: np.array([2.]).astype('float32'), + } + fetch_list = [grad4.name] + # (3*(-5*x^2-16*x-16))/(16*(x+1)^3.5) + result = [np.array([-0.27263762711])] + 
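The expected fourth-order value follows directly from the closed form quoted in the comment above; a quick standalone check at x = 2, independent of Paddle.

x = 2.0
# d^4/dx^4 of sqrt(x**5 + x**4), evaluated via the quoted closed form.
fourth_derivative = 3 * (-5 * x**2 - 16 * x - 16) / (16 * (x + 1)**3.5)
print(fourth_derivative)   # ~ -0.27263762711, matching the expected grad4 value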
+ place = paddle.CPUPlace() + if paddle.device.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(startup) + outs = exe.run(main, feed=feed, fetch_list=fetch_list) + np.allclose(outs, result) + paddle.incubate.autograd.disable_prim() + + def test_disable_prim(self): + + def actual(x: np.array): + paddle.incubate.autograd.disable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + var_x = paddle.static.data('x', shape=x.shape, dtype=x.dtype) + var_x.stop_gradient = False + y = paddle.tanh(var_x) + y_grad = paddle.incubate.autograd.grad(y, var_x) + y_second_grad = paddle.incubate.autograd.grad(y_grad, var_x) + exe = paddle.static.Executor() + exe.run(startup) + return exe.run(main, + feed={'x': x}, + fetch_list=[y_grad, y_second_grad]) + + def expect(x: np.array): + paddle.incubate.autograd.disable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + var_x = paddle.static.data('x', shape=x.shape, dtype=x.dtype) + var_x.stop_gradient = False + y = paddle.tanh(var_x) + y_grad = paddle.static.gradients(y, var_x) + y_second_grad = paddle.static.gradients(y_grad, var_x) + exe = paddle.static.Executor() + exe.run(startup) + return exe.run(main, + feed={'x': x}, + fetch_list=[y_grad, y_second_grad]) + + x = np.random.randn(100, 200) + for i, j in zip(actual(x), expect(x)): + np.testing.assert_allclose(i, j) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primops.py b/python/paddle/fluid/tests/unittests/autograd/test_primops.py index ccbd630bfd08469a8cf90f0bcebb43a931456225..f14664237f36f78826384ad5524e8d3d70b292bb 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_primops.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_primops.py @@ -21,7 +21,7 @@ from paddle.incubate.autograd.primops import (neg, set_value, add, sub, mul, concat, reduce, matmul, slice_select, slice_assign, gather, scatter_add, fill_const) -from paddle.incubate.autograd.primx import Transform, topo_path, orig2prim, prim2orig, _gradients +from paddle.incubate.autograd.primx import Transform, topo_path, orig2prim, prim2orig from paddle.incubate.autograd.utils import enable_prim, disable_prim, prim_enabled diff --git a/python/paddle/fluid/tests/unittests/autograd/utils.py b/python/paddle/fluid/tests/unittests/autograd/utils.py index 8a0e51f60f47bfa48e322f336852d248626b2836..6afd0ff392288b8859c29002e6a5fe9891605667 100644 --- a/python/paddle/fluid/tests/unittests/autograd/utils.py +++ b/python/paddle/fluid/tests/unittests/autograd/utils.py @@ -22,7 +22,7 @@ import contextlib import collections import numpy as np import paddle -from paddle.autograd.utils import as_tensors +from paddle.incubate.autograd.utils import as_tensors ########################################################## diff --git a/python/paddle/incubate/autograd/__init__.py b/python/paddle/incubate/autograd/__init__.py index 718bc018d9fe5ba31585d15d3cab90368c9f5158..c5ff3b18d4d4945af4dfb8a243f5ad32378b014a 100644 --- a/python/paddle/incubate/autograd/__init__.py +++ b/python/paddle/incubate/autograd/__init__.py @@ -11,11 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
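forward_grad, exported from paddle.incubate.autograd in the __init__ change below, is used in the tests above only in static graph mode with prim enabled, paired with prim2orig before execution. A compact sketch of that usage; paddle.tanh, the (2, 3) shape, and the all-ones default tangent (per the jvp docstring) are the assumed parts.

import numpy as np
import paddle

paddle.enable_static()
paddle.incubate.autograd.enable_prim()

main, startup = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main, startup):
    x = paddle.static.data('x', shape=[2, 3], dtype='float32')
    x.stop_gradient = False
    y = paddle.tanh(x)
    # Forward-mode gradient of y w.r.t. x; with no tangent given, all ones is used.
    y_dot = paddle.incubate.autograd.forward_grad(y, x)
    paddle.incubate.autograd.prim2orig(main.block(0))

exe = paddle.static.Executor()
exe.run(startup)
out = exe.run(main,
              feed={'x': np.random.rand(2, 3).astype('float32')},
              fetch_list=y_dot)

paddle.incubate.autograd.disable_prim()
paddle.disable_static()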
-from paddle.autograd.functional import Hessian, Jacobian, jvp, vjp +from .functional import Hessian, Jacobian, jvp, vjp +from .primapi import forward_grad, grad from .primx import prim2orig -from .utils import enable_prim, disable_prim, prim_enabled +from .utils import disable_prim, enable_prim, prim_enabled __all__ = [ # noqa - 'vjp', 'jvp', 'Jacobian', 'Hessian', 'prim2orig', 'enable_prim', - 'disable_prim', 'prim_enabled' + 'vjp', 'jvp', 'Jacobian', 'Hessian', 'enable_prim', 'disable_prim', + 'forward_grad', 'grad' ] diff --git a/python/paddle/incubate/autograd/functional.py b/python/paddle/incubate/autograd/functional.py new file mode 100644 index 0000000000000000000000000000000000000000..6c740005f82531ff778aa9604e66422ceffb8e65 --- /dev/null +++ b/python/paddle/incubate/autograd/functional.py @@ -0,0 +1,675 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import typing + +import paddle +from paddle.fluid import framework +from paddle.incubate.autograd import primapi, utils + + +def vjp(func, xs, v=None): + r"""Computes the Vector-Jacobian product, a functional form of + reverse mode automatic differentiation. + + Warning: + This API is in beta, the signatures could be changed in future version. + + Args: + func(Callable): A function that takes ``xs`` as inputs parameter and + returns a sequence of Tensors or a Tensor. + xs(Tensor|Sequence[Tensor]): Used as positional arguments to evaluate + ``func``. ``xs`` is accepted as one Tensor or a sequence of Tensors. + v(Tensor|Sequence[Tensor]|None, optional): The cotangent vector invovled + in the VJP computation. ``v`` matches the size and shape of + ``func`` 's output. Defaults to None, which is equivalent to all + ones the same size of ``func`` 's output. + + Returns: + output(tuple): + + - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` . + - vjp(Tensor|tuple[Tensor]): The vjp result. + + Examples: + + .. code-block:: python + + import paddle + + def func(x): + return paddle.matmul(x, x) + + x = paddle.ones(shape=[2, 2], dtype='float32') + _, vjp_result = paddle.incubate.autograd.vjp(func, x) + print(vjp_result) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[4., 4.], + # [4., 4.]]) + + v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) + _, vjp_result = paddle.incubate.autograd.vjp(func, x, v) + print(vjp_result) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[2., 1.], + # [1., 0.]]) + """ + _check_inputs(func, xs, v) + + # ``_seprate`` breaks the dependencies between ``xs`` and other + # variables. See more ``_seprate`` . 
+ if paddle.fluid._non_static_mode() or not utils.prim_enabled(): + xs, v = _separate(xs), _separate(v) + ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) + _check_v_shape(v, ys) + + return ys, _grad(ys, xs, v) + + +def jvp(func, xs, v=None): + r""" + Computes the Jacobian-Vector product for a function at the given + inputs and a vector in the tangent space induced by the inputs. + + Warning: + This API is in beta, the signatures could be changed in future version. + + Args: + func(Callable): The ``func`` takes as input a Tensor or a Sequence + of Tensors and returns a Tensor or a Sequence of Tensors. + xs(Tensor|Sequence[Tensor]): Used as positional arguments to + evaluate ``func``. The ``xs`` is accepted as one Tensor or a + Sequence of Tensors. + v(Tensor|Sequence[Tensor]|None, Optional): The tangent vector invovled + in the JVP computation. The ``v`` matches the size and shape of + ``xs`` . Default value is None and in this case is equivalent to + all ones the same size of ``xs`` . + + Returns: + output(tuple): + + - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` . + - jvp(Tensor|tuple[Tensor]): The jvp result. + + Examples: + + .. code-block:: python + + import paddle + + + def func(x): + return paddle.matmul(x, x) + + + x = paddle.ones(shape=[2, 2], dtype='float32') + _, jvp_result = paddle.incubate.autograd.jvp(func, x) + print(jvp_result) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[4., 4.], + # [4., 4.]]) + v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) + _, jvp_result = paddle.incubate.autograd.jvp(func, x, v) + print(jvp_result) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[2., 1.], + # [1., 0.]]) + + """ + _check_inputs(func, xs, v) + # ``_seprate`` breaks the dependencies between ``xs`` and other + # variables. See more ``_seprate`` . + if paddle.fluid._non_static_mode() or not utils.prim_enabled(): + xs, v = _separate(xs), _separate(v) + ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) + _check_v_shape(v, xs) + + if not paddle.fluid._non_static_mode() and utils.prim_enabled(): + return ys, primapi.forward_grad(ys, xs, v) + else: + return ys, _double_backward_trick(ys, xs, v) + + +def _double_backward_trick(ys, xs, v): + """Double backward trick for computing ``jvp`` by ``vjp`` + see details: https://j-towns.github.io/2017/06/12/A-new-trick.html + """ + # The value of ys_grad is not important, it can be any random value in + # theory, but it's required to set stop_gradient=False. + ys_grad = _zeros_like_with_grad(ys) + xs_grad = _grad(ys, xs, ys_grad) + return _grad(xs_grad, ys_grad, v) + + +def _zeros_like_with_grad(xs): + """Create a zero or zeros sequence Tensor like ``xs`` with a flag + ``stop_graident=False`` . + """ + if not isinstance(xs, typing.Sequence): + ys = paddle.zeros_like(xs) + ys.stop_gradient = False + else: + ys = [] + for x in xs: + y = paddle.zeros_like(x) + y.stop_gradient = False + ys.append(y) + return ys + + +class Jacobian(object): + r""" + Computes the Jacobian matrix of a given function. + + If the function has multiple inputs and multiple outputs, during internal + implementation, all input tensors are concatenated after being flatten, + the batch dimension is retained, and the output is subject to the same + processing rules. + + Once the Jacobian ``J`` is constructed, you can use a multidimensional index + to retrieve the submatrix of ``J``, as same as slicing a Tensor. 
The + submatrix is lazily evaluated along row axis, and will be cached once + evaluated. + + For examples, supposing ``is_batched=True``, you can retrieve the submatrix + by following methods: + + * J[:], retrieving the full matrix. + * J[:, :, j], retrieving the partial derivatives w.r.t. the j'th input + variable. + * J[:, i, :], retrieving the partial derivatives w.r.t. the i'th output + variable. + * J[:, i, j], retrieving the partial derivatives w.r.t. the i'th output + variable and the j'th input variable. + + Notes: + + Eclipsis index is not supported currently. + + Warning: + This API is in beta, the signatures could be changed in future version. + + Args: + + func (Callable): A python function that takes a Tensor or a sequence of + Tensors as inputs(the first dimension is batch size) and + returns a Tensor a sequence of Tensors. + xs (Tensor|Sequence[Tensor]): The input to the function ``func`` . + is_batched (bool): If true, the first axis is batch axis. Defaults to + False. + + Returns: + + Jacobian (Object): A python object retains the Jacobian matrix. + + Examples: + + .. code-block:: python + + import paddle + + + def func(x, y): + return paddle.matmul(x, y) + + + x = paddle.to_tensor([[1., 2.], [3., 4.]]) + J = paddle.incubate.autograd.Jacobian(func, [x, x]) + print(J[:, :]) + # Tensor(shape=[4, 8], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[1., 3., 0., 0., 1., 0., 2., 0.], + # [2., 4., 0., 0., 0., 1., 0., 2.], + # [0., 0., 1., 3., 3., 0., 4., 0.], + # [0., 0., 2., 4., 0., 3., 0., 4.]]) + + print(J[0, :]) + # Tensor(shape=[8], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [1., 3., 0., 0., 1., 0., 2., 0.]) + print(J[:, 0]) + # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [1., 2., 0., 0.]) + + """ + + def __init__(self, func, xs, is_batched=False): + if not is_batched: + self._jacobian = _JacobianNoBatch(func, xs) + else: + self._jacobian = _JacobianBatchFirst(func, xs) + + def __getitem__(self, indexes): + return self._jacobian[indexes] + + @property + def shape(self): + """The shape of flattened Jacobian matrix. + """ + return self._jacobian.shape + + +class Hessian(object): + """ + Computes the Hessian matrix with a given ``func`` with respect to ``xs`` . + + If the function has multiple inputs, during internal implementation, + all input tensors are concatenated after being flatten, the batch dimension + is retained. + + The Hessian submatrix is lazily evaluated, and can be retrieved with a + multidimensional indexes. See details ``Jacobian`` . + + Warning: + This API is in beta, the signatures could be changed in future version. + + Args: + func (Callable): A python function that takes a Tensor or a Tensor + sequence as inputs and returns a Tensor with shape + ``[batch_size, 1]`` with batch or ``[1]`` without batch. + xs (Tensor|Sequence(Tensor)): The input Tensor or Tensor sequence of + the function ``func``. + is_batched (bool): If true, the first axis is batch axis. Defaults to + False. + + Returns: + + Hessian (Object): A python object retains the Hessian matrix. + + + Examples: + + .. 
code-block:: python + + import paddle + + + def reducer(x): + return paddle.sum(x * x) + + + x = paddle.rand([2, 2]) + h = paddle.incubate.autograd.Hessian(reducer, x) + print(h[:]) + # Tensor(shape=[4, 4], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[2., 0., 0., 0.], + # [0., 2., 0., 0.], + # [0., 0., 2., 0.], + # [0., 0., 0., 2.]]) + """ + + def __init__(self, func, xs, is_batched=False): + + def _jac_func(*xs): + jac = Jacobian(func, xs, is_batched=is_batched) + if (is_batched and jac.shape[1] != 1) or (not is_batched + and jac.shape[0] != 1): + raise RuntimeError( + "The function given to Hessian shoud return as single element Tensor or batched single element Tensor." + ) + return jac[:, 0, :] if is_batched else jac[0, :] + + self.symbolic = Jacobian(_jac_func, xs, is_batched=is_batched) + + def __getitem__(self, indexes): + return self.symbolic[indexes] + + @property + def shape(self): + """The shape of flattened Hessian matrix. + """ + return self.symbolic.shape + + +class _Jacobian(object): + """The base class for computing Jacobian matrix. + + ``_Jacobian`` implementes the core logic of multidimensional index and lazy + evaluation for Jacobian matrix, subclass only need to overwrite following + methods: + + * ``_lazy_axis()``, return the axis along which will be lazy + evaluating. + * ``_flatten(xs)``, flattens the inputs ``xs``. + * ``_evaluate(index)``, evaluates one slice along ``_lazy_axis`` . + + Notes: + + Because currently PaddlePaddle only support reverse differentiation by + ``paddle.grad``, so lazy evaluation is only supported along the row of + Jacobian matrix, which means that slicing along row will get better + performance. + + """ + + def __init__(self, func, xs): + # Skip separating in prim mode temporarily, as detach and clone are not + # primitive operators. + if not paddle.fluid._non_static_mode() and utils.prim_enabled(): + self._xs = xs + else: + self._xs = _separate(xs) + self._ys = func(*utils.as_tensors(self._xs)) + self._flatten_xs = self._flatten(utils.as_tensors(self._xs)) + self._flatten_ys = self._flatten(utils.as_tensors(self._ys)) + self._cache = {} + + @property + def shape(self): + raise NotImplementedError + + @property + def _lazy_axis(self): + """"The axis of lazily evaluated.""" + raise NotImplementedError + + def _lazy_indexes(self, indexes): + idx = indexes[self._lazy_axis] + return (idx, ) if isinstance(idx, int) else tuple( + range(idx.start, idx.stop, idx.step)) + + def _flatten(self, xs): + raise NotImplementedError + + def _shifted_indexes(self, indexes, lazy_axis_size=0): + idx = indexes[self._lazy_axis] + shifted_lazy_axis_idx = 0 if isinstance(idx, int) else slice( + 0, lazy_axis_size, 1) + return indexes[:self._lazy_axis] + ( + shifted_lazy_axis_idx, ) + indexes[self._lazy_axis + 1:] + + def __getitem__(self, indexes): + indexes = _multi_index(indexes, self.shape) + + if isinstance(indexes[self._lazy_axis], int): + other_indexes = indexes[:self._lazy_axis] + \ + indexes[self._lazy_axis+1:] + return self._cached_evaluate( + indexes[self._lazy_axis])[other_indexes] + lazy_indexes = self._lazy_indexes(indexes) + # Using concat and reshape to replace stack operator temporarily, as + # it is not a primitive operator. 
+ shape = list(self.shape) + shape[self._lazy_axis] = len(lazy_indexes) + part_jac = paddle.concat( + [self._cached_evaluate(i) for i in lazy_indexes], + axis=self._lazy_axis).reshape(shape) + return part_jac[self._shifted_indexes(indexes, len(lazy_indexes))] + + def _cached_evaluate(self, k): + v = self._cache.get(k) + if v is None: + v = self._evaluate(k) + self._cache[k] = v + return v + + def _evaluate(self, index): + """Evaluate one slice along the lazy axis.""" + raise NotImplementedError + + +class _JacobianNoBatch(_Jacobian): + """Compute the Jacobian matrix without a batch dimension. + Suppose the mapping is :math:`f: R^M \to R^N`, the output shape is + ``(N, M)`` . + """ + + def __init__(self, func, xs): + super(_JacobianNoBatch, self).__init__(func, xs) + + @property + def shape(self): + return (self._flatten_ys.shape[0], self._flatten_xs.shape[0]) + + @property + def _lazy_axis(self): + return 0 + + def _flatten(self, xs): + return paddle.concat(tuple(x.reshape((-1, )) for x in xs)) + + def _evaluate(self, row_index): + return self._flatten(_grad( + self._flatten_ys[row_index], + self._xs, + )) + + +class _JacobianBatchFirst(_Jacobian): + """Compute the Jacobian matrix with the batch at the first axis. + Suppose the mapping is :math:`f: R^{B,M} \to R^{B,N}`, the output shape is + ``(B, N, M)`` . + """ + + def __init__(self, func, xs): + super(_JacobianBatchFirst, self).__init__(func, xs) + + @property + def shape(self): + return (self._flatten_xs.shape[0], self._flatten_ys.shape[1], + self._flatten_xs.shape[1]) + + @property + def _lazy_axis(self): + return 1 + + def _flatten(self, xs): + return paddle.concat( + tuple(x.reshape((x.shape[0], -1)) for x in utils.as_tensors(xs)), 1) + + def _evaluate(self, row_index): + return self._flatten(_grad(self._flatten_ys[:, row_index], self._xs)) + + +def _multi_index(indexes, shape): + """A tool for parsing an N-dimensional index into a standard format. + + Currently the following input format is supported: + * ([positive|negative|slice], ...), the right-most elements can be + omitted. + + The standard format after conversion is a tuple which contains N elements: + * ([positive|slice], ..., [positive|slice]) + + Notes: + Ellipsis indexes such as ``(..., i), (i, ...)`` are not supported. + + Args: + indexes (tuple): The input indexes. + shape (tuple): The input shape. + + Returns: + tuple: The index in the standard format described above. + """ + indexes = indexes if isinstance(indexes, typing.Sequence) else (indexes, ) + if any(isinstance(i, type(Ellipsis)) for i in indexes): + raise IndexError('Ellipsis index currently is not supported.') + # Fill the right-most elements. + indexes = indexes + (slice(0, None, None), ) * (len(shape) - len(indexes)) + # Convert to positive index. + positive_indexes = [] + for i, index in enumerate(indexes): + if isinstance(index, slice): + index = slice(index.start or 0, index.stop or shape[i], index.step + or 1) + positive_indexes.append( + slice( + index.start + shape[i] if index.start < 0 else index.start, + index.stop + shape[i] if index.stop < 0 else index.stop, + # Negative step means index backward, no need to convert to + # positive integer. 
+ index.step)) + elif isinstance(index, int): + positive_indexes.append(index + shape[i] if index < 0 else index) + else: + raise TypeError(f'Unsupported index type {index}.') + return tuple(positive_indexes) + + +def _replace_none_with_zero_tensor(xs, refs): + if xs is None: + xs = paddle.zeros_like(refs) + xs.stop_gradient = refs.stop_gradient + return xs + elif isinstance(xs, typing.Sequence): + return tuple( + _replace_none_with_zero_tensor(x, refs[i]) + for i, x in enumerate(xs)) + else: + return xs + + +def _grad(ys, xs, v=None): + """A gradient function that can be used in both dynamic graph and static graph. + + ``_grad`` combines ``paddle.grad`` used in dynamic graph and + ``paddle.static.gradients`` used in static graph, and makes the following changes: + + * The ``allow_unused`` flag is removed and set to True internally; + None in the outputs will be replaced by a zero tensor. + * The ``create_graph`` flag is removed and set to True internally; + it only makes sense in dynamic graph. + * When xs is a single Tensor, ``paddle.grad`` returns a list which only + contains one Tensor. This may confuse users, so in this case + ``_grad`` returns a single Tensor instead. + + Args: + ys (Tensor|Sequence[Tensor]): The output tensor or tensor sequence of + the graph to compute gradients. + xs (Tensor|Sequence[Tensor]): The input tensor or tensor sequence of the graph to + compute gradients. The returned values of this API are the + gradients of inputs. + v (Tensor|Sequence[Tensor]|None, optional): The initial gradient values + of outputs. If ``v`` is None, the initial gradient values of + outputs would be Tensors filled with 1; if ``v`` is not None, + it must have the same length as outputs, and in this case, the + initial gradient value of the i-th output would be: (1) a Tensor + filled with 1 when the i-th element of ``v`` is None; + (2) the i-th element of ``v`` when the i-th element of + ``v`` is a Tensor. Default None. + + Returns: + Tensor|tuple[Tensor]: Tensor or a tuple of Tensors, whose length is the + same as the Tensor number inside inputs, and the i-th returned + Tensor is the sum of gradients of outputs with respect to the i-th + inputs. + """ + if paddle.fluid._non_static_mode(): + xs_grad = paddle.grad(ys, xs, v, create_graph=True, allow_unused=True) + else: + xs_grad = paddle.incubate.autograd.grad(ys, xs, v) + + if isinstance(xs, paddle.fluid.framework.Variable): + xs_grad = xs_grad[0] + + return _replace_none_with_zero_tensor(xs_grad, xs) + + +def _separate(xs): + """ + ``_separate`` separates ``xs`` from the computation graph through ``clone`` + or ``detach`` . + + Internally, ``paddle.grad(ys, xs)`` is a stateful API implemented on top of the + computational graph, which will reduce gradients along all paths from ys to xs. + + However, functional autograd APIs such as ``vjp`` and ``jvp`` are stateless, and + only compute gradients for a given ``func`` . + + For example, given a ``func`` :math:`y0=f(x0)`, suppose the forward paths are + ``x0 -> y0`` and ``x0 -> x1 -> y0`` . + ``paddle.grad(y0, x0)`` will reduce gradients along ``y0->x0`` and + ``y0->x1->x0``, while ``vjp`` only needs to reduce along ``y0->x0``. + + So ``xs`` needs to be cloned or detached to break the dependencies on + other variables. + + Examples: + + .. 
code-block:: python + + import paddle + from paddle.autograd.functional import _separate + + + def func(x, y): + return x * y + + + x = paddle.ones((1,)) + x.stop_gradient = False + + y = func(x, x) + print(paddle.grad(y, x)) + # [Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [2.])] + + x1, x2 = _separate((x, x)) + y = func(x1, x2) + print(paddle.grad(y, x1)) + # [Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [1.])] + + """ + if isinstance(xs, typing.Sequence): + return tuple(_single_separate(x) for x in xs) + else: + return _single_separate(xs) + + +def _single_separate(x): + if x is None: # x may be None because the grad input v defaults to None. + return x + if not x.stop_gradient: + return paddle.clone(x) + else: # use detach to share memory when gradients are not needed. + x = x.detach() + x.stop_gradient = False + return x + return x + + +def _check_inputs(func, xs, v=None): + if not callable(func): + raise TypeError(f"Expected 'func' to be Callable, but got {type(func)}.") + + if not isinstance(xs, (framework.Variable, typing.Sequence)): + raise TypeError(f"Expected 'xs' to be a Tensor|Sequence[Tensor], " + f"but got {type(xs)}.") + if isinstance(xs, typing.Sequence) and not all( + isinstance(x, framework.Variable) for x in xs): + raise TypeError("All elements of 'xs' should be Tensor.") + + if not isinstance(v, (framework.Variable, typing.Sequence, type(None))): + raise TypeError( + f"Expected 'v' to be Tensor|Sequence[Tensor]|None, but got {type(v)}.") + + if isinstance(v, typing.Sequence) and not all( + isinstance(e, framework.Variable) for e in v): + raise TypeError("All elements of 'v' should be Tensor.") + + +def _check_v_shape(v, refs): + if v is None: + return + + v, refs = utils.as_tensors(v), utils.as_tensors(refs) + if len(refs) != len(v): + raise RuntimeError(f"The argument v is a tuple of invalid length: " + f"should be {len(refs)} but got {len(v)}.") + + for index, (element_v, element_ref) in enumerate(zip(v, refs)): + if element_v.shape != element_ref.shape: + raise RuntimeError( + f"The v[{index}] has invalid shape: should " + f"be {element_ref.shape} but got {element_v.shape}.") diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py index 75a70b09731f2db652b214fc61eaea0117064d9d..5b3ad0dd78a3be5215e0aa47b924b5382f1d055a 100644 --- a/python/paddle/incubate/autograd/primapi.py +++ b/python/paddle/incubate/autograd/primapi.py @@ -14,28 +14,26 @@ import typing -import paddle.autograd.utils as tensor_utils -import paddle.incubate.autograd.utils as prim_utils -from paddle.fluid import framework -from paddle.incubate.autograd import primx +from paddle.fluid import backward, framework +from paddle.incubate.autograd import primx, utils @framework.static_only -def forward_gradients(targets, inputs, input_gradients=None): +def forward_grad(outputs, inputs, grad_inputs=None): """Forward mode of automatic differentiation. .. note:: **ONLY available in the static mode and primitive operators.** Args: - targets: The target tensor or tensors + outputs: The output tensor or tensors inputs: The input tensor or tensors - input_gradients: The gradient Tensor or Tensors of inputs which has + grad_inputs: The gradient Tensor or Tensors of inputs which has the same shape with inputs, Defaults to None, in this case is equivalent to all ones . Returns: - target_gradients (Tensor|Sequence[Tensor]): The gradients for targets. + grad_outputs (Tensor|Sequence[Tensor]): The gradients for outputs. 
Examples: @@ -53,7 +51,7 @@ def forward_gradients(targets, inputs, input_gradients=None): with paddle.static.program_guard(main_program, startup_program): x = paddle.static.data('x', shape=[1], dtype='float32') y = x * x - y_grad = paddle.incubate.autograd.forward_gradients(y, x) + y_grad = paddle.incubate.autograd.forward_grad(y, x) paddle.incubate.autograd.prim2orig() exe = paddle.static.Executor() @@ -65,20 +63,20 @@ def forward_gradients(targets, inputs, input_gradients=None): paddle.incubate.autograd.disable_prim() paddle.disable_static() """ - if not prim_utils.prim_enabled(): - raise RuntimeError('forward_gradients must be running on primitive' + if not utils.prim_enabled(): + raise RuntimeError('forward_grad must be running on primitive ' 'operators, use enable_prim to turn it on.') - if not isinstance(targets, (framework.Variable, typing.Sequence)): - raise TypeError(f'Expected targets is Tensor|Sequence[Tesnor], ' - f'but got {type(targets)}.') + if not isinstance(outputs, (framework.Variable, typing.Sequence)): + raise TypeError(f'Expected outputs is Tensor|Sequence[Tensor], ' + f'but got {type(outputs)}.') if not isinstance(inputs, (framework.Variable, typing.Sequence)): raise TypeError(f'Expected inputs is Tensor|Sequence[Tesnor], ' f'but got {type(inputs)}.') - ys, xs, xs_dot = tensor_utils.as_tensors(targets), tensor_utils.as_tensors( - inputs), tensor_utils.as_tensors(input_gradients) + ys, xs, xs_dot = utils.as_tensors(outputs), utils.as_tensors( + inputs), utils.as_tensors(grad_inputs) block = framework.default_main_program().current_block() if any(x.block != block for x in xs + ys): @@ -90,4 +88,95 @@ def forward_gradients(targets, inputs, input_gradients=None): ad = primx.Transform(ys[0].block) _, ys_dot = ad.linearize(xs, ys, xs_dot) - return ys_dot[0] if isinstance(targets, framework.Variable) else ys_dot + return ys_dot[0] if isinstance(outputs, framework.Variable) else ys_dot + + +@framework.static_only +def grad(outputs, inputs, grad_outputs=None): + """Reverse mode of automatic differentiation. + + .. note:: + **ONLY available in the static mode and primitive operators** + + Args: + outputs (Tensor|Sequence[Tensor]): The output Tensor or Tensors. + inputs (Tensor|Sequence[Tensor]): The input Tensor or Tensors. + grad_outputs (Tensor|Sequence[Tensor]): The gradient Tensor or + Tensors of outputs which have the same shape as outputs. Defaults + to None, in which case it is equivalent to all ones. + + Returns: + grad_inputs (Tensor|Sequence[Tensor]): The gradients for inputs. + + Examples: + + .. 
code-block:: python + + import numpy as np + import paddle + paddle.enable_static() + paddle.incubate.autograd.enable_prim() + startup_program = paddle.static.Program() + main_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + x = paddle.static.data('x', shape=[1], dtype='float32') + x.stop_gradient = False + y = x * x + x_grad = paddle.incubate.autograd.grad(y, x) + paddle.incubate.autograd.prim2orig() + exe = paddle.static.Executor() + exe.run(startup_program) + x_grad = exe.run(main_program, feed={'x': np.array([2.]).astype('float32')}, fetch_list=[x_grad]) + print(x_grad) + # [array([4.], dtype=float32)] + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + """ + + if not utils.prim_enabled(): + return backward.gradients(outputs, inputs, grad_outputs) + + if not isinstance(outputs, (framework.Variable, typing.Sequence)): + raise TypeError(f'Expected outputs is Tensor|Sequence[Tensor], ' + f'but got {type(outputs)}.') + + if not isinstance(inputs, (framework.Variable, typing.Sequence)): + raise TypeError(f'Expected inputs is Tensor|Sequence[Tensor], ' + f'but got {type(inputs)}.') + + ys, xs, ys_bar = utils.as_tensors(outputs), utils.as_tensors( inputs), utils.as_tensors(grad_outputs) + block = framework.default_main_program().current_block() + if any((x is not None and x.block != block) for x in xs + ys): + raise RuntimeError( + 'Variable in inputs and outputs should be None or in current block of main program' + ) + + # TODO(Tongxin) without any prior knowledge about whether the program + # is completely lowered to primitive ops, it's mandatory to run the lowering + # pass once and again. This is obviously inefficient and needs to be + # optimized. + primx.orig2prim(block) + ad = primx.Transform(block) + xs_dot, ys_dot = ad.linearize(xs, ys) + if any(var is None for var in ys_dot): + raise RuntimeError( + 'Grads cannot be computed. The given outputs do not depend on inputs' + ) + ys_bar, xs_bar = ad.transpose(ys_dot, xs_dot, ys_bar) + + # remove xs_dot and their constructor ops + op_indexes = [] + for var in xs_dot: + if var is not None: + op_index = block.ops.index(var.op) + if op_index < 0: + raise ValueError( + f'op_index should be greater than or equal to 0, but op_index={op_index}.' 
+ ) + op_indexes.append(op_index) + + ad.erase_ops(sorted(op_indexes)) + ad.erase_dots(xs_dot) + + return xs_bar[0] if isinstance(inputs, framework.Variable) else xs_bar diff --git a/python/paddle/incubate/autograd/primops.py b/python/paddle/incubate/autograd/primops.py index 6017ac35989204ff823b3652e8e1bd56420ad852..b9a3ac459961a322064638f01d5157e5b3ec1669 100644 --- a/python/paddle/incubate/autograd/primops.py +++ b/python/paddle/incubate/autograd/primops.py @@ -14,6 +14,7 @@ import paddle from paddle.fluid.layer_helper import LayerHelper + from .primreg import REGISTER_FN diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py index d5037dcf64994bca69ec452a217e983a2e10d526..260a97cdc16a43a1b2f230ad00a8b198e80da9db 100644 --- a/python/paddle/incubate/autograd/primx.py +++ b/python/paddle/incubate/autograd/primx.py @@ -22,7 +22,7 @@ from .primreg import op_position_inputs, op_position_output, lookup_orig2prim, l from .primrules import _orig2prim, _prim2orig, _jvp, _transpose from .utils import get_input_var_list, get_output_var_list, flatten, flatten_and_remove_none from collections import OrderedDict -from paddle.autograd.utils import as_tensors +from paddle.incubate.autograd.utils import as_tensors def topo_path(xs, ys, block=None): @@ -577,47 +577,3 @@ def prim2orig(block=None): assert block == default_main_program().current_block( ), f'block is neither None nor current block of main program' _lower(block, reverse=True) - - -def _gradients(ys, xs, ys_bar=None): - """ A drop-in replacement of paddle.gradients but instead computing - on primitive ops. - - Args: - ys: the target tensor or tensors - xs: the input tensor or tensors - ys_bar: the optional gradient tensors of `ys` - - Returns: - xs_bar: a list gradients of input `xs` - """ - - ys, xs, ys_bar = as_tensors(ys), as_tensors(xs), as_tensors(ys_bar) - block = default_main_program().current_block() - for el in xs + ys: - assert el is None or el.block == block, f'variable in xs and ys should be None or in current block of main program' - # TODO(Tongxin) without any prior knowledge about whether the program - # is completely lowered to primitive ops, it's mandatory to run the lowering - # pass once and again. This is obviously inefficient and needs to be - # optimized. - orig2prim(block) - - ad = Transform(block) - - xs_dot, ys_dot = ad.linearize(xs, ys) - if any(var is None for var in ys_dot): - assert False, f'Gradients cannot be computed. The given output `ys` does not depend on input `xs`.' - ys_bar, xs_bar = ad.transpose(ys_dot, xs_dot, ys_bar) - # remove xs_dot and their constructor ops - - op_indexes = [] - for var in xs_dot: - if var is not None: - op_index = block.ops.index(var.op) - assert op_index >= 0, f'op_index should be greater than or equal to 0, but op_index={op_index}.' - op_indexes.append(op_index) - - ad.erase_ops(sorted(op_indexes)) - ad.erase_dots(xs_dot) - - return xs_bar diff --git a/python/paddle/incubate/autograd/utils.py b/python/paddle/incubate/autograd/utils.py index 9d6a8c4f6a36dc325065d6bb1a581b00810c4bb5..96faf7f7440ca59ed3227d6fd7d778678db4b0c8 100644 --- a/python/paddle/incubate/autograd/utils.py +++ b/python/paddle/incubate/autograd/utils.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import typing import paddle from paddle.fluid import framework as framework @@ -170,3 +171,12 @@ def flatten(inp): def flatten_and_remove_none(inp): flattened = flatten(inp) return [var for var in flattened if var is not None] + + +def as_tensors(xs): + if isinstance(xs, framework.Variable): + return (xs, ) + elif isinstance(xs, typing.Sequence): + return tuple(xs) + else: + return xs
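For reference, the relocated ``as_tensors`` helper above only normalizes its argument: a single ``framework.Variable`` becomes a 1-tuple, any other sequence becomes a plain tuple, and everything else (typically ``None``) passes through unchanged. Below is a minimal sketch of that contract, not part of the patch itself; it assumes a hypothetical ``FakeVariable`` stand-in for ``framework.Variable`` so it can run without a Paddle build.

.. code-block:: python

    import typing


    class FakeVariable:
        """Hypothetical stand-in for paddle.fluid.framework.Variable (illustration only)."""
        pass


    def as_tensors_sketch(xs):
        # Mirrors the normalization contract of as_tensors:
        # single tensor -> 1-tuple, sequence -> tuple, anything else (e.g. None) -> unchanged.
        if isinstance(xs, FakeVariable):
            return (xs, )
        elif isinstance(xs, typing.Sequence):
            return tuple(xs)
        else:
            return xs


    x = FakeVariable()
    assert as_tensors_sketch(x) == (x, )
    assert as_tensors_sketch([x, x]) == (x, x)
    assert as_tensors_sketch(None) is None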