From e7928a067d44bf68a433052d7177600a108eccdf Mon Sep 17 00:00:00 2001
From: Sing_chan <51314274+betterpig@users.noreply.github.com>
Date: Thu, 31 Mar 2022 14:23:36 +0800
Subject: [PATCH] [New API]: miminize_bfgs and miminize_lbfgs (#40710)

* [New API]: miminize_bfgs and miminize_lbfgs
* modify for python module call correctly
* add functional package, add error raise in static_graph, change assign to set_value
* unify static_graph and dygraph, fix bug when x or H0 is float64
* now only accept input is tensor, put check args in utils.py, put exception test together
* temp
* add more detailed algorithm illustration and comment, reduce test case to limit test time in 15s
* change in_dygraph_mode to in_dynamic_mode
* fix bug of sample code; reduce test case to reduce test time
* change dir to incubate
---
 .../paddle/fluid/tests/unittests/test_bfgs.py | 165 ++++++++++
 .../fluid/tests/unittests/test_lbfgs.py       | 164 ++++++++++
 python/paddle/incubate/optimizer/__init__.py  |   1 +
 .../incubate/optimizer/functional/__init__.py |  18 ++
 .../incubate/optimizer/functional/bfgs.py     | 195 ++++++++++++
 .../incubate/optimizer/functional/lbfgs.py    | 239 ++++++++++++++
 .../optimizer/functional/line_search.py       | 297 ++++++++++++++++++
 .../incubate/optimizer/functional/utils.py    |  96 ++++++
 python/setup.py.in                            |   1 +
 9 files changed, 1176 insertions(+)
 create mode 100644 python/paddle/fluid/tests/unittests/test_bfgs.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_lbfgs.py
 create mode 100644 python/paddle/incubate/optimizer/functional/__init__.py
 create mode 100644 python/paddle/incubate/optimizer/functional/bfgs.py
 create mode 100644 python/paddle/incubate/optimizer/functional/lbfgs.py
 create mode 100644 python/paddle/incubate/optimizer/functional/line_search.py
 create mode 100644 python/paddle/incubate/optimizer/functional/utils.py

diff --git a/python/paddle/fluid/tests/unittests/test_bfgs.py b/python/paddle/fluid/tests/unittests/test_bfgs.py
new file mode 100644
index 0000000000..c89f7205f0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bfgs.py
@@ -0,0 +1,165 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F + +from paddle.incubate.optimizer.functional.bfgs import minimize_bfgs + +np.random.seed(123) + + +def test_static_graph(func, x0, line_search_fn='strong_wolfe', dtype='float32'): + dimension = x0.shape[0] + paddle.enable_static() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + X = paddle.static.data(name='x', shape=[dimension], dtype=dtype) + Y = minimize_bfgs(func, X, line_search_fn=line_search_fn, dtype=dtype) + + exe = paddle.static.Executor() + exe.run(startup) + return exe.run(main, feed={'x': x0}, fetch_list=[Y]) + + +def test_static_graph_H0(func, x0, H0, dtype='float32'): + paddle.enable_static() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + X = paddle.static.data(name='x', shape=[x0.shape[0]], dtype=dtype) + H = paddle.static.data( + name='h', shape=[H0.shape[0], H0.shape[1]], dtype=dtype) + Y = minimize_bfgs( + func, X, initial_inverse_hessian_estimate=H, dtype=dtype) + + exe = paddle.static.Executor() + exe.run(startup) + return exe.run(main, feed={'x': x0, 'h': H0}, fetch_list=[Y]) + + +def test_dynamic_graph(func, + x0, + H0=None, + line_search_fn='strong_wolfe', + dtype='float32'): + paddle.disable_static() + x0 = paddle.to_tensor(x0) + if H0 is not None: + H0 = paddle.to_tensor(H0) + return minimize_bfgs( + func, + x0, + initial_inverse_hessian_estimate=H0, + line_search_fn=line_search_fn, + dtype=dtype) + + +class TestBfgs(unittest.TestCase): + def test_quadratic_nd(self): + for dimension in [1, 10]: + minimum = np.random.random(size=[dimension]).astype('float32') + scale = np.exp(np.random.random(size=[dimension]).astype('float32')) + + def func(x): + minimum_ = paddle.assign(minimum) + scale_ = paddle.assign(scale) + return paddle.sum( + paddle.multiply(scale_, (F.square_error_cost(x, minimum_)))) + + x0 = np.random.random(size=[dimension]).astype('float32') + results = test_static_graph(func=func, x0=x0) + self.assertTrue(np.allclose(minimum, results[2])) + + results = test_dynamic_graph(func=func, x0=x0) + self.assertTrue(np.allclose(minimum, results[2].numpy())) + + def test_inf_minima(self): + extream_point = np.array([-1, 2]).astype('float32') + + def func(x): + # df = 3(x - 1.01)(x - 0.99) + # f = x^3 - 3x^2 + 3*1.01*0.99x + return x * x * x / 3.0 - ( + extream_point[0] + extream_point[1] + ) * x * x / 2 + extream_point[0] * extream_point[1] * x + + x0 = np.array([-1.7]).astype('float32') + results = test_static_graph(func, x0) + self.assertFalse(results[0][0]) + + def test_multi_minima(self): + def func(x): + # df = 12(x + 1.1)(x - 0.2)(x - 0.8) + # f = 3*x^4+0.4*x^3-5.46*x^2+2.112*x + # minimum = -1.1 or 0.8. + # All these minima may be reached from appropriate starting points. + return 3 * x**4 + 0.4 * x**3 - 5.64 * x**2 + 2.112 * x + + x0 = np.array([0.82], dtype='float64') + + results = test_static_graph(func, x0, dtype='float64') + self.assertTrue(np.allclose(0.8, results[2])) + + def test_rosenbrock(self): + # The Rosenbrock function is a standard optimization test case. 
+ a = np.random.random(size=[1]).astype('float32') + minimum = [a.item(), (a**2).item()] + b = np.random.random(size=[1]).astype('float32') + + def func(position): + # f(x, y) = (a - x)^2 + b (y - x^2)^2 + # minimum = (a, a^2) + x, y = position[0], position[1] + c = (a - x)**2 + b * (y - x**2)**2 + # the return cant be np array[1], or in jacobin will cause flat error + return c[0] + + x0 = np.random.random(size=[2]).astype('float32') + + results = test_dynamic_graph(func, x0) + self.assertTrue(np.allclose(minimum, results[2])) + + def test_exception(self): + def func(x): + return paddle.dot(x, x) + + x0 = np.random.random(size=[2]).astype('float32') + H0 = np.array([[2.0, 0.0], [0.0, 0.9]]).astype('float32') + + # test initial_inverse_hessian_estimate is good + results = test_static_graph_H0(func, x0, H0, dtype='float32') + self.assertTrue(np.allclose([0., 0.], results[2])) + self.assertTrue(results[0][0]) + + # test initial_inverse_hessian_estimate is bad + H1 = np.array([[1.0, 2.0], [2.0, 1.0]]).astype('float32') + self.assertRaises(ValueError, test_dynamic_graph, func, x0, H0=H1) + + # test line_search_fn is bad + self.assertRaises( + NotImplementedError, + test_static_graph, + func, + x0, + line_search_fn='other') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lbfgs.py b/python/paddle/fluid/tests/unittests/test_lbfgs.py new file mode 100644 index 0000000000..bb38187476 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lbfgs.py @@ -0,0 +1,164 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F + +from paddle.incubate.optimizer.functional.lbfgs import minimize_lbfgs + +np.random.seed(123) + + +def test_static_graph(func, x0, line_search_fn='strong_wolfe', dtype='float32'): + dimension = x0.shape[0] + paddle.enable_static() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + X = paddle.static.data(name='x', shape=[dimension], dtype=dtype) + Y = minimize_lbfgs(func, X, line_search_fn=line_search_fn, dtype=dtype) + + exe = paddle.static.Executor() + exe.run(startup) + return exe.run(main, feed={'x': x0}, fetch_list=[Y]) + + +def test_static_graph_H0(func, x0, H0, dtype='float32'): + paddle.enable_static() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + X = paddle.static.data(name='x', shape=[x0.shape[0]], dtype=dtype) + H = paddle.static.data( + name='h', shape=[H0.shape[0], H0.shape[1]], dtype=dtype) + Y = minimize_lbfgs( + func, X, initial_inverse_hessian_estimate=H, dtype=dtype) + + exe = paddle.static.Executor() + exe.run(startup) + return exe.run(main, feed={'x': x0, 'h': H0}, fetch_list=[Y]) + + +def test_dynamic_graph(func, + x0, + H0=None, + line_search_fn='strong_wolfe', + dtype='float32'): + paddle.disable_static() + x0 = paddle.to_tensor(x0) + if H0 is not None: + H0 = paddle.to_tensor(H0) + return minimize_lbfgs( + func, + x0, + initial_inverse_hessian_estimate=H0, + line_search_fn=line_search_fn, + dtype=dtype) + + +class TestLbfgs(unittest.TestCase): + def test_quadratic_nd(self): + for dimension in [1, 10]: + minimum = np.random.random(size=[dimension]).astype('float32') + scale = np.exp(np.random.random(size=[dimension]).astype('float32')) + + def func(x): + minimum_ = paddle.assign(minimum) + scale_ = paddle.assign(scale) + return paddle.sum( + paddle.multiply(scale_, (F.square_error_cost(x, minimum_)))) + + x0 = np.random.random(size=[dimension]).astype('float32') + results = test_static_graph(func, x0) + self.assertTrue(np.allclose(minimum, results[2])) + + results = test_dynamic_graph(func, x0) + self.assertTrue(np.allclose(minimum, results[2].numpy())) + + def test_inf_minima(self): + extream_point = np.array([-1, 2]).astype('float32') + + def func(x): + # df = 3(x - 1.01)(x - 0.99) + # f = x^3 - 3x^2 + 3*1.01*0.99x + return x * x * x / 3.0 - ( + extream_point[0] + extream_point[1] + ) * x * x / 2 + extream_point[0] * extream_point[1] * x + + x0 = np.array([-1.7]).astype('float32') + results = test_static_graph(func, x0) + self.assertFalse(results[0][0]) + + def test_multi_minima(self): + def func(x): + # df = 12(x + 1.1)(x - 0.2)(x - 0.8) + # f = 3*x^4+0.4*x^3-5.46*x^2+2.112*x + # minimum = -1.1 or 0.8. + # All these minima may be reached from appropriate starting points. + return 3 * x**4 + 0.4 * x**3 - 5.64 * x**2 + 2.112 * x + + x0 = np.array([0.82], dtype='float64') + + results = test_static_graph(func, x0, dtype='float64') + self.assertTrue(np.allclose(0.8, results[2])) + + def test_rosenbrock(self): + # The Rosenbrock function is a standard optimization test case. 
+ a = np.random.random(size=[1]).astype('float32') + minimum = [a.item(), (a**2).item()] + b = np.random.random(size=[1]).astype('float32') + + def func(position): + # f(x, y) = (a - x)^2 + b (y - x^2)^2 + # minimum = (a, a^2) + x, y = position[0], position[1] + c = (a - x)**2 + b * (y - x**2)**2 + # the return cant be np array[1], or in jacobin will cause flat error + return c[0] + + x0 = np.random.random(size=[2]).astype('float32') + + results = test_dynamic_graph(func, x0) + self.assertTrue(np.allclose(minimum, results[2])) + + def test_exception(self): + def func(x): + return paddle.dot(x, x) + + x0 = np.random.random(size=[2]).astype('float32') + H0 = np.array([[2.0, 0.0], [0.0, 0.9]]).astype('float32') + + # test dtype is not float32 or float64 + x1 = np.random.random(size=[2]).astype('int32') + self.assertRaises( + ValueError, test_static_graph, func, x1, dtype='int32') + + # test initial_inverse_hessian_estimate is good + results = test_static_graph_H0(func, x0, H0, dtype='float32') + self.assertTrue(np.allclose([0., 0.], results[2])) + self.assertTrue(results[0][0]) + + # test initial_inverse_hessian_estimate is bad and float64 + x2 = np.random.random(size=[2]).astype('float64') + H1 = np.array([[1.0, 2.0], [3.0, 1.0]]).astype('float64') + self.assertRaises( + ValueError, test_static_graph_H0, func, x2, H0=H1, dtype='float64') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/incubate/optimizer/__init__.py b/python/paddle/incubate/optimizer/__init__.py index fd5332986e..f5c24f8589 100644 --- a/python/paddle/incubate/optimizer/__init__.py +++ b/python/paddle/incubate/optimizer/__init__.py @@ -15,5 +15,6 @@ from .lookahead import LookAhead # noqa: F401 from .modelaverage import ModelAverage # noqa: F401 from .distributed_fused_lamb import DistributedFusedLamb # noqa: F401 +from . import functional # noqa: F401 __all__ = [] diff --git a/python/paddle/incubate/optimizer/functional/__init__.py b/python/paddle/incubate/optimizer/functional/__init__.py new file mode 100644 index 0000000000..fc863a923d --- /dev/null +++ b/python/paddle/incubate/optimizer/functional/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .bfgs import minimize_bfgs # noqa: F401 +from .lbfgs import minimize_lbfgs # noqa: F401 + +__all__ = ['minimize_bfgs', 'minimize_lbfgs'] diff --git a/python/paddle/incubate/optimizer/functional/bfgs.py b/python/paddle/incubate/optimizer/functional/bfgs.py new file mode 100644 index 0000000000..9afcc2240a --- /dev/null +++ b/python/paddle/incubate/optimizer/functional/bfgs.py @@ -0,0 +1,195 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from .line_search import strong_wolfe +from .utils import _value_and_gradient, check_input_type, check_initial_inverse_hessian_estimate + +import paddle + + +def minimize_bfgs(objective_func, + initial_position, + max_iters=50, + tolerance_grad=1e-7, + tolerance_change=1e-9, + initial_inverse_hessian_estimate=None, + line_search_fn='strong_wolfe', + max_line_search_iters=50, + initial_step_length=1.0, + dtype='float32', + name=None): + r""" + Minimizes a differentiable function `func` using the BFGS method. + The BFGS is a quasi-Newton method for solving an unconstrained + optimization problem over a differentiable function. + Closely related is the Newton method for minimization. Consider the iterate + update formula + .. math:: + x_{k+1} = x_{k} + H \nabla{f}, + If $H$ is the inverse Hessian of $f$ at $x_{k}$, then it's the Newton method. + If $H$ is symmetric and positive definite, used as an approximation of the inverse Hessian, then + it's a quasi-Newton. In practice, the approximated Hessians are obtained + by only using the gradients, over either whole or part of the search + history, the former is BFGS. + + Reference: + Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. + pp140: Algorithm 6.1 (BFGS Method). + + Following summarizes the the main logic of the program based on BFGS. Note: _k represents value of + k_th iteration, ^T represents the transposition of a vector or matrix. + repeat + p_k = H_k * g_k + alpha = strong_wolfe(f, x_k, p_k) + x_k+1 = x_k + alpha * p_k + s_k = x_k+1 - x_k + y_k = g_k+1 - g_k + rho_k = 1 / (s_k^T * y_k) + V_k^T = I - rho_k * s_k * y_k^T + V_k = I - rho_k * y_k * s_k^T + H_k+1 = V_k^T * H_k * V_k + rho_k * s_k * s_k^T + check_converge + end + + Args: + objective_func: the objective function to minimize. ``func`` accepts + a multivariate input and returns a scalar. + initial_position (Tensor): the starting point of the iterates. For methods like Newton and quasi-Newton + the initial trial step length should always be 1.0. + max_iters (int): the maximum number of minimization iterations. + tolerance_grad (float): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. + tolerance_change (float): terminates if the change of function value/position/parameter between + two iterations is smaller than this value. + initial_inverse_hessian_estimate (Tensor): the initial inverse hessian approximation at initial_position. + It must be symmetric and positive definite. + line_search_fn (str): indicate which line search method to use, only support 'strong wolfe' right now. May support + 'Hager Zhang' in the futrue. + max_line_search_iters (int): the maximum number of line search iterations. + initial_step_length (float): step length used in first iteration of line search. different initial_step_length + may cause different optimal result. + dtype ('float32' | 'float64'): In static graph, float64 will be convert to float32 due to paddle.assign limit. + + Returns: + is_converge (bool): Indicates whether found the minimum within tolerance. 
+ num_func_calls (int): number of objective function called. + position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of + the objective function regrading to the initial position. + objective_value (Tensor): objective function value at the `position`. + objective_gradient (Tensor): objective function gradient at the `position`. + inverse_hessian_estimate (Tensor): the estimate of inverse hessian at the `position`. + + Examples: + .. code-block:: python + + import paddle + + def func(x): + return paddle.dot(x, x) + + x0 = paddle.to_tensor([1.3, 2.7]) + results = paddle.optimizer.functional.minimize_bfgs(func, x0) + print("is_converge: ", results[0]) + print("the minimum of func is: ", results[2]) + # is_converge: is_converge: Tensor(shape=[1], dtype=bool, place=Place(gpu:0), stop_gradient=True, + # [True]) + # the minimum of func is: Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [0., 0.]) + """ + + if dtype not in ['float32', 'float64']: + raise ValueError( + "The dtype must be 'float32' or 'float64', but the specified is {}.". + format(dtype)) + + op_name = 'minimize_bfgs' + check_input_type(initial_position, 'initial_position', op_name) + + I = paddle.eye(initial_position.shape[0], dtype=dtype) + if initial_inverse_hessian_estimate is None: + initial_inverse_hessian_estimate = I + else: + check_input_type(initial_inverse_hessian_estimate, + 'initial_inverse_hessian_estimate', op_name) + check_initial_inverse_hessian_estimate(initial_inverse_hessian_estimate) + + Hk = paddle.assign(initial_inverse_hessian_estimate) + xk = initial_position + + value, g1 = _value_and_gradient(objective_func, xk) + num_func_calls = paddle.full(shape=[1], fill_value=1, dtype='int64') + + # when the dim of x is 1000, it needs more than 30 iters to get all element converge to minimum. + k = paddle.full(shape=[1], fill_value=0, dtype='int64') + done = paddle.full(shape=[1], fill_value=False, dtype='bool') + is_converge = paddle.full(shape=[1], fill_value=False, dtype='bool') + + def cond(k, done, is_converge, num_func_calls, xk, value, g1, Hk): + return (k < max_iters) & ~done + + def body(k, done, is_converge, num_func_calls, xk, value, g1, Hk): + ############# compute pk ############# + pk = -paddle.matmul(Hk, g1) + + ############# compute alpha by line serach ############# + if line_search_fn == 'strong_wolfe': + alpha, value, g2, ls_func_calls = strong_wolfe( + f=objective_func, + xk=xk, + pk=pk, + initial_step_length=initial_step_length, + dtype=dtype) + else: + raise NotImplementedError( + "Currently only support line_search_fn = 'strong_wolfe', but the specified is '{}'". + format(line_search_fn)) + num_func_calls += ls_func_calls + + ############# update Hk ############# + sk = alpha * pk + yk = g2 - g1 + + xk = xk + sk + g1 = g2 + + sk = paddle.unsqueeze(sk, 0) + yk = paddle.unsqueeze(yk, 0) + + rhok_inv = paddle.dot(yk, sk) + rhok = paddle.static.nn.cond( + rhok_inv == 0., lambda: paddle.full(shape=[1], fill_value=1000.0, dtype=dtype), lambda: 1. 
/ rhok_inv) + + Vk_transpose = I - rhok * sk * yk.t() + Vk = I - rhok * yk * sk.t() + Hk = paddle.matmul(paddle.matmul(Vk_transpose, Hk), + Vk) + rhok * sk * sk.t() + + k += 1 + + ############# check convergence ############# + gnorm = paddle.linalg.norm(g1, p=np.inf) + pk_norm = paddle.linalg.norm(pk, p=np.inf) + paddle.assign(done | (gnorm < tolerance_grad) | + (pk_norm < tolerance_change), done) + paddle.assign(done, is_converge) + # when alpha=0, there is no chance to get xk change. + paddle.assign(done | (alpha == 0.), done) + return [k, done, is_converge, num_func_calls, xk, value, g1, Hk] + + paddle.static.nn.while_loop( + cond=cond, + body=body, + loop_vars=[k, done, is_converge, num_func_calls, xk, value, g1, Hk]) + return is_converge, num_func_calls, xk, value, g1, Hk diff --git a/python/paddle/incubate/optimizer/functional/lbfgs.py b/python/paddle/incubate/optimizer/functional/lbfgs.py new file mode 100644 index 0000000000..90ae452653 --- /dev/null +++ b/python/paddle/incubate/optimizer/functional/lbfgs.py @@ -0,0 +1,239 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from .line_search import strong_wolfe +from .utils import _value_and_gradient, check_input_type, check_initial_inverse_hessian_estimate + +import paddle + + +def minimize_lbfgs(objective_func, + initial_position, + history_size=100, + max_iters=50, + tolerance_grad=1e-8, + tolerance_change=1e-8, + initial_inverse_hessian_estimate=None, + line_search_fn='strong_wolfe', + max_line_search_iters=50, + initial_step_length=1.0, + dtype='float32', + name=None): + r"""Minimizes a differentiable function `func` using the L-BFGS method. + The L-BFGS is simalar as BFGS, the only difference is that L-BFGS use historical + sk, yk, rhok rather than H_k-1 to compute Hk. + Reference: + Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. + pp179: Algorithm 7.5 (L-BFGS). + + Following summarizes the the main logic of the program based on L-BFGS.Note: _k represents + value of k_th iteration, ^T represents the transposition of a vector or matrix. + repeat + compute p_k by two-loop recursion + alpha = strong_wolfe(f, x_k, p_k) + x_k+1 = x_k + alpha * p_k + s_k = x_k+1 - x_k + y_k = g_k+1 - g_k + rho_k = 1 / (s_k^T * y_k) + update sk_vec, yk_vec, rhok_vec + check_converge + end + + Args: + objective_func: the objective function to minimize. ``func`` accepts + a multivariate input and returns a scalar. + initial_position (Tensor): the starting point of the iterates. For methods like Newton and quasi-Newton + the initial trial step length should always be 1.0 . + history_size (Scalar): the number of stored vector pairs {si,yi}. + max_iters (Scalar): the maximum number of minimization iterations. + tolerance_grad (Scalar): terminates if the gradient norm is smaller than + this. Currently gradient norm uses inf norm. 
+ tolerance_change (Scalar): terminates if the change of function value/position/parameter between + two iterations is smaller than this value. + initial_inverse_hessian_estimate (Tensor): the initial inverse hessian approximation. + line_search_fn (str): indicate which line search method to use, only support 'strong wolfe' right now. May support + 'Hager Zhang' in the futrue. + max_line_search_iters (Scalar): the maximum number of line search iterations. + initial_step_length: step length used in first iteration of line search. different initial_step_length + may cause different optimal result. + dtype ('float' | 'float32' | 'float64' | 'double'): the data + type to be used. + + Returns: + is_converge (bool): Indicates whether found the minimum within tolerance. + num_func_calls (int): number of objective function called. + position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of + the objective function regrading to the initial position. + objective_value (Tensor): objective function value at the `position`. + objective_gradient (Tensor): objective function gradient at the `position`. + + Examples: + .. code-block:: python + + import paddle + + def func(x): + return paddle.dot(x, x) + + x0 = paddle.to_tensor([1.3, 2.7]) + results = paddle.optimizer.functional.minimize_lbfgs(func, x0) + print("is_converge: ", results[0]) + print("the minimum of func is: ", results[2]) + # is_converge: is_converge: Tensor(shape=[1], dtype=bool, place=Place(gpu:0), stop_gradient=True, + # [True]) + # the minimum of func is: Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [0., 0.]) + """ + if dtype not in ['float32', 'float64']: + raise ValueError( + "The dtype must be 'float32' or 'float64', but the specified is {}.". + format(dtype)) + + op_name = 'minimize_lbfgs' + check_input_type(initial_position, 'initial_position', op_name) + + if initial_inverse_hessian_estimate is None: + H0 = paddle.eye(initial_position.shape[0], dtype=dtype) + else: + check_input_type(initial_inverse_hessian_estimate, + 'initial_inverse_hessian_estimate', op_name) + check_initial_inverse_hessian_estimate(initial_inverse_hessian_estimate) + H0 = initial_inverse_hessian_estimate + + xk = initial_position + value, g1 = _value_and_gradient(objective_func, xk) + + k = paddle.full(shape=[1], fill_value=0, dtype='int64') + done = paddle.full(shape=[1], fill_value=False, dtype='bool') + is_converge = paddle.full(shape=[1], fill_value=False, dtype='bool') + num_func_calls = paddle.full(shape=[1], fill_value=1, dtype='int64') + + history_size = paddle.full( + shape=[1], fill_value=history_size, dtype='int64') + head = paddle.full(shape=[1], fill_value=1, dtype='int64') + tail = paddle.full(shape=[1], fill_value=0, dtype='int64') + + shape = initial_position.shape[0] + # Use tensor as array of fixed length, rather than flexible tensor array. Because in static mode, + # tensor array will produce tensor of shape[-1], which will cause error when calling jacobian. In this way, can not use append + # or pop, so we need head and tail to record where is the newest data and where is the oldest. + # Totally speaking, realized a stack by array. 
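For readers new to L-BFGS, the fixed-length buffers allocated just below (sk_vec, yk_vec, rhok_vec, indexed by head/tail) implement the standard two-loop recursion (Nocedal & Wright, Algorithm 7.4). A plain-NumPy sketch of that recursion, for illustration only — not part of the patch; the helper name and list-based storage are hypothetical:

    import numpy as np

    def two_loop_recursion(g, s_list, y_list, rho_list, H0):
        # s_list / y_list / rho_list hold the m most recent (s, y, 1 / s^T y)
        # triples, ordered oldest first.
        q = g.copy()
        alphas = []
        # First loop: newest to oldest, fold each curvature pair into q.
        for s, y, rho in zip(reversed(s_list), reversed(y_list), reversed(rho_list)):
            a = rho * np.dot(s, q)
            alphas.append(a)
            q = q - a * y
        # Scale by the initial inverse Hessian estimate H0.
        r = H0 @ q
        # Second loop: oldest to newest, add the corrections back in.
        for s, y, rho, a in zip(s_list, y_list, rho_list, reversed(alphas)):
            beta = rho * np.dot(y, r)
            r = r + s * (a - beta)
        # r approximates H_k * g_k; the search direction is its negation.
        return -r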
+ sk_vec = paddle.zeros((history_size + 1, shape), dtype=dtype) + yk_vec = paddle.zeros((history_size + 1, shape), dtype=dtype) + rhok_vec = paddle.zeros((history_size + 1, 1), dtype=dtype) + ai_vec = paddle.zeros((history_size + 1, 1), dtype=dtype) + + def cond(k, done, is_converge, num_func_calls, value, xk, g1, sk_vec, + yk_vec, rhok_vec, head, tail): + return (k < max_iters) & ~done + + def body(k, done, is_converge, num_func_calls, value, xk, g1, sk_vec, + yk_vec, rhok_vec, head, tail): + # use assign to cut off the relevance between g1 and q, or they will change together. + + ############# compute p_k by two-loop recursion ############# + q = paddle.assign(g1) + # In a array circle, the index may out of range, so must use mod. + i = paddle.full( + shape=[1], fill_value=(head - 1).mod(history_size), dtype='int64') + + def cond(i, q): + return i != tail + + def body(i, q): + ai_vec[i] = rhok_vec[i] * paddle.dot(sk_vec[i], q) + q = q - ai_vec[i] * yk_vec[i] + i = (i - 1).mod(history_size) + return i, q + + paddle.static.nn.while_loop(cond=cond, body=body, loop_vars=[i, q]) + + r = paddle.matmul(H0, q) + + i = paddle.full(shape=[1], fill_value=tail + 1, dtype='int64') + + def cond(i, r): + return i != head + + def body(i, r): + beta = rhok_vec[i] * paddle.dot(yk_vec[i], r) + r = r + sk_vec[i] * (ai_vec[i] - beta) + i = (i + 1).mod(history_size) + return i, r + + paddle.static.nn.while_loop(cond=cond, body=body, loop_vars=[i, r]) + + pk = -r + + ############# compute alpha by line serach ############# + if line_search_fn == 'strong_wolfe': + alpha, value, g2, ls_func_calls = strong_wolfe( + f=objective_func, + xk=xk, + pk=pk, + initial_step_length=initial_step_length, + dtype=dtype) + else: + raise NotImplementedError( + "Currently only support line_search_fn = 'strong_wolfe', but the specified is '{}'". + format(line_search_fn)) + paddle.assign(num_func_calls + ls_func_calls, num_func_calls) + + ############# update sk_vec, yk_vec, rhok_vec ############# + sk = alpha * pk + yk = g2 - g1 + + rhok_inv = paddle.dot(yk, sk) + rhok = paddle.static.nn.cond( + rhok_inv == 0., lambda: paddle.full(shape=[1], fill_value=1000.0, dtype=dtype), lambda: 1. / rhok_inv) + + sk_vec[head] = sk + yk_vec[head] = yk + rhok_vec[head] = rhok + head = (head + 1) % history_size + + def true_fn(tail): + paddle.assign(tail + 1, tail) + + # when array is full, the tail should move forward too. + paddle.static.nn.cond(head == tail, lambda: true_fn(tail), None) + + xk = xk + sk + g1 = g2 + k += 1 + + ############# check convergence ############# + gnorm = paddle.linalg.norm(g1, p=np.inf) + pk_norm = paddle.linalg.norm(pk, p=np.inf) + paddle.assign(done | (gnorm < tolerance_grad) | + (pk_norm < tolerance_change), done) + paddle.assign(done, is_converge) + # when alpha=0, there is no chance to get xk change. + paddle.assign(done | (alpha == 0.), done) + + return [ + k, done, is_converge, num_func_calls, value, xk, g1, sk_vec, yk_vec, + rhok_vec, head, tail + ] + + paddle.static.nn.while_loop( + cond=cond, + body=body, + loop_vars=[ + k, done, is_converge, num_func_calls, value, xk, g1, sk_vec, yk_vec, + rhok_vec, head, tail + ]) + return is_converge, num_func_calls, xk, value, g1 diff --git a/python/paddle/incubate/optimizer/functional/line_search.py b/python/paddle/incubate/optimizer/functional/line_search.py new file mode 100644 index 0000000000..d42732e605 --- /dev/null +++ b/python/paddle/incubate/optimizer/functional/line_search.py @@ -0,0 +1,297 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .utils import _value_and_gradient +import paddle + + +def cubic_interpolation_(x1, f1, g1, x2, f2, g2): + r"""Cubic interpolation between (x1, f1, g1) and (x2, f2, g2). + Use two points and their gradient to determine a cubic function and get the minimun point + between them in the cubic curve. + + Reference: + Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. + pp59: formula 3.59 + + Args: + x1, f1, g1: point1's position, value and gradient. + x2, f2, g2: point2's position, value and gradient. + Returns: + min_pos: the minimun point between the specified points in the cubic curve. + """ + xmin, xmax = paddle.static.nn.cond(x1 <= x2, lambda: (x1, x2), + lambda: (x2, x1)) + d1 = g1 + g2 - 3 * (f1 - f2) / (x1 - x2) + d2_square = d1**2 - g1 * g2 + + def true_func1(): + d2 = d2_square.sqrt() + + def true_fn2(): + return x2 - (x2 - x1) * ((g2 + d2 - d1) / (g2 - g1 + 2 * d2)) + + def false_fn2(): + return x1 - (x1 - x2) * ((g1 + d2 - d1) / (g1 - g2 + 2 * d2)) + + pred = paddle.less_equal(x=x1, y=x2) + min_pos = paddle.static.nn.cond(pred, true_fn2, false_fn2) + + return paddle.minimum(paddle.maximum(min_pos, xmin), xmax) + + def false_func1(): + return (xmin + xmax) / 2. + + min_pos = paddle.static.nn.cond(d2_square >= 0., true_func1, false_func1) + return min_pos + + +def strong_wolfe(f, + xk, + pk, + max_iters=20, + tolerance_change=1e-8, + initial_step_length=1.0, + c1=1e-4, + c2=0.9, + alpha_max=10, + dtype='float32'): + r"""Implements of line search algorithm that satisfies the strong Wolfe conditions using double zoom. + + Reference: + Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. + pp60: Algorithm 3.5 (Line Search Algorithm). + + Args: + f: the objective function to minimize. ``f`` accepts a multivariate input and returns a scalar. + xk (Tensor): the starting point of the iterates. + pk (Tensor): search direction. + max_iters (Scalar): the maximum number of iterations. + tolerance_grad (Scalar): terminates if the gradient norm is smaller than + this. Currently gradient norm uses inf norm. + tolerance_change (Scalar): terminates if the change of function value/position/parameter between + two iterations is smaller than this value. + initial_step_length (Scalar): step length used in first iteration. + c1 (Scalar): parameter for sufficient decrease condition. + c2 (Scalar): parameter for curvature condition. + alpha_max (float): max step length. + dtype ('float32' | 'float64'): the datatype to be used. + + Returns: + num_func_calls (float): number of objective function called in line search process. + a_star(Tensor): optimal step length, or 0. if the line search algorithm did not converge. + phi_star (Tensor): phi at a_star. + derphi_star (Tensor): derivative of phi at a_star. + + Following summarizes the essentials of the strong Wolfe line search algorithm. + Some notations used in the description: + + - `f` denotes the objective function. 
+ - `phi` is a function of step size alpha, restricting `f` on a line. + + phi = f(xk + a * pk), + where xk is the position of k'th iterate, pk is the line search direction(decent direction), + and a is the step size. + - a : substitute of alpha + - a1 is a of last iteration, which is alpha_(i-1). + - a2 is a of current iteration, which is alpha_i. + - a_lo is a in left position when calls zoom, which is alpha_low. + - a_hi is a in right position when calls zoom, which is alpha_high. + + Line Search Algorithm: + repeat + Compute phi(a2) and derphi(a2). + 1. If phi(a2) > phi(0) + c_1 * a2 * phi'(0) or [phi(a2) >= phi(a1) and i > 1], + a_star= zoom(a1, a2) and stop; + + 2. If |phi'(a2)| <= -c_2 * phi'(0), + a_star= a2 and stop; + + 3. If phi'(a2) >= 0, + a_star= zoom(a2, a1) and stop; + + a1 = a2 + a2 = min(2 * a2, a2) + i = i + 1 + end(repeat) + + zoom(a_lo, a_hi) Algorithm: + repeat + aj = cubic_interpolation(a_lo, a_hi) + Compute phi(aj) and derphi(aj). + 1. If phi(aj) > phi(0) + c_1 * aj * phi'(0) or phi(aj) >= phi(a_lo), + then a_hi <- aj; + 2. + 2.1. If |phi'(aj)| <= -c_2 * phi'(0), then a_star= a2 and stop; + + 2.2. If phi'(aj) * (a2 - a1) >= 0, then a_hi = a_lo + + a_lo = aj; + end(repeat) + """ + + def phi_and_derphi(a): + r"""Compute function value and derivative of phi at a. + phi = f(xk + a * pk) + phi'(a) = f'(xk + a * pk) * pk + """ + phi_value, f_grad = _value_and_gradient(f, xk + a * pk) + phi_grad = paddle.dot(f_grad, pk) + # return f_grad to be used in bfgs/l-bfgs to compute yk to avoid computint repeatly. + return phi_value, f_grad, phi_grad + + def zoom(a_lo, phi_lo, derphi_lo, derf_lo, a_hi, phi_hi, derphi_hi, phi_0, + derphi_0): + # find the exact a from the bracket [a_lo, a_hi] + max_zoom_iters = max_iters + j = paddle.full(shape=[1], fill_value=0, dtype='int64') + done_zoom = paddle.full(shape=[1], fill_value=False, dtype='bool') + + def cond_zoom(j, done_zoom, a_lo, phi_lo, derphi_lo, derf_lo, a_hi, + phi_hi, derphi_hi): + pred = paddle.abs(a_hi - a_lo) < tolerance_change + paddle.assign(done_zoom | pred, done_zoom) + return (j < max_zoom_iters) & ~done_zoom + + def body_zoom(j, done_zoom, a_lo, phi_lo, derphi_lo, derf_lo, a_hi, + phi_hi, derphi_hi): + aj = cubic_interpolation_(a_lo, phi_lo, derphi_lo, a_hi, phi_hi, + derphi_hi) # 21 + min_change = 0.1 * paddle.abs(a_hi - a_lo) + pred = paddle.minimum( + paddle.abs(aj - a_lo), paddle.abs(aj - a_hi)) < min_change + aj = paddle.static.nn.cond(pred, lambda: 0.5 * (a_lo + a_hi), + lambda: aj) + + phi_j, derf_j, derphi_j = phi_and_derphi(aj) + + def true_fn(): + # use assing to modify the variable in-place + paddle.assign(aj, a_hi) + paddle.assign(phi_j, phi_hi) + paddle.assign(derphi_j, derphi_hi) + + def false_fn(a_lo, done_zoom): + pred3 = (paddle.abs(derphi_j) <= -c2 * derphi_0) + paddle.assign(pred3, done_zoom) + + def true_fn(): + paddle.assign(a_lo, a_hi) + paddle.assign(phi_lo, phi_hi) + paddle.assign(derphi_lo, derphi_hi) + + pred4 = ~done_zoom & (derphi_j * (a_hi - a_lo) >= 0) + paddle.static.nn.cond(pred4, true_fn, None) + + paddle.assign(aj, a_lo) + paddle.assign(phi_j, phi_lo) + paddle.assign(derphi_j, derphi_lo) + paddle.assign(derf_j, derf_lo) + + pred2 = (phi_j > phi_0 + c1 * aj * derphi_0) | (phi_j >= phi_lo) + paddle.static.nn.cond(pred2, true_fn, + lambda: false_fn(a_lo, done_zoom)) + j = paddle.static.nn.cond(done_zoom, lambda: j, lambda: j + 1) + return [ + j, done_zoom, a_lo, phi_lo, derphi_lo, derf_lo, a_hi, phi_hi, + derphi_hi + ] + + paddle.static.nn.while_loop( + cond=cond_zoom, + body=body_zoom, 
+ loop_vars=[ + j, done_zoom, a_lo, phi_lo, derphi_lo, derf_lo, a_hi, phi_hi, + derphi_hi + ]) + # j is the number of object function called in zoom. + return j + + alpha_max = paddle.full(shape=[1], fill_value=alpha_max, dtype=dtype) + + a1 = paddle.full(shape=[1], fill_value=0., dtype=dtype) + a2 = paddle.full(shape=[1], fill_value=initial_step_length, dtype=dtype) + + phi_1, derf_1, derphi_1 = phi_and_derphi(a1) + # use assign to cut off binding between two variables + phi_0 = paddle.assign(phi_1) + derphi_0 = paddle.assign(derphi_1) + ls_func_calls = paddle.full(shape=[1], fill_value=1, dtype='int64') + + # If not found the a_star, will return alpha=0 and f(xk), derf(xk) + a_star = paddle.full(shape=[1], fill_value=0, dtype=dtype) + phi_star = paddle.assign(phi_1) + derf_star = paddle.assign(derf_1) + + i = paddle.full(shape=[1], fill_value=0, dtype='int64') + done = paddle.full(shape=[1], fill_value=False, dtype='bool') + + def cond(i, ls_func_calls, a1, a2, phi_1, derf_1, done): + return (i < max_iters) & ~done + + def body(i, ls_func_calls, a1, a2, phi_1, derf_1, done): + phi_2, derf_2, derphi_2 = phi_and_derphi(a2) + paddle.assign(ls_func_calls + 1, ls_func_calls) + paddle.assign(done | paddle.any(paddle.isinf(phi_2)), done) + + def true_fn1(): + j = zoom(a1, phi_1, derphi_1, derf_1, a2, phi_2, derphi_2, phi_0, + derphi_0) + paddle.assign(a1, a_star) + paddle.assign(phi_1, phi_star) + paddle.assign(derf_1, derf_star) + paddle.assign(ls_func_calls + j, ls_func_calls) + + pred1 = ~done & ((phi_2 > phi_0 + c1 * a2 * derphi_0) | ( + (phi_2 >= phi_0) & (i > 1))) + paddle.assign(done | pred1, done) + paddle.static.nn.cond(pred1, true_fn1, None) + + def true_fn2(): + paddle.assign(a2, a_star) + paddle.assign(phi_2, phi_star) + paddle.assign(derf_2, derf_star) + + pred2 = ~done & (paddle.abs(derphi_2) <= -c2 * derphi_0) + paddle.assign(done | pred2, done) + paddle.static.nn.cond(pred2, true_fn2, None) + + def true_fn3(): + j = zoom(a2, phi_2, derphi_2, derf_2, a1, phi_1, derphi_1, phi_0, + derphi_0) + paddle.assign(a2, a_star) + paddle.assign(phi_2, phi_star) + paddle.assign(derf_2, derf_star) + paddle.assign(ls_func_calls + j, ls_func_calls) + + pred3 = ~done & (derphi_2 >= 0) + paddle.assign(done | pred3, done) + paddle.static.nn.cond(pred3, true_fn3, None) + + def false_fn(): + paddle.assign(a2, a1) + paddle.assign(phi_2, phi_1) + paddle.assign(derf_2, derf_1) + paddle.assign(paddle.minimum(2 * a2, alpha_max), a2) + paddle.assign(i + 1, i) + + paddle.static.nn.cond(done, None, false_fn) + return [i, ls_func_calls, a1, a2, phi_1, derf_1, done] + + paddle.static.nn.while_loop( + cond=cond, + body=body, + loop_vars=[i, ls_func_calls, a1, a2, phi_1, derf_1, done]) + + return a_star, phi_star, derf_star, ls_func_calls diff --git a/python/paddle/incubate/optimizer/functional/utils.py b/python/paddle/incubate/optimizer/functional/utils.py new file mode 100644 index 0000000000..c197f8a1ac --- /dev/null +++ b/python/paddle/incubate/optimizer/functional/utils.py @@ -0,0 +1,96 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.autograd.functional import vjp, Jacobian +from paddle.fluid.framework import Variable +from paddle.fluid.data_feeder import check_type, check_dtype + + +def check_input_type(input, name, op_name): + r"""Check whether the input is tensor or variable.""" + if paddle.in_dynamic_mode(): + if not isinstance(input, paddle.Tensor): + raise ValueError("The input: {} must be tensor.".format(input)) + else: + check_type(input, name, Variable, op_name) + + +def check_initial_inverse_hessian_estimate(H0): + r"""Check whether the specified initial_inverse_hessian_estimate is symmetric and positive definite. + Raise errors when precondition not met. + + Note: + In static graph can not raise error directly, so use py_func make raise_func as a op, + and use paddle.static.nn.cond to decide if put the op in net. + cholesky is the fast way to check positive definition, but in static graph can not catch + exception to raise value error, so use eigvals rather than cholesky in static graph. + """ + is_symmetric = paddle.all(paddle.equal(H0, H0.t())) + + def raise_func(): + raise ValueError( + "The initial_inverse_hessian_estimate should be symmetric and positive definite, but the specified is not." + ) + + if paddle.in_dynamic_mode(): + if not is_symmetric: + raise_func() + try: + paddle.linalg.cholesky(H0) + except RuntimeError as error: + raise_func() + else: + + def create_tmp_var(program, name, dtype, shape): + return program.current_block().create_var( + name=name, dtype=dtype, shape=shape) + + out_var = create_tmp_var( + paddle.static.default_main_program(), + name='output', + dtype='float32', + shape=[-1]) + + def false_fn(): + paddle.static.nn.py_func( + func=raise_func, x=is_symmetric, out=out_var) + + paddle.static.nn.cond(is_symmetric, None, false_fn) + # eigvals only support cpu + paddle.set_device("cpu") + eigvals = paddle.paddle.linalg.eigvals(H0) + is_positive = paddle.all(eigvals.real() > 0.) and paddle.all( + eigvals.imag() == 0.) + paddle.static.nn.cond(is_positive, None, false_fn) + + +def _value_and_gradient(f, x, v=None): + r"""Compute function value and gradient of f at x. + + Args: + f (Callable): the objective function. + x (Tensor): the input tensor. + Returns: + value: a tensor that holds the function value. + gradient: a tensor that holds the function gradients. + """ + if paddle.in_dynamic_mode(): + value, gradient = vjp(f, x, v=v) + gradient = gradient[0] + else: + JJ = Jacobian(f, x) + gradient = JJ[:][0] + value = f(x) + return value, gradient diff --git a/python/setup.py.in b/python/setup.py.in index 7c1232c1d4..3e59e22fcb 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -362,6 +362,7 @@ packages=['paddle', 'paddle.incubate.nn', 'paddle.incubate.nn.functional', 'paddle.incubate.nn.layer', + 'paddle.incubate.optimizer.functional', 'paddle.io', 'paddle.optimizer', 'paddle.nn', -- GitLab
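For quick reference, a minimal dynamic-graph usage sketch of the two entry points added by this patch, modeled on the unit tests above (the quadratic objective and starting point are purely illustrative):

    import paddle
    from paddle.incubate.optimizer.functional import minimize_bfgs, minimize_lbfgs

    def func(x):
        # Simple convex objective f(x) = x . x, minimized at the origin.
        return paddle.dot(x, x)

    x0 = paddle.to_tensor([1.3, 2.7])

    # minimize_bfgs returns (is_converge, num_func_calls, position, value,
    # gradient, inverse_hessian_estimate).
    results = minimize_bfgs(func, x0)
    print("BFGS converged:", results[0].numpy(), "argmin:", results[2].numpy())

    # minimize_lbfgs returns (is_converge, num_func_calls, position, value, gradient).
    results = minimize_lbfgs(func, x0, history_size=10)
    print("L-BFGS converged:", results[0].numpy(), "argmin:", results[2].numpy())

Note that the new modules are packaged under paddle.incubate.optimizer.functional (see setup.py.in and the unit tests), whereas the in-file docstring examples reference paddle.optimizer.functional.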