From e7928a067d44bf68a433052d7177600a108eccdf Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Thu, 31 Mar 2022 14:23:36 +0800 Subject: [PATCH] [New API]: miminize_bfgs and miminize_lbfgs (#40710) * [New API]: miminize_bfgs and miminize_lbfgs * modify for python module call correctly * add functional package, add error raise in static_graph, change assign to set_value * unify static_graph and dygraph, fix bug when x or H0 is float64 * now only accept input is tensor, put check args in utils.py, put exception test together * temp * add more detailed algorithm illustration and comment, reduce test case to limit test time in 15s * change in_dygraph_mode to in_dynamic_mode * fix bug of sample code; reduce test case to reduce test time * change dir to incubate --- .../paddle/fluid/tests/unittests/test_bfgs.py | 165 ++++++++++ .../fluid/tests/unittests/test_lbfgs.py | 164 ++++++++++ python/paddle/incubate/optimizer/__init__.py | 1 + .../incubate/optimizer/functional/__init__.py | 18 ++ .../incubate/optimizer/functional/bfgs.py | 195 ++++++++++++ .../incubate/optimizer/functional/lbfgs.py | 239 ++++++++++++++ .../optimizer/functional/line_search.py | 297 ++++++++++++++++++ .../incubate/optimizer/functional/utils.py | 96 ++++++ python/setup.py.in | 1 + 9 files changed, 1176 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_bfgs.py create mode 100644 python/paddle/fluid/tests/unittests/test_lbfgs.py create mode 100644 python/paddle/incubate/optimizer/functional/__init__.py create mode 100644 python/paddle/incubate/optimizer/functional/bfgs.py create mode 100644 python/paddle/incubate/optimizer/functional/lbfgs.py create mode 100644 python/paddle/incubate/optimizer/functional/line_search.py create mode 100644 python/paddle/incubate/optimizer/functional/utils.py diff --git a/python/paddle/fluid/tests/unittests/test_bfgs.py b/python/paddle/fluid/tests/unittests/test_bfgs.py new file mode 100644 index 00000000000..c89f7205f08 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_bfgs.py @@ -0,0 +1,165 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F + +from paddle.incubate.optimizer.functional.bfgs import minimize_bfgs + +np.random.seed(123) + + +def test_static_graph(func, x0, line_search_fn='strong_wolfe', dtype='float32'): + dimension = x0.shape[0] + paddle.enable_static() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + X = paddle.static.data(name='x', shape=[dimension], dtype=dtype) + Y = minimize_bfgs(func, X, line_search_fn=line_search_fn, dtype=dtype) + + exe = paddle.static.Executor() + exe.run(startup) + return exe.run(main, feed={'x': x0}, fetch_list=[Y]) + + +def test_static_graph_H0(func, x0, H0, dtype='float32'): + paddle.enable_static() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + X = paddle.static.data(name='x', shape=[x0.shape[0]], dtype=dtype) + H = paddle.static.data( + name='h', shape=[H0.shape[0], H0.shape[1]], dtype=dtype) + Y = minimize_bfgs( + func, X, initial_inverse_hessian_estimate=H, dtype=dtype) + + exe = paddle.static.Executor() + exe.run(startup) + return exe.run(main, feed={'x': x0, 'h': H0}, fetch_list=[Y]) + + +def test_dynamic_graph(func, + x0, + H0=None, + line_search_fn='strong_wolfe', + dtype='float32'): + paddle.disable_static() + x0 = paddle.to_tensor(x0) + if H0 is not None: + H0 = paddle.to_tensor(H0) + return minimize_bfgs( + func, + x0, + initial_inverse_hessian_estimate=H0, + line_search_fn=line_search_fn, + dtype=dtype) + + +class TestBfgs(unittest.TestCase): + def test_quadratic_nd(self): + for dimension in [1, 10]: + minimum = np.random.random(size=[dimension]).astype('float32') + scale = np.exp(np.random.random(size=[dimension]).astype('float32')) + + def func(x): + minimum_ = paddle.assign(minimum) + scale_ = paddle.assign(scale) + return paddle.sum( + paddle.multiply(scale_, (F.square_error_cost(x, minimum_)))) + + x0 = np.random.random(size=[dimension]).astype('float32') + results = test_static_graph(func=func, x0=x0) + self.assertTrue(np.allclose(minimum, results[2])) + + results = test_dynamic_graph(func=func, x0=x0) + self.assertTrue(np.allclose(minimum, results[2].numpy())) + + def test_inf_minima(self): + extream_point = np.array([-1, 2]).astype('float32') + + def func(x): + # df = 3(x - 1.01)(x - 0.99) + # f = x^3 - 3x^2 + 3*1.01*0.99x + return x * x * x / 3.0 - ( + extream_point[0] + extream_point[1] + ) * x * x / 2 + extream_point[0] * extream_point[1] * x + + x0 = np.array([-1.7]).astype('float32') + results = test_static_graph(func, x0) + self.assertFalse(results[0][0]) + + def test_multi_minima(self): + def func(x): + # df = 12(x + 1.1)(x - 0.2)(x - 0.8) + # f = 3*x^4+0.4*x^3-5.46*x^2+2.112*x + # minimum = -1.1 or 0.8. + # All these minima may be reached from appropriate starting points. + return 3 * x**4 + 0.4 * x**3 - 5.64 * x**2 + 2.112 * x + + x0 = np.array([0.82], dtype='float64') + + results = test_static_graph(func, x0, dtype='float64') + self.assertTrue(np.allclose(0.8, results[2])) + + def test_rosenbrock(self): + # The Rosenbrock function is a standard optimization test case. 
+ a = np.random.random(size=[1]).astype('float32') + minimum = [a.item(), (a**2).item()] + b = np.random.random(size=[1]).astype('float32') + + def func(position): + # f(x, y) = (a - x)^2 + b (y - x^2)^2 + # minimum = (a, a^2) + x, y = position[0], position[1] + c = (a - x)**2 + b * (y - x**2)**2 + # the return cant be np array[1], or in jacobin will cause flat error + return c[0] + + x0 = np.random.random(size=[2]).astype('float32') + + results = test_dynamic_graph(func, x0) + self.assertTrue(np.allclose(minimum, results[2])) + + def test_exception(self): + def func(x): + return paddle.dot(x, x) + + x0 = np.random.random(size=[2]).astype('float32') + H0 = np.array([[2.0, 0.0], [0.0, 0.9]]).astype('float32') + + # test initial_inverse_hessian_estimate is good + results = test_static_graph_H0(func, x0, H0, dtype='float32') + self.assertTrue(np.allclose([0., 0.], results[2])) + self.assertTrue(results[0][0]) + + # test initial_inverse_hessian_estimate is bad + H1 = np.array([[1.0, 2.0], [2.0, 1.0]]).astype('float32') + self.assertRaises(ValueError, test_dynamic_graph, func, x0, H0=H1) + + # test line_search_fn is bad + self.assertRaises( + NotImplementedError, + test_static_graph, + func, + x0, + line_search_fn='other') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lbfgs.py b/python/paddle/fluid/tests/unittests/test_lbfgs.py new file mode 100644 index 00000000000..bb381874760 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lbfgs.py @@ -0,0 +1,164 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F + +from paddle.incubate.optimizer.functional.lbfgs import minimize_lbfgs + +np.random.seed(123) + + +def test_static_graph(func, x0, line_search_fn='strong_wolfe', dtype='float32'): + dimension = x0.shape[0] + paddle.enable_static() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + X = paddle.static.data(name='x', shape=[dimension], dtype=dtype) + Y = minimize_lbfgs(func, X, line_search_fn=line_search_fn, dtype=dtype) + + exe = paddle.static.Executor() + exe.run(startup) + return exe.run(main, feed={'x': x0}, fetch_list=[Y]) + + +def test_static_graph_H0(func, x0, H0, dtype='float32'): + paddle.enable_static() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + X = paddle.static.data(name='x', shape=[x0.shape[0]], dtype=dtype) + H = paddle.static.data( + name='h', shape=[H0.shape[0], H0.shape[1]], dtype=dtype) + Y = minimize_lbfgs( + func, X, initial_inverse_hessian_estimate=H, dtype=dtype) + + exe = paddle.static.Executor() + exe.run(startup) + return exe.run(main, feed={'x': x0, 'h': H0}, fetch_list=[Y]) + + +def test_dynamic_graph(func, + x0, + H0=None, + line_search_fn='strong_wolfe', + dtype='float32'): + paddle.disable_static() + x0 = paddle.to_tensor(x0) + if H0 is not None: + H0 = paddle.to_tensor(H0) + return minimize_lbfgs( + func, + x0, + initial_inverse_hessian_estimate=H0, + line_search_fn=line_search_fn, + dtype=dtype) + + +class TestLbfgs(unittest.TestCase): + def test_quadratic_nd(self): + for dimension in [1, 10]: + minimum = np.random.random(size=[dimension]).astype('float32') + scale = np.exp(np.random.random(size=[dimension]).astype('float32')) + + def func(x): + minimum_ = paddle.assign(minimum) + scale_ = paddle.assign(scale) + return paddle.sum( + paddle.multiply(scale_, (F.square_error_cost(x, minimum_)))) + + x0 = np.random.random(size=[dimension]).astype('float32') + results = test_static_graph(func, x0) + self.assertTrue(np.allclose(minimum, results[2])) + + results = test_dynamic_graph(func, x0) + self.assertTrue(np.allclose(minimum, results[2].numpy())) + + def test_inf_minima(self): + extream_point = np.array([-1, 2]).astype('float32') + + def func(x): + # df = 3(x - 1.01)(x - 0.99) + # f = x^3 - 3x^2 + 3*1.01*0.99x + return x * x * x / 3.0 - ( + extream_point[0] + extream_point[1] + ) * x * x / 2 + extream_point[0] * extream_point[1] * x + + x0 = np.array([-1.7]).astype('float32') + results = test_static_graph(func, x0) + self.assertFalse(results[0][0]) + + def test_multi_minima(self): + def func(x): + # df = 12(x + 1.1)(x - 0.2)(x - 0.8) + # f = 3*x^4+0.4*x^3-5.46*x^2+2.112*x + # minimum = -1.1 or 0.8. + # All these minima may be reached from appropriate starting points. + return 3 * x**4 + 0.4 * x**3 - 5.64 * x**2 + 2.112 * x + + x0 = np.array([0.82], dtype='float64') + + results = test_static_graph(func, x0, dtype='float64') + self.assertTrue(np.allclose(0.8, results[2])) + + def test_rosenbrock(self): + # The Rosenbrock function is a standard optimization test case. 
+ a = np.random.random(size=[1]).astype('float32') + minimum = [a.item(), (a**2).item()] + b = np.random.random(size=[1]).astype('float32') + + def func(position): + # f(x, y) = (a - x)^2 + b (y - x^2)^2 + # minimum = (a, a^2) + x, y = position[0], position[1] + c = (a - x)**2 + b * (y - x**2)**2 + # the return cant be np array[1], or in jacobin will cause flat error + return c[0] + + x0 = np.random.random(size=[2]).astype('float32') + + results = test_dynamic_graph(func, x0) + self.assertTrue(np.allclose(minimum, results[2])) + + def test_exception(self): + def func(x): + return paddle.dot(x, x) + + x0 = np.random.random(size=[2]).astype('float32') + H0 = np.array([[2.0, 0.0], [0.0, 0.9]]).astype('float32') + + # test dtype is not float32 or float64 + x1 = np.random.random(size=[2]).astype('int32') + self.assertRaises( + ValueError, test_static_graph, func, x1, dtype='int32') + + # test initial_inverse_hessian_estimate is good + results = test_static_graph_H0(func, x0, H0, dtype='float32') + self.assertTrue(np.allclose([0., 0.], results[2])) + self.assertTrue(results[0][0]) + + # test initial_inverse_hessian_estimate is bad and float64 + x2 = np.random.random(size=[2]).astype('float64') + H1 = np.array([[1.0, 2.0], [3.0, 1.0]]).astype('float64') + self.assertRaises( + ValueError, test_static_graph_H0, func, x2, H0=H1, dtype='float64') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/incubate/optimizer/__init__.py b/python/paddle/incubate/optimizer/__init__.py index fd5332986ed..f5c24f85896 100644 --- a/python/paddle/incubate/optimizer/__init__.py +++ b/python/paddle/incubate/optimizer/__init__.py @@ -15,5 +15,6 @@ from .lookahead import LookAhead # noqa: F401 from .modelaverage import ModelAverage # noqa: F401 from .distributed_fused_lamb import DistributedFusedLamb # noqa: F401 +from . import functional # noqa: F401 __all__ = [] diff --git a/python/paddle/incubate/optimizer/functional/__init__.py b/python/paddle/incubate/optimizer/functional/__init__.py new file mode 100644 index 00000000000..fc863a923d8 --- /dev/null +++ b/python/paddle/incubate/optimizer/functional/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .bfgs import minimize_bfgs # noqa: F401 +from .lbfgs import minimize_lbfgs # noqa: F401 + +__all__ = ['minimize_bfgs', 'minimize_lbfgs'] diff --git a/python/paddle/incubate/optimizer/functional/bfgs.py b/python/paddle/incubate/optimizer/functional/bfgs.py new file mode 100644 index 00000000000..9afcc2240ae --- /dev/null +++ b/python/paddle/incubate/optimizer/functional/bfgs.py @@ -0,0 +1,195 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from .line_search import strong_wolfe
+from .utils import _value_and_gradient, check_input_type, check_initial_inverse_hessian_estimate
+
+import paddle
+
+
+def minimize_bfgs(objective_func,
+                  initial_position,
+                  max_iters=50,
+                  tolerance_grad=1e-7,
+                  tolerance_change=1e-9,
+                  initial_inverse_hessian_estimate=None,
+                  line_search_fn='strong_wolfe',
+                  max_line_search_iters=50,
+                  initial_step_length=1.0,
+                  dtype='float32',
+                  name=None):
+    r"""
+    Minimizes a differentiable function ``objective_func`` using the BFGS method.
+    BFGS is a quasi-Newton method for solving an unconstrained optimization
+    problem over a differentiable function.
+    It is closely related to the Newton method for minimization. Consider the iterate
+    update formula
+
+    .. math::
+
+        x_{k+1} = x_{k} - H \nabla{f}(x_{k}).
+
+    If :math:`H` is the exact inverse Hessian of :math:`f` at :math:`x_{k}`, this is the Newton method.
+    If :math:`H` is symmetric and positive definite and is used as an approximation of the inverse Hessian,
+    this is a quasi-Newton method. In practice, the approximated inverse Hessians are obtained
+    from the gradients alone, over either the whole search history or part of it;
+    the former gives BFGS.
+
+    Reference:
+        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006.
+        pp140: Algorithm 6.1 (BFGS Method).
+
+    The following summarizes the main logic of the program based on BFGS. Note: _k denotes the value at
+    the k-th iteration and ^T denotes the transpose of a vector or matrix.
+    repeat
+        p_k = - H_k * g_k
+        alpha = strong_wolfe(f, x_k, p_k)
+        x_k+1 = x_k + alpha * p_k
+        s_k = x_k+1 - x_k
+        y_k = g_k+1 - g_k
+        rho_k = 1 / (s_k^T * y_k)
+        V_k^T = I - rho_k * s_k * y_k^T
+        V_k = I - rho_k * y_k * s_k^T
+        H_k+1 = V_k^T * H_k * V_k + rho_k * s_k * s_k^T
+        check_converge
+    end
+
+    Args:
+        objective_func: the objective function to minimize. ``objective_func`` accepts
+            a multivariate input and returns a scalar.
+        initial_position (Tensor): the starting point of the iterates. For methods like Newton and quasi-Newton,
+            the initial trial step length should always be 1.0.
+        max_iters (int): the maximum number of minimization iterations.
+        tolerance_grad (float): terminates if the gradient norm is smaller than this. Currently the inf norm is used.
+        tolerance_change (float): terminates if the change of function value/position/parameter between
+            two iterations is smaller than this value.
+        initial_inverse_hessian_estimate (Tensor): the initial inverse Hessian approximation at initial_position.
+            It must be symmetric and positive definite.
+        line_search_fn (str): indicates which line search method to use; only 'strong_wolfe' is supported right now.
+            'Hager Zhang' may be supported in the future.
+        max_line_search_iters (int): the maximum number of line search iterations.
+        initial_step_length (float): step length used in the first iteration of the line search. Different values of
+            initial_step_length may lead to different optima.
+        dtype ('float32' | 'float64'): In static graph mode, float64 will be converted to float32 due to a paddle.assign limitation.
+
+    Returns:
+        is_converge (bool): Indicates whether the minimum was found within the tolerances.
+ num_func_calls (int): number of objective function called. + position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of + the objective function regrading to the initial position. + objective_value (Tensor): objective function value at the `position`. + objective_gradient (Tensor): objective function gradient at the `position`. + inverse_hessian_estimate (Tensor): the estimate of inverse hessian at the `position`. + + Examples: + .. code-block:: python + + import paddle + + def func(x): + return paddle.dot(x, x) + + x0 = paddle.to_tensor([1.3, 2.7]) + results = paddle.optimizer.functional.minimize_bfgs(func, x0) + print("is_converge: ", results[0]) + print("the minimum of func is: ", results[2]) + # is_converge: is_converge: Tensor(shape=[1], dtype=bool, place=Place(gpu:0), stop_gradient=True, + # [True]) + # the minimum of func is: Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [0., 0.]) + """ + + if dtype not in ['float32', 'float64']: + raise ValueError( + "The dtype must be 'float32' or 'float64', but the specified is {}.". + format(dtype)) + + op_name = 'minimize_bfgs' + check_input_type(initial_position, 'initial_position', op_name) + + I = paddle.eye(initial_position.shape[0], dtype=dtype) + if initial_inverse_hessian_estimate is None: + initial_inverse_hessian_estimate = I + else: + check_input_type(initial_inverse_hessian_estimate, + 'initial_inverse_hessian_estimate', op_name) + check_initial_inverse_hessian_estimate(initial_inverse_hessian_estimate) + + Hk = paddle.assign(initial_inverse_hessian_estimate) + xk = initial_position + + value, g1 = _value_and_gradient(objective_func, xk) + num_func_calls = paddle.full(shape=[1], fill_value=1, dtype='int64') + + # when the dim of x is 1000, it needs more than 30 iters to get all element converge to minimum. + k = paddle.full(shape=[1], fill_value=0, dtype='int64') + done = paddle.full(shape=[1], fill_value=False, dtype='bool') + is_converge = paddle.full(shape=[1], fill_value=False, dtype='bool') + + def cond(k, done, is_converge, num_func_calls, xk, value, g1, Hk): + return (k < max_iters) & ~done + + def body(k, done, is_converge, num_func_calls, xk, value, g1, Hk): + ############# compute pk ############# + pk = -paddle.matmul(Hk, g1) + + ############# compute alpha by line serach ############# + if line_search_fn == 'strong_wolfe': + alpha, value, g2, ls_func_calls = strong_wolfe( + f=objective_func, + xk=xk, + pk=pk, + initial_step_length=initial_step_length, + dtype=dtype) + else: + raise NotImplementedError( + "Currently only support line_search_fn = 'strong_wolfe', but the specified is '{}'". + format(line_search_fn)) + num_func_calls += ls_func_calls + + ############# update Hk ############# + sk = alpha * pk + yk = g2 - g1 + + xk = xk + sk + g1 = g2 + + sk = paddle.unsqueeze(sk, 0) + yk = paddle.unsqueeze(yk, 0) + + rhok_inv = paddle.dot(yk, sk) + rhok = paddle.static.nn.cond( + rhok_inv == 0., lambda: paddle.full(shape=[1], fill_value=1000.0, dtype=dtype), lambda: 1. 
/ rhok_inv)
+
+        Vk_transpose = I - rhok * sk * yk.t()
+        Vk = I - rhok * yk * sk.t()
+        Hk = paddle.matmul(paddle.matmul(Vk_transpose, Hk),
+                           Vk) + rhok * sk * sk.t()
+
+        k += 1
+
+        ############# check convergence #############
+        gnorm = paddle.linalg.norm(g1, p=np.inf)
+        pk_norm = paddle.linalg.norm(pk, p=np.inf)
+        paddle.assign(done | (gnorm < tolerance_grad) |
+                      (pk_norm < tolerance_change), done)
+        paddle.assign(done, is_converge)
+        # when alpha=0, xk cannot change any further.
+        paddle.assign(done | (alpha == 0.), done)
+        return [k, done, is_converge, num_func_calls, xk, value, g1, Hk]
+
+    paddle.static.nn.while_loop(
+        cond=cond,
+        body=body,
+        loop_vars=[k, done, is_converge, num_func_calls, xk, value, g1, Hk])
+    return is_converge, num_func_calls, xk, value, g1, Hk
diff --git a/python/paddle/incubate/optimizer/functional/lbfgs.py b/python/paddle/incubate/optimizer/functional/lbfgs.py
new file mode 100644
index 00000000000..90ae452653a
--- /dev/null
+++ b/python/paddle/incubate/optimizer/functional/lbfgs.py
@@ -0,0 +1,239 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from .line_search import strong_wolfe
+from .utils import _value_and_gradient, check_input_type, check_initial_inverse_hessian_estimate
+
+import paddle
+
+
+def minimize_lbfgs(objective_func,
+                   initial_position,
+                   history_size=100,
+                   max_iters=50,
+                   tolerance_grad=1e-8,
+                   tolerance_change=1e-8,
+                   initial_inverse_hessian_estimate=None,
+                   line_search_fn='strong_wolfe',
+                   max_line_search_iters=50,
+                   initial_step_length=1.0,
+                   dtype='float32',
+                   name=None):
+    r"""Minimizes a differentiable function ``objective_func`` using the L-BFGS method.
+    L-BFGS is similar to BFGS; the only difference is that L-BFGS uses the stored history of
+    s_k, y_k and rho_k, rather than the full matrix H_{k-1}, to compute H_k.
+    Reference:
+        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006.
+        pp179: Algorithm 7.5 (L-BFGS).
+
+    The following summarizes the main logic of the program based on L-BFGS. Note: _k denotes the value
+    at the k-th iteration and ^T denotes the transpose of a vector or matrix.
+    repeat
+        compute p_k by two-loop recursion
+        alpha = strong_wolfe(f, x_k, p_k)
+        x_k+1 = x_k + alpha * p_k
+        s_k = x_k+1 - x_k
+        y_k = g_k+1 - g_k
+        rho_k = 1 / (s_k^T * y_k)
+        update sk_vec, yk_vec, rhok_vec
+        check_converge
+    end
+
+    Args:
+        objective_func: the objective function to minimize. ``objective_func`` accepts
+            a multivariate input and returns a scalar.
+        initial_position (Tensor): the starting point of the iterates. For methods like Newton and quasi-Newton,
+            the initial trial step length should always be 1.0.
+        history_size (Scalar): the number of stored vector pairs {si, yi}.
+        max_iters (Scalar): the maximum number of minimization iterations.
+        tolerance_grad (Scalar): terminates if the gradient norm is smaller than
+            this. Currently the inf norm is used.
+        tolerance_change (Scalar): terminates if the change of function value/position/parameter between
+            two iterations is smaller than this value.
+        initial_inverse_hessian_estimate (Tensor): the initial inverse Hessian approximation.
+        line_search_fn (str): indicates which line search method to use; only 'strong_wolfe' is supported right now.
+            'Hager Zhang' may be supported in the future.
+        max_line_search_iters (Scalar): the maximum number of line search iterations.
+        initial_step_length: step length used in the first iteration of the line search. Different values of
+            initial_step_length may lead to different optima.
+        dtype ('float32' | 'float64'): the data type to be used.
+
+    Returns:
+        is_converge (bool): Indicates whether the minimum was found within the tolerances.
+        num_func_calls (int): number of objective function evaluations.
+        position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of
+            the objective function with respect to the initial position.
+        objective_value (Tensor): objective function value at `position`.
+        objective_gradient (Tensor): objective function gradient at `position`.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            def func(x):
+                return paddle.dot(x, x)
+
+            x0 = paddle.to_tensor([1.3, 2.7])
+            results = paddle.incubate.optimizer.functional.minimize_lbfgs(func, x0)
+            print("is_converge: ", results[0])
+            print("the minimum of func is: ", results[2])
+            # is_converge:  Tensor(shape=[1], dtype=bool, place=Place(gpu:0), stop_gradient=True,
+            #        [True])
+            # the minimum of func is:  Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+            #        [0., 0.])
+    """
+    if dtype not in ['float32', 'float64']:
+        raise ValueError(
+            "The dtype must be 'float32' or 'float64', but the specified is {}.".
+            format(dtype))
+
+    op_name = 'minimize_lbfgs'
+    check_input_type(initial_position, 'initial_position', op_name)
+
+    if initial_inverse_hessian_estimate is None:
+        H0 = paddle.eye(initial_position.shape[0], dtype=dtype)
+    else:
+        check_input_type(initial_inverse_hessian_estimate,
+                         'initial_inverse_hessian_estimate', op_name)
+        check_initial_inverse_hessian_estimate(initial_inverse_hessian_estimate)
+        H0 = initial_inverse_hessian_estimate
+
+    xk = initial_position
+    value, g1 = _value_and_gradient(objective_func, xk)
+
+    k = paddle.full(shape=[1], fill_value=0, dtype='int64')
+    done = paddle.full(shape=[1], fill_value=False, dtype='bool')
+    is_converge = paddle.full(shape=[1], fill_value=False, dtype='bool')
+    num_func_calls = paddle.full(shape=[1], fill_value=1, dtype='int64')
+
+    history_size = paddle.full(
+        shape=[1], fill_value=history_size, dtype='int64')
+    head = paddle.full(shape=[1], fill_value=1, dtype='int64')
+    tail = paddle.full(shape=[1], fill_value=0, dtype='int64')
+
+    shape = initial_position.shape[0]
+    # Use a fixed-length tensor as the storage rather than a flexible tensor array, because in static mode
+    # a tensor array produces tensors of shape [-1], which causes an error when calling jacobian. Since append
+    # and pop cannot be used this way, head and tail record where the newest and the oldest entries are.
+    # In effect, this implements a circular buffer on top of a fixed-size array.
+ sk_vec = paddle.zeros((history_size + 1, shape), dtype=dtype) + yk_vec = paddle.zeros((history_size + 1, shape), dtype=dtype) + rhok_vec = paddle.zeros((history_size + 1, 1), dtype=dtype) + ai_vec = paddle.zeros((history_size + 1, 1), dtype=dtype) + + def cond(k, done, is_converge, num_func_calls, value, xk, g1, sk_vec, + yk_vec, rhok_vec, head, tail): + return (k < max_iters) & ~done + + def body(k, done, is_converge, num_func_calls, value, xk, g1, sk_vec, + yk_vec, rhok_vec, head, tail): + # use assign to cut off the relevance between g1 and q, or they will change together. + + ############# compute p_k by two-loop recursion ############# + q = paddle.assign(g1) + # In a array circle, the index may out of range, so must use mod. + i = paddle.full( + shape=[1], fill_value=(head - 1).mod(history_size), dtype='int64') + + def cond(i, q): + return i != tail + + def body(i, q): + ai_vec[i] = rhok_vec[i] * paddle.dot(sk_vec[i], q) + q = q - ai_vec[i] * yk_vec[i] + i = (i - 1).mod(history_size) + return i, q + + paddle.static.nn.while_loop(cond=cond, body=body, loop_vars=[i, q]) + + r = paddle.matmul(H0, q) + + i = paddle.full(shape=[1], fill_value=tail + 1, dtype='int64') + + def cond(i, r): + return i != head + + def body(i, r): + beta = rhok_vec[i] * paddle.dot(yk_vec[i], r) + r = r + sk_vec[i] * (ai_vec[i] - beta) + i = (i + 1).mod(history_size) + return i, r + + paddle.static.nn.while_loop(cond=cond, body=body, loop_vars=[i, r]) + + pk = -r + + ############# compute alpha by line serach ############# + if line_search_fn == 'strong_wolfe': + alpha, value, g2, ls_func_calls = strong_wolfe( + f=objective_func, + xk=xk, + pk=pk, + initial_step_length=initial_step_length, + dtype=dtype) + else: + raise NotImplementedError( + "Currently only support line_search_fn = 'strong_wolfe', but the specified is '{}'". + format(line_search_fn)) + paddle.assign(num_func_calls + ls_func_calls, num_func_calls) + + ############# update sk_vec, yk_vec, rhok_vec ############# + sk = alpha * pk + yk = g2 - g1 + + rhok_inv = paddle.dot(yk, sk) + rhok = paddle.static.nn.cond( + rhok_inv == 0., lambda: paddle.full(shape=[1], fill_value=1000.0, dtype=dtype), lambda: 1. / rhok_inv) + + sk_vec[head] = sk + yk_vec[head] = yk + rhok_vec[head] = rhok + head = (head + 1) % history_size + + def true_fn(tail): + paddle.assign(tail + 1, tail) + + # when array is full, the tail should move forward too. + paddle.static.nn.cond(head == tail, lambda: true_fn(tail), None) + + xk = xk + sk + g1 = g2 + k += 1 + + ############# check convergence ############# + gnorm = paddle.linalg.norm(g1, p=np.inf) + pk_norm = paddle.linalg.norm(pk, p=np.inf) + paddle.assign(done | (gnorm < tolerance_grad) | + (pk_norm < tolerance_change), done) + paddle.assign(done, is_converge) + # when alpha=0, there is no chance to get xk change. + paddle.assign(done | (alpha == 0.), done) + + return [ + k, done, is_converge, num_func_calls, value, xk, g1, sk_vec, yk_vec, + rhok_vec, head, tail + ] + + paddle.static.nn.while_loop( + cond=cond, + body=body, + loop_vars=[ + k, done, is_converge, num_func_calls, value, xk, g1, sk_vec, yk_vec, + rhok_vec, head, tail + ]) + return is_converge, num_func_calls, xk, value, g1 diff --git a/python/paddle/incubate/optimizer/functional/line_search.py b/python/paddle/incubate/optimizer/functional/line_search.py new file mode 100644 index 00000000000..d42732e605e --- /dev/null +++ b/python/paddle/incubate/optimizer/functional/line_search.py @@ -0,0 +1,297 @@ +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .utils import _value_and_gradient
+import paddle
+
+
+def cubic_interpolation_(x1, f1, g1, x2, f2, g2):
+    r"""Cubic interpolation between (x1, f1, g1) and (x2, f2, g2).
+        Uses the two points and their gradients to determine a cubic polynomial and returns the
+        minimum point of that cubic on the interval between them.
+
+    Reference:
+        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006.
+        pp59: formula 3.59
+
+    Args:
+        x1, f1, g1: position, value and gradient of the first point.
+        x2, f2, g2: position, value and gradient of the second point.
+    Returns:
+        min_pos: the minimum point of the interpolating cubic between the specified points.
+    """
+    xmin, xmax = paddle.static.nn.cond(x1 <= x2, lambda: (x1, x2),
+                                       lambda: (x2, x1))
+    d1 = g1 + g2 - 3 * (f1 - f2) / (x1 - x2)
+    d2_square = d1**2 - g1 * g2
+
+    def true_func1():
+        d2 = d2_square.sqrt()
+
+        def true_fn2():
+            return x2 - (x2 - x1) * ((g2 + d2 - d1) / (g2 - g1 + 2 * d2))
+
+        def false_fn2():
+            return x1 - (x1 - x2) * ((g1 + d2 - d1) / (g1 - g2 + 2 * d2))
+
+        pred = paddle.less_equal(x=x1, y=x2)
+        min_pos = paddle.static.nn.cond(pred, true_fn2, false_fn2)
+
+        return paddle.minimum(paddle.maximum(min_pos, xmin), xmax)
+
+    def false_func1():
+        return (xmin + xmax) / 2.
+
+    min_pos = paddle.static.nn.cond(d2_square >= 0., true_func1, false_func1)
+    return min_pos
+
+
+def strong_wolfe(f,
+                 xk,
+                 pk,
+                 max_iters=20,
+                 tolerance_change=1e-8,
+                 initial_step_length=1.0,
+                 c1=1e-4,
+                 c2=0.9,
+                 alpha_max=10,
+                 dtype='float32'):
+    r"""Implements a line search algorithm that satisfies the strong Wolfe conditions, using double zoom.
+
+    Reference:
+        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006.
+        pp60: Algorithm 3.5 (Line Search Algorithm).
+
+    Args:
+        f: the objective function to minimize. ``f`` accepts a multivariate input and returns a scalar.
+        xk (Tensor): the starting point of the iterates.
+        pk (Tensor): the search direction.
+        max_iters (Scalar): the maximum number of iterations.
+        tolerance_change (Scalar): terminates if the change of function value/position/parameter between
+            two iterations is smaller than this value.
+        initial_step_length (Scalar): step length used in the first iteration.
+        c1 (Scalar): parameter for the sufficient decrease condition.
+        c2 (Scalar): parameter for the curvature condition.
+        alpha_max (float): maximum step length.
+        dtype ('float32' | 'float64'): the data type to be used.
+
+    Returns:
+        a_star (Tensor): the optimal step length, or 0. if the line search did not converge.
+        phi_star (Tensor): phi at a_star.
+        derf_star (Tensor): the gradient of `f` at `xk + a_star * pk`, reused by BFGS/L-BFGS to avoid recomputation.
+        num_func_calls (int): number of objective function evaluations during the line search.
+
+    The following summarizes the essentials of the strong Wolfe line search algorithm.
+    Some notations used in the description:
+
+    - `f` denotes the objective function.
+ - `phi` is a function of step size alpha, restricting `f` on a line. + + phi = f(xk + a * pk), + where xk is the position of k'th iterate, pk is the line search direction(decent direction), + and a is the step size. + - a : substitute of alpha + - a1 is a of last iteration, which is alpha_(i-1). + - a2 is a of current iteration, which is alpha_i. + - a_lo is a in left position when calls zoom, which is alpha_low. + - a_hi is a in right position when calls zoom, which is alpha_high. + + Line Search Algorithm: + repeat + Compute phi(a2) and derphi(a2). + 1. If phi(a2) > phi(0) + c_1 * a2 * phi'(0) or [phi(a2) >= phi(a1) and i > 1], + a_star= zoom(a1, a2) and stop; + + 2. If |phi'(a2)| <= -c_2 * phi'(0), + a_star= a2 and stop; + + 3. If phi'(a2) >= 0, + a_star= zoom(a2, a1) and stop; + + a1 = a2 + a2 = min(2 * a2, a2) + i = i + 1 + end(repeat) + + zoom(a_lo, a_hi) Algorithm: + repeat + aj = cubic_interpolation(a_lo, a_hi) + Compute phi(aj) and derphi(aj). + 1. If phi(aj) > phi(0) + c_1 * aj * phi'(0) or phi(aj) >= phi(a_lo), + then a_hi <- aj; + 2. + 2.1. If |phi'(aj)| <= -c_2 * phi'(0), then a_star= a2 and stop; + + 2.2. If phi'(aj) * (a2 - a1) >= 0, then a_hi = a_lo + + a_lo = aj; + end(repeat) + """ + + def phi_and_derphi(a): + r"""Compute function value and derivative of phi at a. + phi = f(xk + a * pk) + phi'(a) = f'(xk + a * pk) * pk + """ + phi_value, f_grad = _value_and_gradient(f, xk + a * pk) + phi_grad = paddle.dot(f_grad, pk) + # return f_grad to be used in bfgs/l-bfgs to compute yk to avoid computint repeatly. + return phi_value, f_grad, phi_grad + + def zoom(a_lo, phi_lo, derphi_lo, derf_lo, a_hi, phi_hi, derphi_hi, phi_0, + derphi_0): + # find the exact a from the bracket [a_lo, a_hi] + max_zoom_iters = max_iters + j = paddle.full(shape=[1], fill_value=0, dtype='int64') + done_zoom = paddle.full(shape=[1], fill_value=False, dtype='bool') + + def cond_zoom(j, done_zoom, a_lo, phi_lo, derphi_lo, derf_lo, a_hi, + phi_hi, derphi_hi): + pred = paddle.abs(a_hi - a_lo) < tolerance_change + paddle.assign(done_zoom | pred, done_zoom) + return (j < max_zoom_iters) & ~done_zoom + + def body_zoom(j, done_zoom, a_lo, phi_lo, derphi_lo, derf_lo, a_hi, + phi_hi, derphi_hi): + aj = cubic_interpolation_(a_lo, phi_lo, derphi_lo, a_hi, phi_hi, + derphi_hi) # 21 + min_change = 0.1 * paddle.abs(a_hi - a_lo) + pred = paddle.minimum( + paddle.abs(aj - a_lo), paddle.abs(aj - a_hi)) < min_change + aj = paddle.static.nn.cond(pred, lambda: 0.5 * (a_lo + a_hi), + lambda: aj) + + phi_j, derf_j, derphi_j = phi_and_derphi(aj) + + def true_fn(): + # use assing to modify the variable in-place + paddle.assign(aj, a_hi) + paddle.assign(phi_j, phi_hi) + paddle.assign(derphi_j, derphi_hi) + + def false_fn(a_lo, done_zoom): + pred3 = (paddle.abs(derphi_j) <= -c2 * derphi_0) + paddle.assign(pred3, done_zoom) + + def true_fn(): + paddle.assign(a_lo, a_hi) + paddle.assign(phi_lo, phi_hi) + paddle.assign(derphi_lo, derphi_hi) + + pred4 = ~done_zoom & (derphi_j * (a_hi - a_lo) >= 0) + paddle.static.nn.cond(pred4, true_fn, None) + + paddle.assign(aj, a_lo) + paddle.assign(phi_j, phi_lo) + paddle.assign(derphi_j, derphi_lo) + paddle.assign(derf_j, derf_lo) + + pred2 = (phi_j > phi_0 + c1 * aj * derphi_0) | (phi_j >= phi_lo) + paddle.static.nn.cond(pred2, true_fn, + lambda: false_fn(a_lo, done_zoom)) + j = paddle.static.nn.cond(done_zoom, lambda: j, lambda: j + 1) + return [ + j, done_zoom, a_lo, phi_lo, derphi_lo, derf_lo, a_hi, phi_hi, + derphi_hi + ] + + paddle.static.nn.while_loop( + cond=cond_zoom, + body=body_zoom, 
+ loop_vars=[ + j, done_zoom, a_lo, phi_lo, derphi_lo, derf_lo, a_hi, phi_hi, + derphi_hi + ]) + # j is the number of object function called in zoom. + return j + + alpha_max = paddle.full(shape=[1], fill_value=alpha_max, dtype=dtype) + + a1 = paddle.full(shape=[1], fill_value=0., dtype=dtype) + a2 = paddle.full(shape=[1], fill_value=initial_step_length, dtype=dtype) + + phi_1, derf_1, derphi_1 = phi_and_derphi(a1) + # use assign to cut off binding between two variables + phi_0 = paddle.assign(phi_1) + derphi_0 = paddle.assign(derphi_1) + ls_func_calls = paddle.full(shape=[1], fill_value=1, dtype='int64') + + # If not found the a_star, will return alpha=0 and f(xk), derf(xk) + a_star = paddle.full(shape=[1], fill_value=0, dtype=dtype) + phi_star = paddle.assign(phi_1) + derf_star = paddle.assign(derf_1) + + i = paddle.full(shape=[1], fill_value=0, dtype='int64') + done = paddle.full(shape=[1], fill_value=False, dtype='bool') + + def cond(i, ls_func_calls, a1, a2, phi_1, derf_1, done): + return (i < max_iters) & ~done + + def body(i, ls_func_calls, a1, a2, phi_1, derf_1, done): + phi_2, derf_2, derphi_2 = phi_and_derphi(a2) + paddle.assign(ls_func_calls + 1, ls_func_calls) + paddle.assign(done | paddle.any(paddle.isinf(phi_2)), done) + + def true_fn1(): + j = zoom(a1, phi_1, derphi_1, derf_1, a2, phi_2, derphi_2, phi_0, + derphi_0) + paddle.assign(a1, a_star) + paddle.assign(phi_1, phi_star) + paddle.assign(derf_1, derf_star) + paddle.assign(ls_func_calls + j, ls_func_calls) + + pred1 = ~done & ((phi_2 > phi_0 + c1 * a2 * derphi_0) | ( + (phi_2 >= phi_0) & (i > 1))) + paddle.assign(done | pred1, done) + paddle.static.nn.cond(pred1, true_fn1, None) + + def true_fn2(): + paddle.assign(a2, a_star) + paddle.assign(phi_2, phi_star) + paddle.assign(derf_2, derf_star) + + pred2 = ~done & (paddle.abs(derphi_2) <= -c2 * derphi_0) + paddle.assign(done | pred2, done) + paddle.static.nn.cond(pred2, true_fn2, None) + + def true_fn3(): + j = zoom(a2, phi_2, derphi_2, derf_2, a1, phi_1, derphi_1, phi_0, + derphi_0) + paddle.assign(a2, a_star) + paddle.assign(phi_2, phi_star) + paddle.assign(derf_2, derf_star) + paddle.assign(ls_func_calls + j, ls_func_calls) + + pred3 = ~done & (derphi_2 >= 0) + paddle.assign(done | pred3, done) + paddle.static.nn.cond(pred3, true_fn3, None) + + def false_fn(): + paddle.assign(a2, a1) + paddle.assign(phi_2, phi_1) + paddle.assign(derf_2, derf_1) + paddle.assign(paddle.minimum(2 * a2, alpha_max), a2) + paddle.assign(i + 1, i) + + paddle.static.nn.cond(done, None, false_fn) + return [i, ls_func_calls, a1, a2, phi_1, derf_1, done] + + paddle.static.nn.while_loop( + cond=cond, + body=body, + loop_vars=[i, ls_func_calls, a1, a2, phi_1, derf_1, done]) + + return a_star, phi_star, derf_star, ls_func_calls diff --git a/python/paddle/incubate/optimizer/functional/utils.py b/python/paddle/incubate/optimizer/functional/utils.py new file mode 100644 index 00000000000..c197f8a1acb --- /dev/null +++ b/python/paddle/incubate/optimizer/functional/utils.py @@ -0,0 +1,96 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle.autograd.functional import vjp, Jacobian
+from paddle.fluid.framework import Variable
+from paddle.fluid.data_feeder import check_type, check_dtype
+
+
+def check_input_type(input, name, op_name):
+    r"""Check whether the input is a Tensor (dynamic graph) or a Variable (static graph)."""
+    if paddle.in_dynamic_mode():
+        if not isinstance(input, paddle.Tensor):
+            raise ValueError("The input: {} must be a Tensor.".format(input))
+    else:
+        check_type(input, name, Variable, op_name)
+
+
+def check_initial_inverse_hessian_estimate(H0):
+    r"""Check whether the specified initial_inverse_hessian_estimate is symmetric and positive definite.
+        Raises an error when the precondition is not met.
+
+    Note:
+        In static graph mode an error cannot be raised directly, so py_func wraps raise_func as an op
+        and paddle.static.nn.cond decides whether to add that op to the network.
+        Cholesky is the fast way to check positive definiteness, but in static graph mode the exception
+        cannot be caught and turned into a ValueError, so eigvals is used instead of cholesky there.
+    """
+    is_symmetric = paddle.all(paddle.equal(H0, H0.t()))
+
+    def raise_func():
+        raise ValueError(
+            "The initial_inverse_hessian_estimate should be symmetric and positive definite, but the specified is not."
+        )
+
+    if paddle.in_dynamic_mode():
+        if not is_symmetric:
+            raise_func()
+        try:
+            paddle.linalg.cholesky(H0)
+        except RuntimeError as error:
+            raise_func()
+    else:
+
+        def create_tmp_var(program, name, dtype, shape):
+            return program.current_block().create_var(
+                name=name, dtype=dtype, shape=shape)
+
+        out_var = create_tmp_var(
+            paddle.static.default_main_program(),
+            name='output',
+            dtype='float32',
+            shape=[-1])
+
+        def false_fn():
+            paddle.static.nn.py_func(
+                func=raise_func, x=is_symmetric, out=out_var)
+
+        paddle.static.nn.cond(is_symmetric, None, false_fn)
+        # eigvals only supports CPU
+        paddle.set_device("cpu")
+        eigvals = paddle.linalg.eigvals(H0)
+        is_positive = paddle.logical_and(
+            paddle.all(eigvals.real() > 0.), paddle.all(eigvals.imag() == 0.))
+        paddle.static.nn.cond(is_positive, None, false_fn)
+
+
+def _value_and_gradient(f, x, v=None):
+    r"""Compute the function value and gradient of f at x.
+
+    Args:
+        f (Callable): the objective function.
+        x (Tensor): the input tensor.
+        v (Tensor, optional): the cotangent vector forwarded to vjp in dynamic graph mode.
+    Returns:
+        value: a tensor that holds the function value.
+        gradient: a tensor that holds the function gradients.
+    """
+    if paddle.in_dynamic_mode():
+        value, gradient = vjp(f, x, v=v)
+        gradient = gradient[0]
+    else:
+        JJ = Jacobian(f, x)
+        gradient = JJ[:][0]
+        value = f(x)
+    return value, gradient
diff --git a/python/setup.py.in b/python/setup.py.in
index 7c1232c1d41..3e59e22fcbc 100755
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -362,6 +362,7 @@ packages=['paddle',
              'paddle.incubate.nn',
              'paddle.incubate.nn.functional',
              'paddle.incubate.nn.layer',
+             'paddle.incubate.optimizer.functional',
              'paddle.io',
              'paddle.optimizer',
              'paddle.nn',
-- GitLab
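
The BFGS update summarized in the `minimize_bfgs` docstring above can be easier to follow outside of Paddle's static-graph control flow. The following is a minimal NumPy sketch of one BFGS iteration, separate from the patch itself: a fixed step size stands in for the strong Wolfe line search, and the helper names `bfgs_update` and `bfgs_minimize` are illustrative only, not part of the API added here.

import numpy as np

def bfgs_update(H, s, y):
    # H_{k+1} = V_k^T H_k V_k + rho_k s_k s_k^T, with V_k = I - rho_k y_k s_k^T
    rho = 1.0 / np.dot(y, s)
    I = np.eye(len(s))
    V = I - rho * np.outer(y, s)
    return V.T @ H @ V + rho * np.outer(s, s)

def bfgs_minimize(grad, x0, alpha=0.1, max_iters=500, tol=1e-8):
    # Fixed-step BFGS loop for illustration; the patch uses a strong Wolfe line search instead.
    x = np.asarray(x0, dtype=float)
    H = np.eye(len(x))                  # initial inverse Hessian estimate
    g = grad(x)
    for _ in range(max_iters):
        if np.max(np.abs(g)) < tol:     # inf-norm gradient test, as in the patch
            break
        p = -H @ g                      # search direction p_k = -H_k g_k
        x_new = x + alpha * p
        g_new = grad(x_new)
        s, y = x_new - x, g_new - g
        if np.dot(y, s) > 1e-12:        # skip the update when the curvature condition fails
            H = bfgs_update(H, s, y)
        x, g = x_new, g_new
    return x

# Example: f(x) = x . x has gradient 2x and its minimizer is the origin.
print(bfgs_minimize(lambda x: 2.0 * x, np.array([1.3, 2.7])))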
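
The `minimize_lbfgs` docstring refers to computing p_k "by two-loop recursion" (Nocedal and Wright, Algorithm 7.4), which the patch implements with nested `paddle.static.nn.while_loop` calls over fixed-size circular buffers. As a reading aid only, here is the same recursion sketched in NumPy over plain Python lists of stored (s, y) pairs; the function name and the list-based storage are assumptions made for clarity, not the patch's data layout.

import numpy as np

def two_loop_recursion(g, s_list, y_list):
    """Return p = -H_k g from the stored (s, y) pairs, oldest first."""
    q = np.array(g, dtype=float)
    rho = [1.0 / np.dot(y, s) for s, y in zip(s_list, y_list)]
    alpha = [0.0] * len(s_list)

    # First loop: walk from the newest pair back to the oldest.
    for i in reversed(range(len(s_list))):
        alpha[i] = rho[i] * np.dot(s_list[i], q)
        q -= alpha[i] * y_list[i]

    # Apply the initial inverse Hessian estimate H_0 (identity here, matching the patch's default).
    r = q

    # Second loop: walk from the oldest pair forward to the newest.
    for i in range(len(s_list)):
        beta = rho[i] * np.dot(y_list[i], r)
        r += s_list[i] * (alpha[i] - beta)

    return -r   # p_k = -H_k * g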
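
Similarly, the `cubic_interpolation_` helper in line_search.py implements formula 3.59 of Nocedal and Wright through nested `paddle.static.nn.cond` branches. The short NumPy transcription below is offered only to make those branches easier to check against the book; `cubic_minimum` is an illustrative name, and the midpoint fallback mirrors what the patch does when the radicand is negative.

import numpy as np

def cubic_minimum(x1, f1, g1, x2, f2, g2):
    # Minimizer of the cubic interpolant of (x1, f1, g1) and (x2, f2, g2),
    # clipped to the interval [min(x1, x2), max(x1, x2)] (Nocedal & Wright, formula 3.59).
    lo, hi = min(x1, x2), max(x1, x2)
    d1 = g1 + g2 - 3.0 * (f1 - f2) / (x1 - x2)
    radicand = d1**2 - g1 * g2
    if radicand < 0.0:
        # No real critical point: fall back to the midpoint, as the patch does.
        return 0.5 * (lo + hi)
    d2 = np.sign(x2 - x1) * np.sqrt(radicand)
    x_min = x2 - (x2 - x1) * (g2 + d2 - d1) / (g2 - g1 + 2.0 * d2)
    return float(np.clip(x_min, lo, hi))

# Example: interpolating f(x) = x^2 at x=0 and x=2 recovers the true minimizer 0.
print(cubic_minimum(0.0, 0.0, 0.0, 2.0, 4.0, 4.0))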