# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

import paddle

from .line_search import strong_wolfe
from .utils import (
    _value_and_gradient,
    check_initial_inverse_hessian_estimate,
    check_input_type,
)


def minimize_lbfgs(
    objective_func,
    initial_position,
    history_size=100,
    max_iters=50,
    tolerance_grad=1e-8,
    tolerance_change=1e-8,
    initial_inverse_hessian_estimate=None,
    line_search_fn='strong_wolfe',
    max_line_search_iters=50,
    initial_step_length=1.0,
    dtype='float32',
    name=None,
):
    r"""
    Minimizes a differentiable function ``objective_func`` using the L-BFGS method.
    L-BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function.
    A closely related method is Newton's method for minimization. Consider the iterate update formula:

    .. math::
        x_{k+1} = x_{k} - H_k \nabla{f_k}

    If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method.
    If :math:`H_k` is a symmetric positive definite matrix used as an approximation of the inverse Hessian, then
    it's a quasi-Newton method. In practice, the approximate inverse Hessians are obtained
    by only using the gradients, over either the whole or part of the search
    history; the former is BFGS, the latter is L-BFGS.
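
    Concretely, L-BFGS stores only the most recent ``history_size`` update pairs and applies the approximate
    inverse Hessian implicitly through them, where

    .. math::
        s_k = x_{k+1} - x_{k}, \quad y_k = \nabla{f_{k+1}} - \nabla{f_{k}}, \quad \rho_k = \frac{1}{y_k^T s_k}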

    Reference:
        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS).

    Args:
        objective_func: the objective function to minimize. ``objective_func`` accepts a 1D Tensor and returns a scalar.
        initial_position (Tensor): the starting point of the iterates, with the same shape as the input of ``objective_func`` .
        history_size (int, optional): the number of stored vector pairs {si, yi}. Default value: 100.
        max_iters (int, optional): the maximum number of minimization iterations. Default value: 50.
        tolerance_grad (float, optional): terminates if the gradient norm is smaller than this value. Currently the infinity norm is used. Default value: 1e-8.
        tolerance_change (float, optional): terminates if the change of the function value/position/parameter between two iterations is smaller than this value. Default value: 1e-8.
        initial_inverse_hessian_estimate (Tensor, optional): the initial inverse Hessian approximation at initial_position. It must be symmetric and positive definite. If not given, an identity matrix of order N will be used, where N is the size of ``initial_position`` . Default value: None.
        line_search_fn (str, optional): indicates which line search method to use; only 'strong_wolfe' is supported right now. 'Hager Zhang' may be supported in the future. Default value: 'strong_wolfe'.
        max_line_search_iters (int, optional): the maximum number of line search iterations. Default value: 50.
        initial_step_length (float, optional): the step length used in the first iteration of the line search. A different initial_step_length may lead to a different result. For methods like Newton and quasi-Newton the initial trial step length should always be 1.0. Default value: 1.0.
        dtype ('float32' | 'float64', optional): the data type used in the algorithm; the data type of the input parameters must be consistent with it. Default value: 'float32'.
        name (str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default value: None.

    Returns:
        output(tuple):

            - is_converge (bool): indicates whether the minimum was found within the tolerance.
            - num_func_calls (int): number of objective function calls.
            - position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of the objective function reached from the initial position.
            - objective_value (Tensor): objective function value at `position`.
            - objective_gradient (Tensor): objective function gradient at `position`.

    Examples:
        .. code-block:: python

            import paddle

            def func(x):
                return paddle.dot(x, x)

            x0 = paddle.to_tensor([1.3, 2.7])
            results = paddle.incubate.optimizer.functional.minimize_lbfgs(func, x0)
            print("is_converge: ", results[0])
            print("the minimum of func is: ", results[2])
            # is_converge:  Tensor(shape=[1], dtype=bool, place=Place(gpu:0), stop_gradient=True,
            #        [True])
            # the minimum of func is:  Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
            #        [0., 0.])
    """
    if dtype not in ['float32', 'float64']:
        raise ValueError(
            "The dtype must be 'float32' or 'float64', but the specified is {}.".format(
                dtype
            )
        )

    op_name = 'minimize_lbfgs'
    check_input_type(initial_position, 'initial_position', op_name)

    if initial_inverse_hessian_estimate is None:
        H0 = paddle.eye(initial_position.shape[0], dtype=dtype)
    else:
        check_input_type(
            initial_inverse_hessian_estimate,
            'initial_inverse_hessian_estimate',
            op_name,
        )
        check_initial_inverse_hessian_estimate(initial_inverse_hessian_estimate)
        H0 = initial_inverse_hessian_estimate

    # Use detach and assign to create a new tensor rather than `=`, otherwise xk would share memory and gradients with initial_position.
    xk = paddle.assign(initial_position.detach())
    value, g1 = _value_and_gradient(objective_func, xk)

    k = paddle.full(shape=[1], fill_value=0, dtype='int64')
    done = paddle.full(shape=[1], fill_value=False, dtype='bool')
    is_converge = paddle.full(shape=[1], fill_value=False, dtype='bool')
    num_func_calls = paddle.full(shape=[1], fill_value=1, dtype='int64')

    history_size = paddle.full(shape=[], fill_value=history_size, dtype='int64')
    head = paddle.full(shape=[1], fill_value=1, dtype='int64')
    tail = paddle.full(shape=[1], fill_value=0, dtype='int64')

    shape = initial_position.shape[0]
    # Use a fixed-length tensor as the history array rather than a flexible tensor array, because in static
    # graph mode a tensor array produces tensors of shape [-1], which causes an error when calling jacobian.
    # Since append and pop are unavailable on a fixed-length tensor, head and tail record where the newest
    # and the oldest entries are; together they implement a circular buffer on top of the array.
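    # head points at the slot where the next (newest) pair will be written; tail is a sentinel one slot
    # before the oldest stored pair, so the two-loop recursion below stops once its index reaches tail.
    # When head catches up with tail, the buffer is full and tail is advanced to drop the oldest pair.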
    sk_vec = paddle.zeros((history_size + 1, shape), dtype=dtype)
    yk_vec = paddle.zeros((history_size + 1, shape), dtype=dtype)
    rhok_vec = paddle.zeros((history_size + 1, 1), dtype=dtype)
    ai_vec = paddle.zeros((history_size + 1, 1), dtype=dtype)

    def cond(
        k,
        done,
        is_converge,
        num_func_calls,
        value,
        xk,
        g1,
        sk_vec,
        yk_vec,
        rhok_vec,
        head,
        tail,
    ):
        return (k < max_iters) & ~done

    def body(
        k,
        done,
        is_converge,
        num_func_calls,
        value,
        xk,
        g1,
        sk_vec,
        yk_vec,
        rhok_vec,
        head,
        tail,
    ):
        # Use assign to break the link between g1 and q; otherwise they would change together.

        # --------------   compute p_k by two-loop recursion    -------------- #
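        # Two-loop recursion (Nocedal & Wright, Algorithm 7.4): the first loop walks the stored pairs
        # from the newest back towards the oldest, computing a_i = rho_i * s_i^T q and q <- q - a_i * y_i;
        # after rescaling with H0, the second loop walks forward again, computing
        # beta = rho_i * y_i^T r and r <- r + s_i * (a_i - beta). The final r approximates
        # H_k * grad(f_k), so the search direction is p_k = -r.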
        q = paddle.assign(g1)
        # The history array is circular, so indices must be wrapped with mod to stay in range.
        i = paddle.full(
            shape=[], fill_value=(head - 1).mod(history_size), dtype='int64'
        )

        def cond(i, q):
            return i != tail

        def body(i, q):
            ai_vec[i] = rhok_vec[i] * paddle.dot(sk_vec[i], q)
            q = q - ai_vec[i] * yk_vec[i]
            i = (i - 1).mod(history_size)
            return i, q

        paddle.static.nn.while_loop(cond=cond, body=body, loop_vars=[i, q])

        r = paddle.matmul(H0, q)

        i = paddle.full(shape=[], fill_value=tail + 1, dtype='int64')

        def cond(i, r):
            return i != head

        def body(i, r):
            beta = rhok_vec[i] * paddle.dot(yk_vec[i], r)
            r = r + sk_vec[i] * (ai_vec[i] - beta)
            i = (i + 1).mod(history_size)
            return i, r

        paddle.static.nn.while_loop(cond=cond, body=body, loop_vars=[i, r])

        pk = -r

        # --------------   compute alpha by line search    -------------- #
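        # strong_wolfe searches along pk for a step length alpha satisfying the strong Wolfe conditions
        # (sufficient decrease of f and a curvature condition bounding |grad f(xk + alpha*pk) . pk|),
        # and also returns the function value and gradient at the new point plus the number of
        # function evaluations it used.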
        if line_search_fn == 'strong_wolfe':
            alpha, value, g2, ls_func_calls = strong_wolfe(
                f=objective_func,
                xk=xk,
                pk=pk,
                initial_step_length=initial_step_length,
                dtype=dtype,
            )
        else:
            raise NotImplementedError(
                "Currently only support line_search_fn = 'strong_wolfe', but the specified is '{}'".format(
                    line_search_fn
                )
            )
        paddle.assign(num_func_calls + ls_func_calls, num_func_calls)

        # --------------   update sk_vec, yk_vec, rhok_vec    -------------- #
        sk = alpha * pk
        yk = g2 - g1

        rhok_inv = paddle.dot(yk, sk)
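        # rho_k = 1 / (y_k^T s_k) weights the stored pair in the two-loop recursion; if the product is
        # exactly zero the reciprocal is undefined, so a large constant is substituted below.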
        rhok = paddle.static.nn.cond(
            rhok_inv == 0.0,
            lambda: paddle.full(shape=[1], fill_value=1000.0, dtype=dtype),
            lambda: 1.0 / rhok_inv,
        )

        sk_vec[head] = sk
        yk_vec[head] = yk
        rhok_vec[head] = rhok
        head = (head + 1) % history_size

        def true_fn(tail):
            paddle.assign(tail + 1, tail)

        # When the array is full, the tail must move forward as well.
        paddle.static.nn.cond(head == tail, lambda: true_fn(tail), None)

        xk = xk + sk
        g1 = g2
        k += 1

        # --------------   check convergence    -------------- #
        gnorm = paddle.linalg.norm(g1, p=np.inf)
        pk_norm = paddle.linalg.norm(pk, p=np.inf)
        paddle.assign(
            done | (gnorm < tolerance_grad) | (pk_norm < tolerance_change), done
        )
        paddle.assign(done, is_converge)
        # When alpha == 0 the position cannot change, so stop; this case is not marked as converged,
        # since is_converge was assigned above before this check.
        paddle.assign(done | (alpha == 0.0), done)

        return [
            k,
            done,
            is_converge,
            num_func_calls,
            value,
            xk,
            g1,
            sk_vec,
            yk_vec,
            rhok_vec,
            head,
            tail,
        ]

    paddle.static.nn.while_loop(
        cond=cond,
        body=body,
        loop_vars=[
            k,
            done,
            is_converge,
            num_func_calls,
            value,
            xk,
            g1,
            sk_vec,
            yk_vec,
            rhok_vec,
            head,
            tail,
        ],
    )
    return is_converge, num_func_calls, xk, value, g1