From 7f73ef2c7304ea3a4d22659ac8701d36e588c4e3 Mon Sep 17 00:00:00 2001
From: Sing_chan <51314274+betterpig@users.noreply.github.com>
Date: Thu, 14 Apr 2022 12:46:30 +0800
Subject: [PATCH] fix bfgs_doc (#41505)

* fix bfgs_doc; test=document_fix

* add parameter name; test=document_fix

* modify according to chenlong's comments;test=document_fix
---
 .../incubate/optimizer/functional/bfgs.py  | 82 +++++++------------
 .../incubate/optimizer/functional/lbfgs.py | 78 ++++++++----------
 2 files changed, 66 insertions(+), 94 deletions(-)

diff --git a/python/paddle/incubate/optimizer/functional/bfgs.py b/python/paddle/incubate/optimizer/functional/bfgs.py
index abdab457fd..23fd8dc082 100644
--- a/python/paddle/incubate/optimizer/functional/bfgs.py
+++ b/python/paddle/incubate/optimizer/functional/bfgs.py
@@ -33,63 +33,43 @@ def minimize_bfgs(objective_func,
                   name=None):
     r"""
     Minimizes a differentiable function `func` using the BFGS method.
-    The BFGS is a quasi-Newton method for solving an unconstrained
-    optimization problem over a differentiable function.
-    Closely related is the Newton method for minimization. Consider the iterate
-    update formula
+    The BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function.
+    Closely related is the Newton method for minimization. Consider the iterate update formula:
+
     .. math::
-        x_{k+1} = x_{k} + H \nabla{f},
-    If $H$ is the inverse Hessian of $f$ at $x_{k}$, then it's the Newton method.
-    If $H$ is symmetric and positive definite, used as an approximation of the inverse Hessian, then
+        x_{k+1} = x_{k} + H_k \nabla{f_k}
+
+    If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method.
+    If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then
     it's a quasi-Newton. In practice, the approximated Hessians are obtained
     by only using the gradients, over either whole or part of the search
-    history, the former is BFGS.
-
-    Reference:
-        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006.
-        pp140: Algorithm 6.1 (BFGS Method).
-
-    Following summarizes the the main logic of the program based on BFGS. Note: _k represents value of
-    k_th iteration, ^T represents the transposition of a vector or matrix.
-    repeat
-        p_k = H_k * g_k
-        alpha = strong_wolfe(f, x_k, p_k)
-        x_k+1 = x_k + alpha * p_k
-        s_k = x_k+1 - x_k
-        y_k = g_k+1 - g_k
-        rho_k = 1 / (s_k^T * y_k)
-        V_k^T = I - rho_k * s_k * y_k^T
-        V_k = I - rho_k * y_k * s_k^T
-        H_k+1 = V_k^T * H_k * V_k + rho_k * s_k * s_k^T
-        check_converge
-    end
+    history, the former is BFGS, the latter is L-BFGS.
+
+    Reference:
+        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp140: Algorithm 6.1 (BFGS Method).

     Args:
-        objective_func: the objective function to minimize. ``func`` accepts
-            a multivariate input and returns a scalar.
-        initial_position (Tensor): the starting point of the iterates. For methods like Newton and quasi-Newton
-            the initial trial step length should always be 1.0.
-        max_iters (int): the maximum number of minimization iterations.
-        tolerance_grad (float): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm.
-        tolerance_change (float): terminates if the change of function value/position/parameter between
-            two iterations is smaller than this value.
-        initial_inverse_hessian_estimate (Tensor): the initial inverse hessian approximation at initial_position.
-            It must be symmetric and positive definite.
-        line_search_fn (str): indicate which line search method to use, only support 'strong wolfe' right now. May support
-            'Hager Zhang' in the futrue.
-        max_line_search_iters (int): the maximum number of line search iterations.
-        initial_step_length (float): step length used in first iteration of line search. different initial_step_length
-            may cause different optimal result.
-        dtype ('float32' | 'float64'): In static graph, float64 will be convert to float32 due to paddle.assign limit.
-
+        objective_func: the objective function to minimize. ``objective_func`` accepts a multivariate input and returns a scalar.
+        initial_position (Tensor): the starting point of the iterates.
+        max_iters (int, optional): the maximum number of minimization iterations. Default value: 50.
+        tolerance_grad (float, optional): terminates if the gradient norm is smaller than this value. Currently the gradient norm uses the inf norm. Default value: 1e-7.
+        tolerance_change (float, optional): terminates if the change of function value/position/parameter between two iterations is smaller than this value. Default value: 1e-9.
+        initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. Default value: None.
+        line_search_fn (str, optional): indicates which line search method to use; only 'strong wolfe' is supported right now. 'Hager Zhang' may be supported in the future. Default value: 'strong wolfe'.
+        max_line_search_iters (int, optional): the maximum number of line search iterations. Default value: 50.
+        initial_step_length (float, optional): step length used in the first iteration of line search. Different values of initial_step_length may lead to different optimal results. For methods like Newton and quasi-Newton, the initial trial step length should always be 1.0. Default value: 1.0.
+        dtype ('float32' | 'float64', optional): data type used in the algorithm. Default value: 'float32'.
+        name (str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default value: None.
+
     Returns:
-        is_converge (bool): Indicates whether found the minimum within tolerance.
-        num_func_calls (int): number of objective function called.
-        position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of
-            the objective function regrading to the initial position.
-        objective_value (Tensor): objective function value at the `position`.
-        objective_gradient (Tensor): objective function gradient at the `position`.
-        inverse_hessian_estimate (Tensor): the estimate of inverse hessian at the `position`.
+        output(tuple):
+
+            - is_converge (bool): indicates whether the minimum was found within the tolerance.
+            - num_func_calls (int): number of objective function calls.
+            - position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of the objective function with respect to the initial position.
+            - objective_value (Tensor): objective function value at the `position`.
+            - objective_gradient (Tensor): objective function gradient at the `position`.
+            - inverse_hessian_estimate (Tensor): the estimate of the inverse hessian at the `position`.

     Examples:
         .. code-block:: python
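The example body that belongs under the ``Examples`` heading of this hunk is not included in the excerpt above. As a rough usage sketch (not part of the patch), assuming the function is exposed as ``paddle.incubate.optimizer.functional.minimize_bfgs`` as the module path in the diff suggests, a call with a simple quadratic objective could look like this, indexing the returned tuple per the Returns section above:

.. code-block:: python

    import paddle

    # A simple convex objective: f(x) = x . x, minimized at the origin.
    def func(x):
        return paddle.dot(x, x)

    x0 = paddle.to_tensor([1.3, 2.7])
    # results is the tuple documented above: (is_converge, num_func_calls,
    # position, objective_value, objective_gradient, inverse_hessian_estimate).
    results = paddle.incubate.optimizer.functional.minimize_bfgs(func, x0)
    print("is_converge:", results[0])
    print("argmin position:", results[2])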
diff --git a/python/paddle/incubate/optimizer/functional/lbfgs.py b/python/paddle/incubate/optimizer/functional/lbfgs.py
index d4bf511f85..f283381597 100644
--- a/python/paddle/incubate/optimizer/functional/lbfgs.py
+++ b/python/paddle/incubate/optimizer/functional/lbfgs.py
@@ -32,54 +32,46 @@ def minimize_lbfgs(objective_func,
                    initial_step_length=1.0,
                    dtype='float32',
                    name=None):
-    r"""Minimizes a differentiable function `func` using the L-BFGS method.
-    The L-BFGS is simalar as BFGS, the only difference is that L-BFGS use historical
-    sk, yk, rhok rather than H_k-1 to compute Hk.
+    r"""
+    Minimizes a differentiable function `func` using the L-BFGS method.
+    The L-BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function.
+    Closely related is the Newton method for minimization. Consider the iterate update formula:
+
+    .. math::
+        x_{k+1} = x_{k} + H_k \nabla{f_k}
+
+    If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method.
+    If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then
+    it's a quasi-Newton. In practice, the approximated Hessians are obtained
+    by only using the gradients, over either whole or part of the search
+    history, the former is BFGS, the latter is L-BFGS.
+
     Reference:
-        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006.
-        pp179: Algorithm 7.5 (L-BFGS).
-
-    Following summarizes the the main logic of the program based on L-BFGS.Note: _k represents
-    value of k_th iteration, ^T represents the transposition of a vector or matrix.
-    repeat
-        compute p_k by two-loop recursion
-        alpha = strong_wolfe(f, x_k, p_k)
-        x_k+1 = x_k + alpha * p_k
-        s_k = x_k+1 - x_k
-        y_k = g_k+1 - g_k
-        rho_k = 1 / (s_k^T * y_k)
-        update sk_vec, yk_vec, rhok_vec
-        check_converge
-    end
+        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS).

     Args:
-        objective_func: the objective function to minimize. ``func`` accepts
-            a multivariate input and returns a scalar.
-        initial_position (Tensor): the starting point of the iterates. For methods like Newton and quasi-Newton
-            the initial trial step length should always be 1.0 .
-        history_size (Scalar): the number of stored vector pairs {si,yi}.
-        max_iters (Scalar): the maximum number of minimization iterations.
-        tolerance_grad (Scalar): terminates if the gradient norm is smaller than
-            this. Currently gradient norm uses inf norm.
-        tolerance_change (Scalar): terminates if the change of function value/position/parameter between
-            two iterations is smaller than this value.
-        initial_inverse_hessian_estimate (Tensor): the initial inverse hessian approximation.
-        line_search_fn (str): indicate which line search method to use, only support 'strong wolfe' right now. May support
-            'Hager Zhang' in the futrue.
-        max_line_search_iters (Scalar): the maximum number of line search iterations.
-        initial_step_length: step length used in first iteration of line search. different initial_step_length
-            may cause different optimal result.
-        dtype ('float' | 'float32' | 'float64' | 'double'): the data
-            type to be used.
-
+        objective_func: the objective function to minimize. ``objective_func`` accepts a multivariate input and returns a scalar.
+        initial_position (Tensor): the starting point of the iterates.
+        history_size (int, optional): the number of stored vector pairs {si,yi}. Default value: 100.
+        max_iters (int, optional): the maximum number of minimization iterations. Default value: 50.
+        tolerance_grad (float, optional): terminates if the gradient norm is smaller than this value. Currently the gradient norm uses the inf norm. Default value: 1e-7.
+        tolerance_change (float, optional): terminates if the change of function value/position/parameter between two iterations is smaller than this value. Default value: 1e-9.
+        initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. Default value: None.
+        line_search_fn (str, optional): indicates which line search method to use; only 'strong wolfe' is supported right now. 'Hager Zhang' may be supported in the future. Default value: 'strong wolfe'.
+        max_line_search_iters (int, optional): the maximum number of line search iterations. Default value: 50.
+        initial_step_length (float, optional): step length used in the first iteration of line search. Different values of initial_step_length may lead to different optimal results. For methods like Newton and quasi-Newton, the initial trial step length should always be 1.0. Default value: 1.0.
+        dtype ('float32' | 'float64', optional): data type used in the algorithm. Default value: 'float32'.
+        name (str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default value: None.
+
     Returns:
-        is_converge (bool): Indicates whether found the minimum within tolerance.
-        num_func_calls (int): number of objective function called.
-        position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of
-            the objective function regrading to the initial position.
-        objective_value (Tensor): objective function value at the `position`.
-        objective_gradient (Tensor): objective function gradient at the `position`.
+        output(tuple):
+            - is_converge (bool): indicates whether the minimum was found within the tolerance.
+            - num_func_calls (int): number of objective function calls.
+            - position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of the objective function with respect to the initial position.
+            - objective_value (Tensor): objective function value at the `position`.
+            - objective_gradient (Tensor): objective function gradient at the `position`.
+
     Examples:
         .. code-block:: python
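The L-BFGS example under this ``Examples`` heading is likewise truncated in the excerpt. Below is a minimal sketch (not taken from the patch), under the same assumption that the function is exposed as ``paddle.incubate.optimizer.functional.minimize_lbfgs``, passing an explicit ``history_size`` to illustrate the L-BFGS-specific argument documented above:

.. code-block:: python

    import paddle

    # Separable quadratic objective: f(x) = sum(x_i^2), minimized at the origin.
    def func(x):
        return paddle.sum(paddle.square(x))

    x0 = paddle.to_tensor([2.0, -1.0, 3.0])
    # Keep only the 10 most recent (s_k, y_k) pairs for the inverse Hessian approximation.
    results = paddle.incubate.optimizer.functional.minimize_lbfgs(
        func, x0, history_size=10)
    print("is_converge:", results[0])
    print("argmin position:", results[2])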