Unverified commit e7928a06 authored by Sing_chan, committed by GitHub

[New API]: minimize_bfgs and minimize_lbfgs (#40710)

* [New API]: minimize_bfgs and minimize_lbfgs

* modify so the Python module call works correctly

* add functional package, add error raise in static_graph, change assign to set_value

* unify static_graph and dygraph, fix bug when x or H0 is float64

* now only accept tensor input; move argument checks into utils.py; group exception tests together

* temp

* add more detailed algorithm illustration and comments; reduce test cases to limit test time to 15s

* change in_dygraph_mode to in_dynamic_mode

* fix bug in sample code; reduce test cases to reduce test time

* change dir to incubate
Parent 4974fdfd
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
import paddle.nn.functional as F
from paddle.incubate.optimizer.functional.bfgs import minimize_bfgs
np.random.seed(123)
def test_static_graph(func, x0, line_search_fn='strong_wolfe', dtype='float32'):
dimension = x0.shape[0]
paddle.enable_static()
main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
X = paddle.static.data(name='x', shape=[dimension], dtype=dtype)
Y = minimize_bfgs(func, X, line_search_fn=line_search_fn, dtype=dtype)
exe = paddle.static.Executor()
exe.run(startup)
return exe.run(main, feed={'x': x0}, fetch_list=[Y])
def test_static_graph_H0(func, x0, H0, dtype='float32'):
paddle.enable_static()
main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
X = paddle.static.data(name='x', shape=[x0.shape[0]], dtype=dtype)
H = paddle.static.data(
name='h', shape=[H0.shape[0], H0.shape[1]], dtype=dtype)
Y = minimize_bfgs(
func, X, initial_inverse_hessian_estimate=H, dtype=dtype)
exe = paddle.static.Executor()
exe.run(startup)
return exe.run(main, feed={'x': x0, 'h': H0}, fetch_list=[Y])
def test_dynamic_graph(func,
x0,
H0=None,
line_search_fn='strong_wolfe',
dtype='float32'):
paddle.disable_static()
x0 = paddle.to_tensor(x0)
if H0 is not None:
H0 = paddle.to_tensor(H0)
return minimize_bfgs(
func,
x0,
initial_inverse_hessian_estimate=H0,
line_search_fn=line_search_fn,
dtype=dtype)
class TestBfgs(unittest.TestCase):
def test_quadratic_nd(self):
for dimension in [1, 10]:
minimum = np.random.random(size=[dimension]).astype('float32')
scale = np.exp(np.random.random(size=[dimension]).astype('float32'))
def func(x):
minimum_ = paddle.assign(minimum)
scale_ = paddle.assign(scale)
return paddle.sum(
paddle.multiply(scale_, (F.square_error_cost(x, minimum_))))
x0 = np.random.random(size=[dimension]).astype('float32')
results = test_static_graph(func=func, x0=x0)
self.assertTrue(np.allclose(minimum, results[2]))
results = test_dynamic_graph(func=func, x0=x0)
self.assertTrue(np.allclose(minimum, results[2].numpy()))
def test_inf_minima(self):
extream_point = np.array([-1, 2]).astype('float32')
def func(x):
# df = (x - extream_point[0]) * (x - extream_point[1]) = (x + 1)(x - 2)
# f has no global minimum (f -> -inf as x -> -inf), so the search is expected not to converge.
return x * x * x / 3.0 - (
extream_point[0] + extream_point[1]
) * x * x / 2 + extream_point[0] * extream_point[1] * x
x0 = np.array([-1.7]).astype('float32')
results = test_static_graph(func, x0)
self.assertFalse(results[0][0])
def test_multi_minima(self):
def func(x):
# df = 12(x + 1.1)(x - 0.2)(x - 0.8)
# f = 3*x^4 + 0.4*x^3 - 5.64*x^2 + 2.112*x
# local minima at x = -1.1 and x = 0.8.
# All these minima may be reached from appropriate starting points.
return 3 * x**4 + 0.4 * x**3 - 5.64 * x**2 + 2.112 * x
x0 = np.array([0.82], dtype='float64')
results = test_static_graph(func, x0, dtype='float64')
self.assertTrue(np.allclose(0.8, results[2]))
def test_rosenbrock(self):
# The Rosenbrock function is a standard optimization test case.
a = np.random.random(size=[1]).astype('float32')
minimum = [a.item(), (a**2).item()]
b = np.random.random(size=[1]).astype('float32')
def func(position):
# f(x, y) = (a - x)^2 + b (y - x^2)^2
# minimum = (a, a^2)
x, y = position[0], position[1]
c = (a - x)**2 + b * (y - x**2)**2
# The return value must be a scalar, not a shape-[1] array; otherwise the Jacobian computation fails with a shape error.
return c[0]
x0 = np.random.random(size=[2]).astype('float32')
results = test_dynamic_graph(func, x0)
self.assertTrue(np.allclose(minimum, results[2]))
def test_exception(self):
def func(x):
return paddle.dot(x, x)
x0 = np.random.random(size=[2]).astype('float32')
H0 = np.array([[2.0, 0.0], [0.0, 0.9]]).astype('float32')
# test initial_inverse_hessian_estimate is good
results = test_static_graph_H0(func, x0, H0, dtype='float32')
self.assertTrue(np.allclose([0., 0.], results[2]))
self.assertTrue(results[0][0])
# test initial_inverse_hessian_estimate is bad
H1 = np.array([[1.0, 2.0], [2.0, 1.0]]).astype('float32')
self.assertRaises(ValueError, test_dynamic_graph, func, x0, H0=H1)
# test line_search_fn is bad
self.assertRaises(
NotImplementedError,
test_static_graph,
func,
x0,
line_search_fn='other')
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
import paddle.nn.functional as F
from paddle.incubate.optimizer.functional.lbfgs import minimize_lbfgs
np.random.seed(123)
def test_static_graph(func, x0, line_search_fn='strong_wolfe', dtype='float32'):
dimension = x0.shape[0]
paddle.enable_static()
main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
X = paddle.static.data(name='x', shape=[dimension], dtype=dtype)
Y = minimize_lbfgs(func, X, line_search_fn=line_search_fn, dtype=dtype)
exe = paddle.static.Executor()
exe.run(startup)
return exe.run(main, feed={'x': x0}, fetch_list=[Y])
def test_static_graph_H0(func, x0, H0, dtype='float32'):
paddle.enable_static()
main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
X = paddle.static.data(name='x', shape=[x0.shape[0]], dtype=dtype)
H = paddle.static.data(
name='h', shape=[H0.shape[0], H0.shape[1]], dtype=dtype)
Y = minimize_lbfgs(
func, X, initial_inverse_hessian_estimate=H, dtype=dtype)
exe = paddle.static.Executor()
exe.run(startup)
return exe.run(main, feed={'x': x0, 'h': H0}, fetch_list=[Y])
def test_dynamic_graph(func,
x0,
H0=None,
line_search_fn='strong_wolfe',
dtype='float32'):
paddle.disable_static()
x0 = paddle.to_tensor(x0)
if H0 is not None:
H0 = paddle.to_tensor(H0)
return minimize_lbfgs(
func,
x0,
initial_inverse_hessian_estimate=H0,
line_search_fn=line_search_fn,
dtype=dtype)
class TestLbfgs(unittest.TestCase):
def test_quadratic_nd(self):
for dimension in [1, 10]:
minimum = np.random.random(size=[dimension]).astype('float32')
scale = np.exp(np.random.random(size=[dimension]).astype('float32'))
def func(x):
minimum_ = paddle.assign(minimum)
scale_ = paddle.assign(scale)
return paddle.sum(
paddle.multiply(scale_, (F.square_error_cost(x, minimum_))))
x0 = np.random.random(size=[dimension]).astype('float32')
results = test_static_graph(func, x0)
self.assertTrue(np.allclose(minimum, results[2]))
results = test_dynamic_graph(func, x0)
self.assertTrue(np.allclose(minimum, results[2].numpy()))
def test_inf_minima(self):
extream_point = np.array([-1, 2]).astype('float32')
def func(x):
# df = (x - extream_point[0]) * (x - extream_point[1]) = (x + 1)(x - 2)
# f has no global minimum (f -> -inf as x -> -inf), so the search is expected not to converge.
return x * x * x / 3.0 - (
extream_point[0] + extream_point[1]
) * x * x / 2 + extream_point[0] * extream_point[1] * x
x0 = np.array([-1.7]).astype('float32')
results = test_static_graph(func, x0)
self.assertFalse(results[0][0])
def test_multi_minima(self):
def func(x):
# df = 12(x + 1.1)(x - 0.2)(x - 0.8)
# f = 3*x^4 + 0.4*x^3 - 5.64*x^2 + 2.112*x
# local minima at x = -1.1 and x = 0.8.
# All these minima may be reached from appropriate starting points.
return 3 * x**4 + 0.4 * x**3 - 5.64 * x**2 + 2.112 * x
x0 = np.array([0.82], dtype='float64')
results = test_static_graph(func, x0, dtype='float64')
self.assertTrue(np.allclose(0.8, results[2]))
def test_rosenbrock(self):
# The Rosenbrock function is a standard optimization test case.
a = np.random.random(size=[1]).astype('float32')
minimum = [a.item(), (a**2).item()]
b = np.random.random(size=[1]).astype('float32')
def func(position):
# f(x, y) = (a - x)^2 + b (y - x^2)^2
# minimum = (a, a^2)
x, y = position[0], position[1]
c = (a - x)**2 + b * (y - x**2)**2
# The return value must be a scalar, not a shape-[1] array; otherwise the Jacobian computation fails with a shape error.
return c[0]
x0 = np.random.random(size=[2]).astype('float32')
results = test_dynamic_graph(func, x0)
self.assertTrue(np.allclose(minimum, results[2]))
def test_exception(self):
def func(x):
return paddle.dot(x, x)
x0 = np.random.random(size=[2]).astype('float32')
H0 = np.array([[2.0, 0.0], [0.0, 0.9]]).astype('float32')
# test dtype is not float32 or float64
x1 = np.random.random(size=[2]).astype('int32')
self.assertRaises(
ValueError, test_static_graph, func, x1, dtype='int32')
# test initial_inverse_hessian_estimate is good
results = test_static_graph_H0(func, x0, H0, dtype='float32')
self.assertTrue(np.allclose([0., 0.], results[2]))
self.assertTrue(results[0][0])
# test initial_inverse_hessian_estimate is bad and float64
x2 = np.random.random(size=[2]).astype('float64')
H1 = np.array([[1.0, 2.0], [3.0, 1.0]]).astype('float64')
self.assertRaises(
ValueError, test_static_graph_H0, func, x2, H0=H1, dtype='float64')
if __name__ == '__main__':
unittest.main()
@@ -15,5 +15,6 @@
from .lookahead import LookAhead # noqa: F401
from .modelaverage import ModelAverage # noqa: F401
from .distributed_fused_lamb import DistributedFusedLamb # noqa: F401
from . import functional # noqa: F401
__all__ = []
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .bfgs import minimize_bfgs # noqa: F401
from .lbfgs import minimize_lbfgs # noqa: F401
__all__ = ['minimize_bfgs', 'minimize_lbfgs']
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from .line_search import strong_wolfe
from .utils import _value_and_gradient, check_input_type, check_initial_inverse_hessian_estimate
import paddle
def minimize_bfgs(objective_func,
initial_position,
max_iters=50,
tolerance_grad=1e-7,
tolerance_change=1e-9,
initial_inverse_hessian_estimate=None,
line_search_fn='strong_wolfe',
max_line_search_iters=50,
initial_step_length=1.0,
dtype='float32',
name=None):
r"""
Minimizes a differentiable function `func` using the BFGS method.
BFGS is a quasi-Newton method for solving an unconstrained optimization
problem over a differentiable function. It is closely related to Newton's
method for minimization. Consider the iterate update formula
.. math::
    x_{k+1} = x_{k} - H_k \nabla{f}(x_k)
If :math:`H_k` is the exact inverse Hessian of :math:`f` at :math:`x_{k}`, this is Newton's method.
If :math:`H_k` is symmetric and positive definite and is used as an approximation of the inverse Hessian,
this is a quasi-Newton method. In practice, the approximate inverse Hessians are built from gradients
alone, using either the whole search history (BFGS) or only a recent part of it (L-BFGS).
Reference:
Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006.
pp140: Algorithm 6.1 (BFGS Method).
The following summarizes the main logic of the program based on BFGS. Note: _k denotes the value at the
k-th iteration, and ^T denotes the transpose of a vector or matrix.
repeat
p_k = -H_k * g_k
alpha = strong_wolfe(f, x_k, p_k)
x_k+1 = x_k + alpha * p_k
s_k = x_k+1 - x_k
y_k = g_k+1 - g_k
rho_k = 1 / (s_k^T * y_k)
V_k^T = I - rho_k * s_k * y_k^T
V_k = I - rho_k * y_k * s_k^T
H_k+1 = V_k^T * H_k * V_k + rho_k * s_k * s_k^T
check_converge
end
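A single update of the inverse Hessian estimate can be sketched in NumPy as follows (illustrative
only; ``H``, ``s`` and ``y`` are hypothetical arrays standing for H_k, s_k and y_k, not part of this API):
.. code-block:: python

    import numpy as np

    def bfgs_update(H, s, y):
        # H: current inverse Hessian estimate; s = x_{k+1} - x_k; y = g_{k+1} - g_k
        rho = 1.0 / np.dot(y, s)
        I = np.eye(len(s))
        V = I - rho * np.outer(y, s)
        # (I - rho s y^T) H (I - rho y s^T) + rho s s^T
        return V.T @ H @ V + rho * np.outer(s, s)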
Args:
objective_func: the objective function to minimize. ``objective_func`` accepts
a multivariate input and returns a scalar.
initial_position (Tensor): the starting point of the iterates. For methods like Newton and quasi-Newton
the initial trial step length should always be 1.0.
max_iters (int): the maximum number of minimization iterations.
tolerance_grad (float): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm.
tolerance_change (float): terminates if the change of function value/position/parameter between
two iterations is smaller than this value.
initial_inverse_hessian_estimate (Tensor): the initial inverse hessian approximation at initial_position.
It must be symmetric and positive definite.
line_search_fn (str): indicates which line search method to use; only 'strong_wolfe' is supported right now.
'Hager Zhang' may be supported in the future.
max_line_search_iters (int): the maximum number of line search iterations.
initial_step_length (float): step length used in the first iteration of the line search. A different
initial_step_length may lead to a different result.
dtype ('float32' | 'float64'): the data type to be used. In static graph mode, float64 is converted to
float32 due to a limitation of paddle.assign.
Returns:
is_converge (bool): indicates whether the minimum was found within the specified tolerances.
num_func_calls (int): the number of times the objective function was called.
position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of
the objective function with respect to the initial position.
objective_value (Tensor): objective function value at the `position`.
objective_gradient (Tensor): objective function gradient at the `position`.
inverse_hessian_estimate (Tensor): the estimate of inverse hessian at the `position`.
Examples:
.. code-block:: python
import paddle
def func(x):
return paddle.dot(x, x)
x0 = paddle.to_tensor([1.3, 2.7])
results = paddle.incubate.optimizer.functional.minimize_bfgs(func, x0)
print("is_converge: ", results[0])
print("the minimum of func is: ", results[2])
# is_converge: is_converge: Tensor(shape=[1], dtype=bool, place=Place(gpu:0), stop_gradient=True,
# [True])
# the minimum of func is: Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [0., 0.])
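For static graph mode, a usage sketch mirroring the unit tests in this change (the feed name 'x'
and the 2-element shape are arbitrary choices for illustration):
.. code-block:: python

    import numpy as np
    import paddle
    from paddle.incubate.optimizer.functional.bfgs import minimize_bfgs

    paddle.enable_static()

    def func(x):
        return paddle.dot(x, x)

    main = paddle.static.Program()
    startup = paddle.static.Program()
    with paddle.static.program_guard(main, startup):
        X = paddle.static.data(name='x', shape=[2], dtype='float32')
        Y = minimize_bfgs(func, X)
    exe = paddle.static.Executor()
    exe.run(startup)
    results = exe.run(main,
                      feed={'x': np.array([1.3, 2.7], dtype='float32')},
                      fetch_list=[Y])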
"""
if dtype not in ['float32', 'float64']:
raise ValueError(
"The dtype must be 'float32' or 'float64', but the specified is {}.".
format(dtype))
op_name = 'minimize_bfgs'
check_input_type(initial_position, 'initial_position', op_name)
I = paddle.eye(initial_position.shape[0], dtype=dtype)
if initial_inverse_hessian_estimate is None:
initial_inverse_hessian_estimate = I
else:
check_input_type(initial_inverse_hessian_estimate,
'initial_inverse_hessian_estimate', op_name)
check_initial_inverse_hessian_estimate(initial_inverse_hessian_estimate)
Hk = paddle.assign(initial_inverse_hessian_estimate)
xk = initial_position
value, g1 = _value_and_gradient(objective_func, xk)
num_func_calls = paddle.full(shape=[1], fill_value=1, dtype='int64')
# When the dimension of x is 1000, more than 30 iterations are needed for all elements to converge to the minimum.
k = paddle.full(shape=[1], fill_value=0, dtype='int64')
done = paddle.full(shape=[1], fill_value=False, dtype='bool')
is_converge = paddle.full(shape=[1], fill_value=False, dtype='bool')
def cond(k, done, is_converge, num_func_calls, xk, value, g1, Hk):
return (k < max_iters) & ~done
def body(k, done, is_converge, num_func_calls, xk, value, g1, Hk):
############# compute pk #############
pk = -paddle.matmul(Hk, g1)
############# compute alpha by line search #############
if line_search_fn == 'strong_wolfe':
alpha, value, g2, ls_func_calls = strong_wolfe(
f=objective_func,
xk=xk,
pk=pk,
initial_step_length=initial_step_length,
dtype=dtype)
else:
raise NotImplementedError(
"Currently only support line_search_fn = 'strong_wolfe', but the specified is '{}'".
format(line_search_fn))
num_func_calls += ls_func_calls
############# update Hk #############
sk = alpha * pk
yk = g2 - g1
xk = xk + sk
g1 = g2
sk = paddle.unsqueeze(sk, 0)
yk = paddle.unsqueeze(yk, 0)
rhok_inv = paddle.dot(yk, sk)
rhok = paddle.static.nn.cond(
rhok_inv == 0., lambda: paddle.full(shape=[1], fill_value=1000.0, dtype=dtype), lambda: 1. / rhok_inv)
Vk_transpose = I - rhok * sk * yk.t()
Vk = I - rhok * yk * sk.t()
Hk = paddle.matmul(paddle.matmul(Vk_transpose, Hk),
Vk) + rhok * sk * sk.t()
k += 1
############# check convergence #############
gnorm = paddle.linalg.norm(g1, p=np.inf)
pk_norm = paddle.linalg.norm(pk, p=np.inf)
paddle.assign(done | (gnorm < tolerance_grad) |
(pk_norm < tolerance_change), done)
paddle.assign(done, is_converge)
# When alpha = 0, xk can no longer change, so stop.
paddle.assign(done | (alpha == 0.), done)
return [k, done, is_converge, num_func_calls, xk, value, g1, Hk]
paddle.static.nn.while_loop(
cond=cond,
body=body,
loop_vars=[k, done, is_converge, num_func_calls, xk, value, g1, Hk])
return is_converge, num_func_calls, xk, value, g1, Hk
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from .line_search import strong_wolfe
from .utils import _value_and_gradient, check_input_type, check_initial_inverse_hessian_estimate
import paddle
def minimize_lbfgs(objective_func,
initial_position,
history_size=100,
max_iters=50,
tolerance_grad=1e-8,
tolerance_change=1e-8,
initial_inverse_hessian_estimate=None,
line_search_fn='strong_wolfe',
max_line_search_iters=50,
initial_step_length=1.0,
dtype='float32',
name=None):
r"""Minimizes a differentiable function `func` using the L-BFGS method.
The L-BFGS is simalar as BFGS, the only difference is that L-BFGS use historical
sk, yk, rhok rather than H_k-1 to compute Hk.
Reference:
Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006.
pp179: Algorithm 7.5 (L-BFGS).
The following summarizes the main logic of the program based on L-BFGS. Note: _k denotes the value at the
k-th iteration, and ^T denotes the transpose of a vector or matrix.
repeat
compute p_k by two-loop recursion
alpha = strong_wolfe(f, x_k, p_k)
x_k+1 = x_k + alpha * p_k
s_k = x_k+1 - x_k
y_k = g_k+1 - g_k
rho_k = 1 / (s_k^T * y_k)
update sk_vec, yk_vec, rhok_vec
check_converge
end
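The two-loop recursion used to compute p_k can be sketched in NumPy as follows (a minimal sketch
using plain Python lists; the implementation below stores the same history in fixed-size tensors
indexed by head/tail):
.. code-block:: python

    import numpy as np

    def two_loop_recursion(grad, s_hist, y_hist, H0):
        # s_hist/y_hist hold the most recent (s_i, y_i) pairs, oldest first.
        rho = [1.0 / np.dot(y, s) for s, y in zip(s_hist, y_hist)]
        q = grad.copy()
        alpha = [0.0] * len(s_hist)
        for i in reversed(range(len(s_hist))):
            alpha[i] = rho[i] * np.dot(s_hist[i], q)
            q = q - alpha[i] * y_hist[i]
        r = H0 @ q
        for i in range(len(s_hist)):
            beta = rho[i] * np.dot(y_hist[i], r)
            r = r + s_hist[i] * (alpha[i] - beta)
        return -r  # search direction p_k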
Args:
objective_func: the objective function to minimize. ``objective_func`` accepts
a multivariate input and returns a scalar.
initial_position (Tensor): the starting point of the iterates. For methods like Newton and quasi-Newton
the initial trial step length should always be 1.0 .
history_size (Scalar): the number of stored vector pairs {si,yi}.
max_iters (Scalar): the maximum number of minimization iterations.
tolerance_grad (Scalar): terminates if the gradient norm is smaller than
this. Currently gradient norm uses inf norm.
tolerance_change (Scalar): terminates if the change of function value/position/parameter between
two iterations is smaller than this value.
initial_inverse_hessian_estimate (Tensor): the initial inverse hessian approximation.
line_search_fn (str): indicates which line search method to use; only 'strong_wolfe' is supported right now.
'Hager Zhang' may be supported in the future.
max_line_search_iters (Scalar): the maximum number of line search iterations.
initial_step_length: step length used in the first iteration of the line search. A different
initial_step_length may lead to a different result.
dtype ('float32' | 'float64'): the data type to be used.
Returns:
is_converge (bool): indicates whether the minimum was found within the specified tolerances.
num_func_calls (int): the number of times the objective function was called.
position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of
the objective function with respect to the initial position.
objective_value (Tensor): objective function value at the `position`.
objective_gradient (Tensor): objective function gradient at the `position`.
Examples:
.. code-block:: python
import paddle
def func(x):
return paddle.dot(x, x)
x0 = paddle.to_tensor([1.3, 2.7])
results = paddle.incubate.optimizer.functional.minimize_lbfgs(func, x0)
print("is_converge: ", results[0])
print("the minimum of func is: ", results[2])
# is_converge: is_converge: Tensor(shape=[1], dtype=bool, place=Place(gpu:0), stop_gradient=True,
# [True])
# the minimum of func is: Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [0., 0.])
"""
if dtype not in ['float32', 'float64']:
raise ValueError(
"The dtype must be 'float32' or 'float64', but the specified is {}.".
format(dtype))
op_name = 'minimize_lbfgs'
check_input_type(initial_position, 'initial_position', op_name)
if initial_inverse_hessian_estimate is None:
H0 = paddle.eye(initial_position.shape[0], dtype=dtype)
else:
check_input_type(initial_inverse_hessian_estimate,
'initial_inverse_hessian_estimate', op_name)
check_initial_inverse_hessian_estimate(initial_inverse_hessian_estimate)
H0 = initial_inverse_hessian_estimate
xk = initial_position
value, g1 = _value_and_gradient(objective_func, xk)
k = paddle.full(shape=[1], fill_value=0, dtype='int64')
done = paddle.full(shape=[1], fill_value=False, dtype='bool')
is_converge = paddle.full(shape=[1], fill_value=False, dtype='bool')
num_func_calls = paddle.full(shape=[1], fill_value=1, dtype='int64')
history_size = paddle.full(
shape=[1], fill_value=history_size, dtype='int64')
head = paddle.full(shape=[1], fill_value=1, dtype='int64')
tail = paddle.full(shape=[1], fill_value=0, dtype='int64')
shape = initial_position.shape[0]
# Use a fixed-length tensor as the storage rather than a flexible tensor array, because in static mode a
# tensor array produces tensors of shape [-1], which causes an error when calling jacobian. This means
# append and pop are unavailable, so head and tail record where the newest and the oldest entries live.
# In effect, this implements a circular buffer on top of a plain tensor.
sk_vec = paddle.zeros((history_size + 1, shape), dtype=dtype)
yk_vec = paddle.zeros((history_size + 1, shape), dtype=dtype)
rhok_vec = paddle.zeros((history_size + 1, 1), dtype=dtype)
ai_vec = paddle.zeros((history_size + 1, 1), dtype=dtype)
def cond(k, done, is_converge, num_func_calls, value, xk, g1, sk_vec,
yk_vec, rhok_vec, head, tail):
return (k < max_iters) & ~done
def body(k, done, is_converge, num_func_calls, value, xk, g1, sk_vec,
yk_vec, rhok_vec, head, tail):
# Use assign to break the binding between g1 and q; otherwise they would change together.
############# compute p_k by two-loop recursion #############
q = paddle.assign(g1)
# Indices wrap around the circular buffer, so take them modulo history_size.
i = paddle.full(
shape=[1], fill_value=(head - 1).mod(history_size), dtype='int64')
def cond(i, q):
return i != tail
def body(i, q):
ai_vec[i] = rhok_vec[i] * paddle.dot(sk_vec[i], q)
q = q - ai_vec[i] * yk_vec[i]
i = (i - 1).mod(history_size)
return i, q
paddle.static.nn.while_loop(cond=cond, body=body, loop_vars=[i, q])
r = paddle.matmul(H0, q)
i = paddle.full(shape=[1], fill_value=tail + 1, dtype='int64')
def cond(i, r):
return i != head
def body(i, r):
beta = rhok_vec[i] * paddle.dot(yk_vec[i], r)
r = r + sk_vec[i] * (ai_vec[i] - beta)
i = (i + 1).mod(history_size)
return i, r
paddle.static.nn.while_loop(cond=cond, body=body, loop_vars=[i, r])
pk = -r
############# compute alpha by line search #############
if line_search_fn == 'strong_wolfe':
alpha, value, g2, ls_func_calls = strong_wolfe(
f=objective_func,
xk=xk,
pk=pk,
initial_step_length=initial_step_length,
dtype=dtype)
else:
raise NotImplementedError(
"Currently only support line_search_fn = 'strong_wolfe', but the specified is '{}'".
format(line_search_fn))
paddle.assign(num_func_calls + ls_func_calls, num_func_calls)
############# update sk_vec, yk_vec, rhok_vec #############
sk = alpha * pk
yk = g2 - g1
rhok_inv = paddle.dot(yk, sk)
rhok = paddle.static.nn.cond(
rhok_inv == 0., lambda: paddle.full(shape=[1], fill_value=1000.0, dtype=dtype), lambda: 1. / rhok_inv)
sk_vec[head] = sk
yk_vec[head] = yk
rhok_vec[head] = rhok
head = (head + 1) % history_size
def true_fn(tail):
paddle.assign(tail + 1, tail)
# when array is full, the tail should move forward too.
paddle.static.nn.cond(head == tail, lambda: true_fn(tail), None)
xk = xk + sk
g1 = g2
k += 1
############# check convergence #############
gnorm = paddle.linalg.norm(g1, p=np.inf)
pk_norm = paddle.linalg.norm(pk, p=np.inf)
paddle.assign(done | (gnorm < tolerance_grad) |
(pk_norm < tolerance_change), done)
paddle.assign(done, is_converge)
# When alpha = 0, xk can no longer change, so stop.
paddle.assign(done | (alpha == 0.), done)
return [
k, done, is_converge, num_func_calls, value, xk, g1, sk_vec, yk_vec,
rhok_vec, head, tail
]
paddle.static.nn.while_loop(
cond=cond,
body=body,
loop_vars=[
k, done, is_converge, num_func_calls, value, xk, g1, sk_vec, yk_vec,
rhok_vec, head, tail
])
return is_converge, num_func_calls, xk, value, g1
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .utils import _value_and_gradient
import paddle
def cubic_interpolation_(x1, f1, g1, x2, f2, g2):
r"""Cubic interpolation between (x1, f1, g1) and (x2, f2, g2).
Uses two points and their gradients to determine a cubic function, and returns the minimum point of
that cubic between them.
Reference:
Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006.
pp59: formula 3.59
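For the branch where x1 <= x2 and the radicand below is nonnegative, the interpolated minimizer is
(a sketch of formula 3.59; the code below also clips the result into [min(x1, x2), max(x1, x2)] and
falls back to the midpoint when the radicand is negative):
.. math::

    d_1 = g_1 + g_2 - 3\frac{f_1 - f_2}{x_1 - x_2}, \quad
    d_2 = \sqrt{d_1^2 - g_1 g_2}, \quad
    x_{min} = x_2 - (x_2 - x_1)\frac{g_2 + d_2 - d_1}{g_2 - g_1 + 2 d_2}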
Args:
x1, f1, g1: point1's position, value and gradient.
x2, f2, g2: point2's position, value and gradient.
Returns:
min_pos: the minimum point of the cubic between the specified points.
"""
xmin, xmax = paddle.static.nn.cond(x1 <= x2, lambda: (x1, x2),
lambda: (x2, x1))
d1 = g1 + g2 - 3 * (f1 - f2) / (x1 - x2)
d2_square = d1**2 - g1 * g2
def true_func1():
d2 = d2_square.sqrt()
def true_fn2():
return x2 - (x2 - x1) * ((g2 + d2 - d1) / (g2 - g1 + 2 * d2))
def false_fn2():
return x1 - (x1 - x2) * ((g1 + d2 - d1) / (g1 - g2 + 2 * d2))
pred = paddle.less_equal(x=x1, y=x2)
min_pos = paddle.static.nn.cond(pred, true_fn2, false_fn2)
return paddle.minimum(paddle.maximum(min_pos, xmin), xmax)
def false_func1():
return (xmin + xmax) / 2.
min_pos = paddle.static.nn.cond(d2_square >= 0., true_func1, false_func1)
return min_pos
def strong_wolfe(f,
xk,
pk,
max_iters=20,
tolerance_change=1e-8,
initial_step_length=1.0,
c1=1e-4,
c2=0.9,
alpha_max=10,
dtype='float32'):
r"""Implements of line search algorithm that satisfies the strong Wolfe conditions using double zoom.
Reference:
Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006.
pp60: Algorithm 3.5 (Line Search Algorithm).
Args:
f: the objective function to minimize. ``f`` accepts a multivariate input and returns a scalar.
xk (Tensor): the starting point of the iterates.
pk (Tensor): search direction.
max_iters (Scalar): the maximum number of iterations.
tolerance_change (Scalar): terminates if the change of function value/position/parameter between
two iterations is smaller than this value.
initial_step_length (Scalar): step length used in first iteration.
c1 (Scalar): parameter for sufficient decrease condition.
c2 (Scalar): parameter for curvature condition.
alpha_max (float): max step length.
dtype ('float32' | 'float64'): the datatype to be used.
Returns:
a_star (Tensor): the optimal step length, or 0 if the line search did not converge.
phi_star (Tensor): phi at a_star.
derf_star (Tensor): the gradient of the objective function at xk + a_star * pk, returned so the caller
can reuse it.
num_func_calls (int): the number of objective function evaluations made during the line search.
The following summarizes the essentials of the strong Wolfe line search algorithm.
Some notations used in the description:
- `f` denotes the objective function.
- `phi` is a function of step size alpha, restricting `f` on a line.
phi = f(xk + a * pk),
where xk is the position of the k-th iterate, pk is the line search direction (a descent direction),
and a is the step size.
- a: shorthand for alpha
- a1 is a of last iteration, which is alpha_(i-1).
- a2 is a of current iteration, which is alpha_i.
- a_lo is a in left position when calls zoom, which is alpha_low.
- a_hi is a in right position when calls zoom, which is alpha_high.
Line Search Algorithm:
repeat
Compute phi(a2) and derphi(a2).
1. If phi(a2) > phi(0) + c_1 * a2 * phi'(0) or [phi(a2) >= phi(a1) and i > 1],
a_star= zoom(a1, a2) and stop;
2. If |phi'(a2)| <= -c_2 * phi'(0),
a_star= a2 and stop;
3. If phi'(a2) >= 0,
a_star= zoom(a2, a1) and stop;
a1 = a2
a2 = min(2 * a2, alpha_max)
i = i + 1
end(repeat)
zoom(a_lo, a_hi) Algorithm:
repeat
aj = cubic_interpolation(a_lo, a_hi)
Compute phi(aj) and derphi(aj).
1. If phi(aj) > phi(0) + c_1 * aj * phi'(0) or phi(aj) >= phi(a_lo),
then a_hi <- aj;
2.
2.1. If |phi'(aj)| <= -c_2 * phi'(0), then a_star = aj and stop;
2.2. If phi'(aj) * (a_hi - a_lo) >= 0, then a_hi = a_lo;
a_lo = aj;
end(repeat)
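The two stopping tests used above can be sketched as a plain Python predicate (illustrative only;
the function name and defaults are hypothetical, mirroring the c1/c2 parameters of this routine):
.. code-block:: python

    def satisfies_strong_wolfe(a, phi_a, derphi_a, phi_0, derphi_0, c1=1e-4, c2=0.9):
        # sufficient decrease (Armijo) condition and curvature condition
        sufficient_decrease = phi_a <= phi_0 + c1 * a * derphi_0
        curvature = abs(derphi_a) <= -c2 * derphi_0
        return sufficient_decrease and curvature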
"""
def phi_and_derphi(a):
r"""Compute function value and derivative of phi at a.
phi = f(xk + a * pk)
phi'(a) = f'(xk + a * pk) * pk
"""
phi_value, f_grad = _value_and_gradient(f, xk + a * pk)
phi_grad = paddle.dot(f_grad, pk)
# Return f_grad so that BFGS/L-BFGS can compute yk without recomputing the gradient.
return phi_value, f_grad, phi_grad
def zoom(a_lo, phi_lo, derphi_lo, derf_lo, a_hi, phi_hi, derphi_hi, phi_0,
derphi_0):
# find the exact a from the bracket [a_lo, a_hi]
max_zoom_iters = max_iters
j = paddle.full(shape=[1], fill_value=0, dtype='int64')
done_zoom = paddle.full(shape=[1], fill_value=False, dtype='bool')
def cond_zoom(j, done_zoom, a_lo, phi_lo, derphi_lo, derf_lo, a_hi,
phi_hi, derphi_hi):
pred = paddle.abs(a_hi - a_lo) < tolerance_change
paddle.assign(done_zoom | pred, done_zoom)
return (j < max_zoom_iters) & ~done_zoom
def body_zoom(j, done_zoom, a_lo, phi_lo, derphi_lo, derf_lo, a_hi,
phi_hi, derphi_hi):
aj = cubic_interpolation_(a_lo, phi_lo, derphi_lo, a_hi, phi_hi,
derphi_hi) # 21
min_change = 0.1 * paddle.abs(a_hi - a_lo)
pred = paddle.minimum(
paddle.abs(aj - a_lo), paddle.abs(aj - a_hi)) < min_change
aj = paddle.static.nn.cond(pred, lambda: 0.5 * (a_lo + a_hi),
lambda: aj)
phi_j, derf_j, derphi_j = phi_and_derphi(aj)
def true_fn():
# Use assign to modify the variable in place.
paddle.assign(aj, a_hi)
paddle.assign(phi_j, phi_hi)
paddle.assign(derphi_j, derphi_hi)
def false_fn(a_lo, done_zoom):
pred3 = (paddle.abs(derphi_j) <= -c2 * derphi_0)
paddle.assign(pred3, done_zoom)
def true_fn():
paddle.assign(a_lo, a_hi)
paddle.assign(phi_lo, phi_hi)
paddle.assign(derphi_lo, derphi_hi)
pred4 = ~done_zoom & (derphi_j * (a_hi - a_lo) >= 0)
paddle.static.nn.cond(pred4, true_fn, None)
paddle.assign(aj, a_lo)
paddle.assign(phi_j, phi_lo)
paddle.assign(derphi_j, derphi_lo)
paddle.assign(derf_j, derf_lo)
pred2 = (phi_j > phi_0 + c1 * aj * derphi_0) | (phi_j >= phi_lo)
paddle.static.nn.cond(pred2, true_fn,
lambda: false_fn(a_lo, done_zoom))
j = paddle.static.nn.cond(done_zoom, lambda: j, lambda: j + 1)
return [
j, done_zoom, a_lo, phi_lo, derphi_lo, derf_lo, a_hi, phi_hi,
derphi_hi
]
paddle.static.nn.while_loop(
cond=cond_zoom,
body=body_zoom,
loop_vars=[
j, done_zoom, a_lo, phi_lo, derphi_lo, derf_lo, a_hi, phi_hi,
derphi_hi
])
# j is the number of objective function evaluations made inside zoom.
return j
alpha_max = paddle.full(shape=[1], fill_value=alpha_max, dtype=dtype)
a1 = paddle.full(shape=[1], fill_value=0., dtype=dtype)
a2 = paddle.full(shape=[1], fill_value=initial_step_length, dtype=dtype)
phi_1, derf_1, derphi_1 = phi_and_derphi(a1)
# use assign to cut off binding between two variables
phi_0 = paddle.assign(phi_1)
derphi_0 = paddle.assign(derphi_1)
ls_func_calls = paddle.full(shape=[1], fill_value=1, dtype='int64')
# If a_star is not found, return alpha = 0 together with f(xk) and its gradient.
a_star = paddle.full(shape=[1], fill_value=0, dtype=dtype)
phi_star = paddle.assign(phi_1)
derf_star = paddle.assign(derf_1)
i = paddle.full(shape=[1], fill_value=0, dtype='int64')
done = paddle.full(shape=[1], fill_value=False, dtype='bool')
def cond(i, ls_func_calls, a1, a2, phi_1, derf_1, done):
return (i < max_iters) & ~done
def body(i, ls_func_calls, a1, a2, phi_1, derf_1, done):
phi_2, derf_2, derphi_2 = phi_and_derphi(a2)
paddle.assign(ls_func_calls + 1, ls_func_calls)
paddle.assign(done | paddle.any(paddle.isinf(phi_2)), done)
def true_fn1():
j = zoom(a1, phi_1, derphi_1, derf_1, a2, phi_2, derphi_2, phi_0,
derphi_0)
paddle.assign(a1, a_star)
paddle.assign(phi_1, phi_star)
paddle.assign(derf_1, derf_star)
paddle.assign(ls_func_calls + j, ls_func_calls)
pred1 = ~done & ((phi_2 > phi_0 + c1 * a2 * derphi_0) | (
(phi_2 >= phi_0) & (i > 1)))
paddle.assign(done | pred1, done)
paddle.static.nn.cond(pred1, true_fn1, None)
def true_fn2():
paddle.assign(a2, a_star)
paddle.assign(phi_2, phi_star)
paddle.assign(derf_2, derf_star)
pred2 = ~done & (paddle.abs(derphi_2) <= -c2 * derphi_0)
paddle.assign(done | pred2, done)
paddle.static.nn.cond(pred2, true_fn2, None)
def true_fn3():
j = zoom(a2, phi_2, derphi_2, derf_2, a1, phi_1, derphi_1, phi_0,
derphi_0)
paddle.assign(a2, a_star)
paddle.assign(phi_2, phi_star)
paddle.assign(derf_2, derf_star)
paddle.assign(ls_func_calls + j, ls_func_calls)
pred3 = ~done & (derphi_2 >= 0)
paddle.assign(done | pred3, done)
paddle.static.nn.cond(pred3, true_fn3, None)
def false_fn():
paddle.assign(a2, a1)
paddle.assign(phi_2, phi_1)
paddle.assign(derf_2, derf_1)
paddle.assign(paddle.minimum(2 * a2, alpha_max), a2)
paddle.assign(i + 1, i)
paddle.static.nn.cond(done, None, false_fn)
return [i, ls_func_calls, a1, a2, phi_1, derf_1, done]
paddle.static.nn.while_loop(
cond=cond,
body=body,
loop_vars=[i, ls_func_calls, a1, a2, phi_1, derf_1, done])
return a_star, phi_star, derf_star, ls_func_calls
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.autograd.functional import vjp, Jacobian
from paddle.fluid.framework import Variable
from paddle.fluid.data_feeder import check_type, check_dtype
def check_input_type(input, name, op_name):
r"""Check whether the input is tensor or variable."""
if paddle.in_dynamic_mode():
if not isinstance(input, paddle.Tensor):
raise ValueError("The input: {} must be tensor.".format(input))
else:
check_type(input, name, Variable, op_name)
def check_initial_inverse_hessian_estimate(H0):
r"""Check whether the specified initial_inverse_hessian_estimate is symmetric and positive definite.
Raise errors when precondition not met.
Note:
In static graph mode an error cannot be raised directly, so py_func wraps raise_func as an op and
paddle.static.nn.cond decides whether to insert that op into the program.
Cholesky decomposition is the fast way to check positive definiteness, but in static graph mode its
exception cannot be caught to raise a ValueError, so eigvals is used instead of cholesky there.
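A rough NumPy equivalent of this check, for reference only (not used by the implementation):
.. code-block:: python

    import numpy as np

    def is_symmetric_positive_definite(H):
        # symmetric, and all eigenvalues strictly positive
        return np.allclose(H, H.T) and bool(np.all(np.linalg.eigvalsh(H) > 0))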
"""
is_symmetric = paddle.all(paddle.equal(H0, H0.t()))
def raise_func():
raise ValueError(
"The initial_inverse_hessian_estimate should be symmetric and positive definite, but the specified is not."
)
if paddle.in_dynamic_mode():
if not is_symmetric:
raise_func()
try:
paddle.linalg.cholesky(H0)
except RuntimeError as error:
raise_func()
else:
def create_tmp_var(program, name, dtype, shape):
return program.current_block().create_var(
name=name, dtype=dtype, shape=shape)
out_var = create_tmp_var(
paddle.static.default_main_program(),
name='output',
dtype='float32',
shape=[-1])
def false_fn():
paddle.static.nn.py_func(
func=raise_func, x=is_symmetric, out=out_var)
paddle.static.nn.cond(is_symmetric, None, false_fn)
# eigvals only supports CPU
paddle.set_device("cpu")
eigvals = paddle.linalg.eigvals(H0)
is_positive = paddle.all(eigvals.real() > 0.) and paddle.all(
eigvals.imag() == 0.)
paddle.static.nn.cond(is_positive, None, false_fn)
def _value_and_gradient(f, x, v=None):
r"""Compute function value and gradient of f at x.
Args:
f (Callable): the objective function.
x (Tensor): the input tensor.
Returns:
value: a tensor that holds the function value.
gradient: a tensor that holds the function gradients.
"""
if paddle.in_dynamic_mode():
value, gradient = vjp(f, x, v=v)
gradient = gradient[0]
else:
JJ = Jacobian(f, x)
gradient = JJ[:][0]
value = f(x)
return value, gradient
@@ -362,6 +362,7 @@ packages=['paddle',
'paddle.incubate.nn',
'paddle.incubate.nn.functional',
'paddle.incubate.nn.layer',
'paddle.incubate.optimizer.functional',
'paddle.io',
'paddle.optimizer',
'paddle.nn',