From 8386417ed2e08f465ee69492b4ef3131afe47b78 Mon Sep 17 00:00:00 2001 From: lijialin03 <124568209+lijialin03@users.noreply.github.com> Date: Wed, 26 Apr 2023 16:20:52 +0800 Subject: [PATCH] add paddle.optimizer.LBFGS API and modify its test case test=develop (#51912) * modify numel in lbfgs and add a new test case. test=develop * change param 'lr' to 'learning_rate' in lbfgs and its test * add opt LBFGS and change test --- .../fluid/tests/unittests/test_lbfgs_class.py | 560 ++++++++++++++ .../fluid/tests/unittests/test_lbfgs_v2.py | 274 ------- python/paddle/incubate/optimizer/lbfgs.py | 21 +- python/paddle/optimizer/__init__.py | 2 + python/paddle/optimizer/lbfgs.py | 701 ++++++++++++++++++ 5 files changed, 1275 insertions(+), 283 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_lbfgs_class.py delete mode 100644 python/paddle/fluid/tests/unittests/test_lbfgs_v2.py create mode 100644 python/paddle/optimizer/lbfgs.py diff --git a/python/paddle/fluid/tests/unittests/test_lbfgs_class.py b/python/paddle/fluid/tests/unittests/test_lbfgs_class.py new file mode 100644 index 00000000000..44c854f2119 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lbfgs_class.py @@ -0,0 +1,560 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import unittest + +import numpy as np + +import paddle +from paddle.incubate.optimizer import lbfgs as incubate_lbfgs +from paddle.incubate.optimizer import line_search_dygraph +from paddle.optimizer import lbfgs + +np.random.seed(123) + +# func()should be func(w, x)where w is parameter to be optimize ,x is input of optimizer func +# np_w is the init parameter of w + + +class Net(paddle.nn.Layer): + def __init__(self, np_w, func): + super().__init__() + self.func = func + w = paddle.to_tensor(np_w) + self.w = paddle.create_parameter( + shape=w.shape, + dtype=w.dtype, + default_initializer=paddle.nn.initializer.Assign(w), + ) + + def forward(self, x): + return self.func(self.w, x) + + +def train_step(inputs, targets, net, opt): + def closure(): + outputs = net(inputs) + loss = paddle.nn.functional.mse_loss(outputs, targets) + opt.clear_grad() + loss.backward() + return loss + + loss = opt.step(closure) + return loss + + +class TestLbfgs(unittest.TestCase): + def test_function_fix_incubate(self): + paddle.disable_static() + np_w = np.random.rand(1).astype(np.float32) + + input = np.random.rand(1).astype(np.float32) + weights = [np.random.rand(1).astype(np.float32) for i in range(5)] + targets = [weights[i] * input for i in range(5)] + + def func(w, x): + return w * x + + net = Net(np_w, func) + opt = incubate_lbfgs.LBFGS( + learning_rate=1, + max_iter=10, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=5, + line_search_fn='strong_wolfe', + parameters=net.parameters(), + ) + + for weight, target in zip(weights, targets): + input = paddle.to_tensor(input) + target = paddle.to_tensor(target) + loss = 1 + while loss > 1e-4: + loss = train_step(input, target, net, opt) + np.testing.assert_allclose(net.w, weight, rtol=1e-05) + + def test_inf_minima_incubate(self): + # not converage + input = np.random.rand(1).astype(np.float32) + + def outputs1(x): + # weight[0] = 1.01 weight[1] = 0.99 + return x * x * x - 3 * x * x + 3 * 1.01 * 0.99 * x + + def outputs2(x): + # weight[0] = 4 weight[1] = 2 + return pow(x, 4) + 5 * pow(x, 2) + + targets = [outputs1(input), outputs2(input)] + input = paddle.to_tensor(input) + + def func1(extream_point, x): + return ( + x * x * x + - 3 * x * x + + 3 * extream_point[0] * extream_point[1] * x + ) + + def func2(extream_point, x): + return pow(x, extream_point[0]) + 5 * pow(x, extream_point[1]) + + extream_point = np.array([-2.34, 1.45]).astype('float32') + net1 = Net(extream_point, func1) + # converge of old_sk.pop() + opt1 = incubate_lbfgs.LBFGS( + learning_rate=1, + max_iter=10, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=1, + line_search_fn='strong_wolfe', + parameters=net1.parameters(), + ) + + net2 = Net(extream_point, func2) + # converge of line_search = None + opt2 = incubate_lbfgs.LBFGS( + learning_rate=1, + max_iter=50, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=10, + line_search_fn=None, + parameters=net2.parameters(), + ) + + n_iter = 0 + while n_iter < 20: + loss = train_step(input, paddle.to_tensor(targets[0]), net1, opt1) + n_iter = opt1.state_dict()["state"]["func_evals"] + + n_iter = 0 + while n_iter < 10: + loss = train_step(input, paddle.to_tensor(targets[1]), net2, opt2) + n_iter = opt1.state_dict()["state"]["func_evals"] + + def test_error_incubate(self): + # test parameter is not Paddle Tensor + def error_func1(): + extream_point = np.array([-1, 2]).astype('float32') + extream_point = paddle.to_tensor(extream_point) + return 
incubate_lbfgs.LBFGS( + learning_rate=1, + max_iter=10, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=3, + line_search_fn='strong_wolfe', + parameters=extream_point, + ) + + self.assertRaises(TypeError, error_func1) + + def test_error2_incubate(self): + # not converage + input = np.random.rand(1).astype(np.float32) + + def outputs2(x): + # weight[0] = 4 weight[1] = 2 + return pow(x, 4) + 5 * pow(x, 2) + + targets = [outputs2(input)] + input = paddle.to_tensor(input) + + def func2(extream_point, x): + return pow(x, extream_point[0]) + 5 * pow(x, extream_point[1]) + + extream_point = np.array([-2.34, 1.45]).astype('float32') + net2 = Net(extream_point, func2) + # converge of line_search = None + opt2 = incubate_lbfgs.LBFGS( + learning_rate=1, + max_iter=50, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=10, + line_search_fn='None', + parameters=net2.parameters(), + ) + + def error_func(): + n_iter = 0 + while n_iter < 10: + loss = train_step( + input, paddle.to_tensor(targets[0]), net2, opt2 + ) + n_iter = opt2.state_dict()["state"]["func_evals"] + + self.assertRaises(RuntimeError, error_func) + + def test_line_search_incubate(self): + def func1(x, alpha, d): + return paddle.to_tensor(x + alpha * d), paddle.to_tensor([0.0]) + + def func2(x, alpha, d): + return paddle.to_tensor(x + alpha * d), paddle.to_tensor([1.0]) + + def func3(x, alpha, d): + return paddle.to_tensor(x + alpha * d), paddle.to_tensor([-1.0]) + + line_search_dygraph._strong_wolfe( + func1, + paddle.to_tensor([1.0]), + paddle.to_tensor([0.001]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([0.0]), + max_ls=1, + ) + + line_search_dygraph._strong_wolfe( + func1, + paddle.to_tensor([1.0]), + paddle.to_tensor([0.001]), + paddle.to_tensor([0.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([0.0]), + max_ls=0, + ) + + line_search_dygraph._strong_wolfe( + func2, + paddle.to_tensor([1.0]), + paddle.to_tensor([-0.001]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + max_ls=1, + ) + + line_search_dygraph._strong_wolfe( + func3, + paddle.to_tensor([1.0]), + paddle.to_tensor([-0.001]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + max_ls=1, + ) + + line_search_dygraph._cubic_interpolate( + paddle.to_tensor([2.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([2.0]), + paddle.to_tensor([0.0]), + [0.1, 0.5], + ) + + line_search_dygraph._cubic_interpolate( + paddle.to_tensor([2.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([-3.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([-0.1]), + [0.1, 0.5], + ) + + def test_error3_incubate(self): + # test parameter shape size <= 0 + def error_func3(): + extream_point = np.array([-1, 2]).astype('float32') + extream_point = paddle.to_tensor(extream_point) + + def func(w, x): + return w * x + + net = Net(extream_point, func) + net.w = paddle.create_parameter( + shape=[-1, 2], + dtype=net.w.dtype, + ) + opt = incubate_lbfgs.LBFGS( + learning_rate=1, + max_iter=10, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=5, + line_search_fn='strong_wolfe', + parameters=net.parameters(), + ) + + self.assertRaises(AssertionError, error_func3) + + def test_function_fix(self): + paddle.disable_static() + np_w = 
np.random.rand(1).astype(np.float32) + + input = np.random.rand(1).astype(np.float32) + weights = [np.random.rand(1).astype(np.float32) for i in range(5)] + targets = [weights[i] * input for i in range(5)] + + def func(w, x): + return w * x + + net = Net(np_w, func) + opt = lbfgs.LBFGS( + learning_rate=1, + max_iter=10, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=5, + line_search_fn='strong_wolfe', + parameters=net.parameters(), + ) + + for weight, target in zip(weights, targets): + input = paddle.to_tensor(input) + target = paddle.to_tensor(target) + loss = 1 + while loss > 1e-4: + loss = train_step(input, target, net, opt) + np.testing.assert_allclose(net.w, weight, rtol=1e-05) + + def test_inf_minima(self): + # not converage + input = np.random.rand(1).astype(np.float32) + + def outputs1(x): + # weight[0] = 1.01 weight[1] = 0.99 + return x * x * x - 3 * x * x + 3 * 1.01 * 0.99 * x + + def outputs2(x): + # weight[0] = 4 weight[1] = 2 + return pow(x, 4) + 5 * pow(x, 2) + + targets = [outputs1(input), outputs2(input)] + input = paddle.to_tensor(input) + + def func1(extream_point, x): + return ( + x * x * x + - 3 * x * x + + 3 * extream_point[0] * extream_point[1] * x + ) + + def func2(extream_point, x): + return pow(x, extream_point[0]) + 5 * pow(x, extream_point[1]) + + extream_point = np.array([-2.34, 1.45]).astype('float32') + net1 = Net(extream_point, func1) + # converge of old_sk.pop() + opt1 = lbfgs.LBFGS( + learning_rate=1, + max_iter=10, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=1, + line_search_fn='strong_wolfe', + parameters=net1.parameters(), + ) + + net2 = Net(extream_point, func2) + # converge of line_search = None + opt2 = lbfgs.LBFGS( + learning_rate=1, + max_iter=50, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=10, + line_search_fn=None, + parameters=net2.parameters(), + ) + + n_iter = 0 + while n_iter < 20: + loss = train_step(input, paddle.to_tensor(targets[0]), net1, opt1) + n_iter = opt1.state_dict()["state"]["func_evals"] + + n_iter = 0 + while n_iter < 10: + loss = train_step(input, paddle.to_tensor(targets[1]), net2, opt2) + n_iter = opt1.state_dict()["state"]["func_evals"] + + def test_error(self): + # test parameter is not Paddle Tensor + def error_func1(): + extream_point = np.array([-1, 2]).astype('float32') + extream_point = paddle.to_tensor(extream_point) + return lbfgs.LBFGS( + learning_rate=1, + max_iter=10, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=3, + line_search_fn='strong_wolfe', + parameters=extream_point, + ) + + self.assertRaises(TypeError, error_func1) + + def test_error2(self): + # not converage + input = np.random.rand(1).astype(np.float32) + + def outputs2(x): + # weight[0] = 4 weight[1] = 2 + return pow(x, 4) + 5 * pow(x, 2) + + targets = [outputs2(input)] + input = paddle.to_tensor(input) + + def func2(extream_point, x): + return pow(x, extream_point[0]) + 5 * pow(x, extream_point[1]) + + extream_point = np.array([-2.34, 1.45]).astype('float32') + net2 = Net(extream_point, func2) + # converge of line_search = None + opt2 = lbfgs.LBFGS( + learning_rate=1, + max_iter=50, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=10, + line_search_fn='None', + parameters=net2.parameters(), + ) + + def error_func(): + n_iter = 0 + while n_iter < 10: + loss = train_step( + input, paddle.to_tensor(targets[0]), net2, opt2 + ) + n_iter = opt2.state_dict()["state"]["func_evals"] 
+ + self.assertRaises(RuntimeError, error_func) + + def test_line_search(self): + def func1(x, alpha, d): + return paddle.to_tensor(x + alpha * d), paddle.to_tensor([0.0]) + + def func2(x, alpha, d): + return paddle.to_tensor(x + alpha * d), paddle.to_tensor([1.0]) + + def func3(x, alpha, d): + return paddle.to_tensor(x + alpha * d), paddle.to_tensor([-1.0]) + + lbfgs._strong_wolfe( + func1, + paddle.to_tensor([1.0]), + paddle.to_tensor([0.001]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([0.0]), + max_ls=1, + ) + + lbfgs._strong_wolfe( + func1, + paddle.to_tensor([1.0]), + paddle.to_tensor([0.001]), + paddle.to_tensor([0.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([0.0]), + max_ls=0, + ) + + lbfgs._strong_wolfe( + func2, + paddle.to_tensor([1.0]), + paddle.to_tensor([-0.001]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + max_ls=1, + ) + + lbfgs._strong_wolfe( + func3, + paddle.to_tensor([1.0]), + paddle.to_tensor([-0.001]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + max_ls=1, + ) + + lbfgs._cubic_interpolate( + paddle.to_tensor([2.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([2.0]), + paddle.to_tensor([0.0]), + [0.1, 0.5], + ) + + lbfgs._cubic_interpolate( + paddle.to_tensor([2.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([-3.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([-0.1]), + [0.1, 0.5], + ) + + def test_error3(self): + # test parameter shape size <= 0 + def error_func3(): + extream_point = np.array([-1, 2]).astype('float32') + extream_point = paddle.to_tensor(extream_point) + + def func(w, x): + return w * x + + net = Net(extream_point, func) + net.w = paddle.create_parameter( + shape=[-1, 2], + dtype=net.w.dtype, + ) + opt = lbfgs.LBFGS( + learning_rate=1, + max_iter=10, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=5, + line_search_fn='strong_wolfe', + parameters=net.parameters(), + ) + + self.assertRaises(AssertionError, error_func3) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lbfgs_v2.py b/python/paddle/fluid/tests/unittests/test_lbfgs_v2.py deleted file mode 100644 index 9617938967c..00000000000 --- a/python/paddle/fluid/tests/unittests/test_lbfgs_v2.py +++ /dev/null @@ -1,274 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np - -import paddle -from paddle.incubate.optimizer import LBFGS -from paddle.incubate.optimizer.line_search_dygraph import ( - _cubic_interpolate, - _strong_wolfe, -) - -np.random.seed(123) - -# func()should be func(w, x)where w is parameter to be optimize ,x is input of optimizer func -# np_w is the init parameter of w - - -class Net(paddle.nn.Layer): - def __init__(self, np_w, func): - super().__init__() - self.func = func - w = paddle.to_tensor(np_w) - self.w = paddle.create_parameter( - shape=w.shape, - dtype=w.dtype, - default_initializer=paddle.nn.initializer.Assign(w), - ) - - def forward(self, x): - return self.func(self.w, x) - - -def train_step(inputs, targets, net, opt): - def closure(): - outputs = net(inputs) - loss = paddle.nn.functional.mse_loss(outputs, targets) - opt.clear_grad() - loss.backward() - return loss - - loss = opt.step(closure) - return loss - - -class TestLbfgs(unittest.TestCase): - def test_function_fix(self): - paddle.disable_static() - np_w = np.random.rand(1).astype(np.float32) - - input = np.random.rand(1).astype(np.float32) - weights = [np.random.rand(1).astype(np.float32) for i in range(5)] - targets = [weights[i] * input for i in range(5)] - - def func(w, x): - return w * x - - net = Net(np_w, func) - opt = LBFGS( - lr=1, - max_iter=10, - max_eval=None, - tolerance_grad=1e-07, - tolerance_change=1e-09, - history_size=5, - line_search_fn='strong_wolfe', - parameters=net.parameters(), - ) - - for weight, target in zip(weights, targets): - input = paddle.to_tensor(input) - target = paddle.to_tensor(target) - loss = 1 - while loss > 1e-4: - loss = train_step(input, target, net, opt) - np.testing.assert_allclose(net.w, weight, rtol=1e-05) - - def test_inf_minima(self): - # not converage - input = np.random.rand(1).astype(np.float32) - - def outputs1(x): - # weight[0] = 1.01 weight[1] = 0.99 - return x * x * x - 3 * x * x + 3 * 1.01 * 0.99 * x - - def outputs2(x): - # weight[0] = 4 weight[1] = 2 - return pow(x, 4) + 5 * pow(x, 2) - - targets = [outputs1(input), outputs2(input)] - input = paddle.to_tensor(input) - - def func1(extream_point, x): - return ( - x * x * x - - 3 * x * x - + 3 * extream_point[0] * extream_point[1] * x - ) - - def func2(extream_point, x): - return pow(x, extream_point[0]) + 5 * pow(x, extream_point[1]) - - extream_point = np.array([-2.34, 1.45]).astype('float32') - net1 = Net(extream_point, func1) - # converge of old_sk.pop() - opt1 = LBFGS( - lr=1, - max_iter=10, - max_eval=None, - tolerance_grad=1e-07, - tolerance_change=1e-09, - history_size=1, - line_search_fn='strong_wolfe', - parameters=net1.parameters(), - ) - - net2 = Net(extream_point, func2) - # converge of line_search = None - opt2 = LBFGS( - lr=1, - max_iter=50, - max_eval=None, - tolerance_grad=1e-07, - tolerance_change=1e-09, - history_size=10, - line_search_fn=None, - parameters=net2.parameters(), - ) - - n_iter = 0 - while n_iter < 20: - loss = train_step(input, paddle.to_tensor(targets[0]), net1, opt1) - n_iter = opt1.state_dict()["state"]["func_evals"] - - n_iter = 0 - while n_iter < 10: - loss = train_step(input, paddle.to_tensor(targets[1]), net2, opt2) - n_iter = opt1.state_dict()["state"]["func_evals"] - - def test_error(self): - # test parameter is not Paddle Tensor - def error_func1(): - extream_point = np.array([-1, 2]).astype('float32') - extream_point = paddle.to_tensor(extream_point) - return LBFGS( - lr=1, - max_iter=10, - max_eval=None, - tolerance_grad=1e-07, - tolerance_change=1e-09, - history_size=3, - 
line_search_fn='strong_wolfe', - parameters=extream_point, - ) - - self.assertRaises(TypeError, error_func1) - - def test_error2(self): - # not converage - input = np.random.rand(1).astype(np.float32) - - def outputs2(x): - # weight[0] = 4 weight[1] = 2 - return pow(x, 4) + 5 * pow(x, 2) - - targets = [outputs2(input)] - input = paddle.to_tensor(input) - - def func2(extream_point, x): - return pow(x, extream_point[0]) + 5 * pow(x, extream_point[1]) - - extream_point = np.array([-2.34, 1.45]).astype('float32') - net2 = Net(extream_point, func2) - # converge of line_search = None - opt2 = LBFGS( - lr=1, - max_iter=50, - max_eval=None, - tolerance_grad=1e-07, - tolerance_change=1e-09, - history_size=10, - line_search_fn='None', - parameters=net2.parameters(), - ) - - def error_func(): - n_iter = 0 - while n_iter < 10: - loss = train_step( - input, paddle.to_tensor(targets[0]), net2, opt2 - ) - n_iter = opt2.state_dict()["state"]["func_evals"] - - self.assertRaises(RuntimeError, error_func) - - def test_line_search(self): - def func1(x, alpha, d): - return paddle.to_tensor(x + alpha * d), paddle.to_tensor([0.0]) - - def func2(x, alpha, d): - return paddle.to_tensor(x + alpha * d), paddle.to_tensor([1.0]) - - def func3(x, alpha, d): - return paddle.to_tensor(x + alpha * d), paddle.to_tensor([-1.0]) - - _strong_wolfe( - func1, - paddle.to_tensor([1.0]), - paddle.to_tensor([0.001]), - paddle.to_tensor([0.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([0.0]), - paddle.to_tensor([0.0]), - max_ls=0, - ) - - _strong_wolfe( - func2, - paddle.to_tensor([1.0]), - paddle.to_tensor([-0.001]), - paddle.to_tensor([1.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([1.0]), - max_ls=1, - ) - - _strong_wolfe( - func3, - paddle.to_tensor([1.0]), - paddle.to_tensor([-0.001]), - paddle.to_tensor([1.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([1.0]), - max_ls=1, - ) - - _cubic_interpolate( - paddle.to_tensor([2.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([0.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([2.0]), - paddle.to_tensor([0.0]), - [0.1, 0.5], - ) - - _cubic_interpolate( - paddle.to_tensor([2.0]), - paddle.to_tensor([0.0]), - paddle.to_tensor([-3.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([-0.1]), - [0.1, 0.5], - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/incubate/optimizer/lbfgs.py b/python/paddle/incubate/optimizer/lbfgs.py index 937a3b2f9af..ae7511ae03e 100644 --- a/python/paddle/incubate/optimizer/lbfgs.py +++ b/python/paddle/incubate/optimizer/lbfgs.py @@ -18,10 +18,12 @@ from functools import reduce import paddle from paddle.optimizer import Optimizer +from paddle.utils import deprecated from .line_search_dygraph import _strong_wolfe +@deprecated(since="2.5.0", update_to="paddle.optimizer.LBFGS", level=1) class LBFGS(Optimizer): r""" The L-BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function. @@ -40,7 +42,7 @@ class LBFGS(Optimizer): Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS). Args: - lr (float, optional): learning rate .The default value is 1. + learning_rate (float, optional): learning rate .The default value is 1. max_iter (int, optional): maximal number of iterations per optimization step. The default value is 20. 
max_eval (int, optional): maximal number of function evaluations per optimization @@ -97,7 +99,7 @@ class LBFGS(Optimizer): return self.w * x net = Net() - opt = LBFGS(lr=1, max_iter=1, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn='strong_wolfe', parameters=net.parameters()) + opt = LBFGS(learning_rate=1, max_iter=1, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn='strong_wolfe', parameters=net.parameters()) def train_step(inputs, targets): def closure(): outputs = net(inputs) @@ -118,7 +120,7 @@ class LBFGS(Optimizer): def __init__( self, - lr=1.0, + learning_rate=1.0, max_iter=20, max_eval=None, tolerance_grad=1e-7, @@ -133,7 +135,7 @@ class LBFGS(Optimizer): if max_eval is None: max_eval = max_iter * 5 // 4 - self.lr = lr + self.learning_rate = learning_rate self.max_iter = max_iter self.max_eval = max_eval self.tolerance_grad = tolerance_grad @@ -202,7 +204,7 @@ class LBFGS(Optimizer): def _add_grad(self, alpha, direction): offset = 0 for p in self._params: - numel = p.numel().item() + numel = reduce(lambda x, y: x * y, p.shape) p = paddle.assign( p.add( direction[offset : offset + numel].reshape(p.shape) * alpha @@ -234,11 +236,10 @@ class LBFGS(Optimizer): """ with paddle.no_grad(): - # Make sure the closure is always called with grad enabled closure = paddle.enable_grad()(closure) - lr = self.lr + learning_rate = self.learning_rate max_iter = self.max_iter max_eval = self.max_eval tolerance_grad = self.tolerance_grad @@ -342,9 +343,11 @@ class LBFGS(Optimizer): ############################################################ # reset initial guess for step size if state['n_iter'] == 1: - alpha = min(1.0, 1.0 / flat_grad.abs().sum()) * lr + alpha = ( + min(1.0, 1.0 / flat_grad.abs().sum()) * learning_rate + ) else: - alpha = lr + alpha = learning_rate # directional derivative gtd = flat_grad.dot(d) diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index cef51897b20..7d9737dc7da 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -22,6 +22,7 @@ from .adadelta import Adadelta # noqa: F401 from .sgd import SGD # noqa: F401 from .momentum import Momentum # noqa: F401 from .lamb import Lamb # noqa: F401 +from .lbfgs import LBFGS # noqa: F401 from . import lr # noqa: F401 __all__ = [ # noqa @@ -35,4 +36,5 @@ __all__ = [ # noqa 'SGD', 'Momentum', 'Lamb', + 'LBFGS', ] diff --git a/python/paddle/optimizer/lbfgs.py b/python/paddle/optimizer/lbfgs.py new file mode 100644 index 00000000000..aaa93354196 --- /dev/null +++ b/python/paddle/optimizer/lbfgs.py @@ -0,0 +1,701 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from collections import defaultdict +from functools import reduce + +import paddle + +from ..fluid import framework +from .optimizer import Optimizer + +__all__ = [] + + +def _cubic_interpolate(x1, f1, g1, x2, f2, g2, bounds=None): + r"""Cubic interpolation between (x1, f1, g1) and (x2, f2, g2). + Use two points and their gradients to determine a cubic function and get the minimum point + between them in the cubic curve. + + Reference: + Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. + pp59: formula 3.59 + + Args: + x1, f1, g1: point1's position, value and gradient. + x2, f2, g2: point2's position, value and gradient. + bounds: bounds of the interpolation area. + + Returns: + min_pos: the minimum point between the specified points in the cubic curve. + """ + # Compute bounds of interpolation area + if bounds is not None: + xmin_bound, xmax_bound = bounds + else: + xmin_bound, xmax_bound = (x1, x2) if x1 <= x2 else (x2, x1) + + d1 = g1 + g2 - 3 * (f1 - f2) / (x1 - x2) + d2_square = d1**2 - g1 * g2 + if d2_square >= 0: + d2 = d2_square.sqrt() + if x1 <= x2: + min_pos = x2 - (x2 - x1) * ((g2 + d2 - d1) / (g2 - g1 + 2 * d2)) + else: + min_pos = x1 - (x1 - x2) * ((g1 + d2 - d1) / (g1 - g2 + 2 * d2)) + return min(max(min_pos, xmin_bound), xmax_bound) + else: + return (xmin_bound + xmax_bound) / 2.0 + + +def _strong_wolfe( + obj_func, + xk, + alpha, + d, + loss, + grad, + gtd, + c1=1e-4, + c2=0.9, + tolerance_change=1e-9, + max_ls=25, +): + r"""Implements a line search algorithm that satisfies the strong Wolfe conditions using double zoom. + + Reference: + Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. + pp60: Algorithm 3.5 (Line Search Algorithm). + + Args: + obj_func: the objective function to minimize. It accepts a multivariate input and returns a scalar. + xk (Tensor): the starting point of the iterates. + alpha (Scalar): the initial step size. + d (Tensor): search direction. + loss (scalar): the initial loss. + grad (Tensor): the initial gradient. + c1 (Scalar): parameter for the sufficient decrease condition. + c2 (Scalar): parameter for the curvature condition. + tolerance_change (Scalar): terminates if the change of function value/position/parameter between + two iterations is smaller than this value. + max_ls (int): max number of line search iterations. + + Returns: + loss_new (Scalar): loss of obj_func at the final alpha. + grad_new (Tensor): derivative of obj_func at the final alpha. + alpha (Tensor): optimal step length, or 0 if the line search algorithm did not converge. + ls_func_evals (Scalar): number of objective function evaluations in the line search process. + + The following summarizes the essentials of the strong Wolfe line search algorithm. + Some notations used in the description: + + - `func` denotes the objective function. + - `obi_func` is a function of step size alpha, restricting `obj_func` on a line. + + obi_func = func(xk + alpha * d), + where xk is the position of the k'th iterate, d is the line search direction (descent direction), + and alpha is the step size. + - a : substitute for alpha. + - a1 is the alpha of the last iteration, which is alpha_(i-1). + - a2 is the alpha of the current iteration, which is alpha_i. + - a_lo is the alpha at the left endpoint when zoom is called, which is alpha_low. + - a_hi is the alpha at the right endpoint when zoom is called, which is alpha_high. + + Line Search Algorithm: + repeat + Compute obi_func(a2) and obi_func'(a2). + 1.
If obi_func(a2) > obi_func(0) + c_1 * a2 * obi_func'(0) or [obi_func(a2) >= obi_func(a1) and i > 1], + alpha= zoom(a1, a2) and stop; + + 2. If |obi_func'(a2)| <= -c_2 * obi_func'(0), + alpha= a2 and stop; + + 3. If obi_func'(a2) >= 0, + alpha= zoom(a2, a1) and stop; + + a1 = a2 + a2 = min(2 * a2, a2) + i = i + 1 + end(repeat) + + zoom(a_lo, a_hi) Algorithm: + repeat + aj = cubic_interpolation(a_lo, a_hi) + Compute obi_func(aj) and derphi(aj). + 1. If obi_func(aj) > obi_func(0) + c_1 * aj * obi_func'(0) or obi_func(aj) >= obi_func(a_lo), + then a_hi <- aj; + 2. + 2.1. If |obi_func'(aj)| <= -c_2 * obi_func'(0), then alpha= a2 and stop; + + 2.2. If obi_func'(aj) * (a2 - a1) >= 0, then a_hi = a_lo + + a_lo = aj; + end(repeat) + """ + + d_norm = d.abs().max() + grad = grad.clone() + # evaluate objective and gradient using initial step + loss_new, grad_new = obj_func(xk, alpha, d) + ls_func_evals = 1 + gtd_new = paddle.dot(grad_new, d) + + # bracket an interval containing a point satisfying the Wolfe criteria + t_prev, f_prev, g_prev, gtd_prev = ( + paddle.to_tensor(0, dtype=grad.dtype), + loss, + grad, + gtd, + ) + done = False + ls_iter = 0 + while ls_iter < max_ls: + # check conditions + if loss_new > (loss + c1 * alpha * gtd) or ( + ls_iter > 1 and loss_new >= f_prev + ): + bracket = [t_prev, alpha] + bracket_f = [f_prev, loss_new] + bracket_g = [g_prev, grad_new.clone()] + bracket_gtd = [gtd_prev, gtd_new] + break + + if paddle.abs(gtd_new) <= -c2 * gtd: + bracket = [alpha] + bracket_f = [loss_new] + bracket_g = [grad_new] + done = True + break + + if gtd_new >= 0: + bracket = [t_prev, alpha] + bracket_f = [f_prev, loss_new] + bracket_g = [g_prev, grad_new.clone()] + bracket_gtd = [gtd_prev, gtd_new] + break + + # interpolate + min_step = alpha + 0.01 * (alpha - t_prev) + max_step = alpha * 10 + tmp = alpha + alpha = _cubic_interpolate( + t_prev, + f_prev, + gtd_prev, + alpha, + loss_new, + gtd_new, + bounds=(min_step, max_step), + ) + + # next step + t_prev = tmp + f_prev = loss_new + g_prev = grad_new.clone() + gtd_prev = gtd_new + + loss_new, grad_new = obj_func(xk, alpha, d) + ls_func_evals += 1 + gtd_new = grad_new.dot(d) + ls_iter += 1 + + # reached max number of iterations? + if ls_iter == max_ls: + bracket = [0, alpha] + bracket_f = [loss, loss_new] + bracket_g = [grad, grad_new] + + # zoom phase: we now have a point satisfying the criteria, or + # a bracket around it. We refine the bracket until we find the + # exact point satisfying the criteria + insuf_progress = False + # find high and low points in bracket + low_pos, high_pos = (0, 1) if bracket_f[0] <= bracket_f[-1] else (1, 0) + while not done and ls_iter < max_ls: + # line-search bracket is so small + if paddle.abs(bracket[1] - bracket[0]) * d_norm < tolerance_change: + break + + # compute new trial value + alpha = _cubic_interpolate( + bracket[0], + bracket_f[0], + bracket_gtd[0], + bracket[1], + bracket_f[1], + bracket_gtd[1], + ) + + # test that we are making sufficient progress: + # in case `alpha` is so close to boundary, we mark that we are making + # insufficient progress, and if + # + we have made insufficient progress in the last step, or + # + `alpha` is at one of the boundary, + # we will move `alpha` to a position which is `0.1 * len(bracket)` + # away from the nearest boundary point. 
+ + eps = 0.1 * (max(bracket) - min(bracket)) + if min(max(bracket) - alpha, alpha - min(bracket)) < eps: + # interpolation close to boundary + if insuf_progress or alpha >= max(bracket) or alpha <= min(bracket): + # evaluate at 0.1 away from boundary + if paddle.abs(alpha - max(bracket)) < paddle.abs( + alpha - min(bracket) + ): + alpha = max(bracket) - eps + else: + alpha = min(bracket) + eps + insuf_progress = False + else: + insuf_progress = True + else: + insuf_progress = False + # Evaluate new point + loss_new, grad_new = obj_func(xk, alpha, d) + ls_func_evals += 1 + gtd_new = grad_new.dot(d) + ls_iter += 1 + + if ( + loss_new > (loss + c1 * alpha * gtd) + or loss_new >= bracket_f[low_pos] + ): + # Armijo condition not satisfied or not lower than lowest point + bracket[high_pos] = alpha + bracket_f[high_pos] = loss_new + # bracket_g[high_pos] = grad_new.clone(memory_format=torch.contiguous_format) + bracket_g[high_pos] = grad_new.clone() + bracket_gtd[high_pos] = gtd_new + low_pos, high_pos = ( + (0, 1) if bracket_f[0] <= bracket_f[1] else (1, 0) + ) + else: + if paddle.abs(gtd_new) <= -c2 * gtd: + # Wolfe conditions satisfied + done = True + elif gtd_new * (bracket[high_pos] - bracket[low_pos]) >= 0: + # old high becomes new low + bracket[high_pos] = bracket[low_pos] + bracket_f[high_pos] = bracket_f[low_pos] + bracket_g[high_pos] = bracket_g[low_pos] + bracket_gtd[high_pos] = bracket_gtd[low_pos] + + # new point becomes new low + bracket[low_pos] = alpha + bracket_f[low_pos] = loss_new + bracket_g[low_pos] = grad_new.clone() + bracket_gtd[low_pos] = gtd_new + + # return stuff + alpha = bracket[low_pos] + loss_new = bracket_f[low_pos] + grad_new = bracket_g[low_pos] + return loss_new, grad_new, alpha, ls_func_evals + + +class LBFGS(Optimizer): + r""" + The L-BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function. + Closely related is the Newton method for minimization. Consider the iterate update formula: + + .. math:: + x_{k+1} = x_{k} + H_k \nabla{f_k} + + If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method. + If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then + it's a quasi-Newton. In practice, the approximated Hessians are obtained + by only using the gradients, over either whole or part of the search + history, the former is BFGS, the latter is L-BFGS. + + Reference: + Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS). + + Args: + learning_rate (float, optional): learning rate .The default value is 1. + max_iter (int, optional): maximal number of iterations per optimization step. + The default value is 20. + max_eval (int, optional): maximal number of function evaluations per optimization + step. The default value is max_iter * 1.25. + tolerance_grad (float, optional): termination tolerance on first order optimality + The default value is 1e-5. + tolerance_change (float, optional): termination tolerance on function + value/parameter changes. The default value is 1e-9. + history_size (int, optional): update history size. The default value is 100. + line_search_fn (string, optional): either 'strong_wolfe' or None. The default value is strong_wolfe. + parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. The default value is None. 
+ weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ + It canbe a float value as coeff of L2 regularization or \ + :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. + If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ + the regularization setting here in optimizer will be ignored for this parameter. \ + Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of \ + some derived class of ``GradientClipBase`` . There are three cliping strategies \ + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , \ + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + + Return: + loss (Tensor): the final loss of closure. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + from paddle.incubate.optimizer import LBFGS + + paddle.disable_static() + np.random.seed(0) + np_w = np.random.rand(1).astype(np.float32) + np_x = np.random.rand(1).astype(np.float32) + + inputs = [np.random.rand(1).astype(np.float32) for i in range(10)] + # y = 2x + targets = [2 * x for x in inputs] + + class Net(paddle.nn.Layer): + def __init__(self): + super().__init__() + w = paddle.to_tensor(np_w) + self.w = paddle.create_parameter(shape=w.shape, dtype=w.dtype, default_initializer=paddle.nn.initializer.Assign(w)) + + def forward(self, x): + return self.w * x + + net = Net() + opt = LBFGS(learning_rate=1, max_iter=1, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn='strong_wolfe', parameters=net.parameters()) + def train_step(inputs, targets): + def closure(): + outputs = net(inputs) + loss = paddle.nn.functional.mse_loss(outputs, targets) + print('loss: ', loss.item()) + opt.clear_grad() + loss.backward() + return loss + opt.step(closure) + + + for input, target in zip(inputs, targets): + input = paddle.to_tensor(input) + target = paddle.to_tensor(target) + train_step(input, target) + + """ + + def __init__( + self, + learning_rate=1.0, + max_iter=20, + max_eval=None, + tolerance_grad=1e-7, + tolerance_change=1e-9, + history_size=100, + line_search_fn=None, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None, + ): + if max_eval is None: + max_eval = max_iter * 5 // 4 + + self.learning_rate = learning_rate + self.max_iter = max_iter + self.max_eval = max_eval + self.tolerance_grad = tolerance_grad + self.tolerance_change = tolerance_change + self.history_size = history_size + self.line_search_fn = line_search_fn + + if isinstance(parameters, paddle.Tensor): + raise TypeError( + "parameters argument given to the optimizer should be " + "an iterable of Tensors or dicts, but got " + type(parameters) + ) + + self.state = defaultdict(dict) + + super().__init__( + learning_rate=1.0, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name, + ) + + if not isinstance(self._parameter_list[0], dict): + self._params = self._parameter_list + else: + for idx, param_group in enumerate(self._param_groups): + self._params = param_group['params'] + + self._numel_cache = None + + def state_dict(self): + r"""Returns the state 
of the optimizer as a :class:`dict`. + + Return: + state, a dict holding current optimization state. Its content + differs between optimizer classes. + """ + + packed_state = {} + for k, v in self.state.items(): + packed_state.update({k: v}) + + return {'state': packed_state} + + def _numel(self): + # compute the number of all parameters + if self._numel_cache is None: + self._numel_cache = reduce( + lambda total, p: total + p.numel(), self._params, 0 + ) + return self._numel_cache + + # flatten grad of all parameters + def _gather_flat_grad(self): + views = [] + for p in self._params: + if p.grad is None: + view = paddle.zeros_like(p).reshape([-1]) + else: + view = p.grad.reshape([-1]) + views.append(view) + return paddle.concat(views, axis=0) + + # compute xk = xk + alpha * direction + def _add_grad(self, alpha, direction): + offset = 0 + for p in self._params: + numel = reduce(lambda x, y: x * y, p.shape) + p = paddle.assign( + p.add( + direction[offset : offset + numel].reshape(p.shape) * alpha + ), + p, + ) + offset += numel + assert offset == self._numel() + + def _clone_param(self): + return [p.clone() for p in self._params] + + def _set_param(self, params_data): + for p, pdata in zip(self._params, params_data): + paddle.assign(pdata, p) + + def _directional_evaluate(self, closure, x, alpha, d): + self._add_grad(alpha, d) + loss = float(closure()) + flat_grad = self._gather_flat_grad() + self._set_param(x) + return loss, flat_grad + + @framework.non_static_only + def step(self, closure): + """Performs a single optimization step. + Args: + closure (callable): A closure that reevaluates the model + and returns the loss. + """ + + with paddle.no_grad(): + # Make sure the closure is always called with grad enabled + closure = paddle.enable_grad()(closure) + + learning_rate = self.learning_rate + max_iter = self.max_iter + max_eval = self.max_eval + tolerance_grad = self.tolerance_grad + tolerance_change = self.tolerance_change + line_search_fn = self.line_search_fn + history_size = self.history_size + state = self.state + state.setdefault('func_evals', 0) + state.setdefault('n_iter', 0) + + # evaluate initial f(x) and df/dx + orig_loss = closure() + loss = float(orig_loss) + + current_evals = 1 + state['func_evals'] += 1 + + flat_grad = self._gather_flat_grad() + opt_cond = flat_grad.abs().max() <= tolerance_grad + + # optimal condition + if opt_cond: + return orig_loss + + # tensors cached in state (for tracing) + d = state.get('d') + alpha = state.get('alpha') + old_yk = state.get('old_yk') + old_sk = state.get('old_sk') + ro = state.get('ro') + H_diag = state.get('H_diag') + prev_flat_grad = state.get('prev_flat_grad') + prev_loss = state.get('prev_loss') + + n_iter = 0 + # optimize for a max of max_iter iterations + while n_iter < max_iter: + # keep track of nb of iterations + n_iter += 1 + state['n_iter'] += 1 + + ############################################################ + # compute gradient descent direction + ############################################################ + if state['n_iter'] == 1: + d = flat_grad.neg() + old_yk = [] + old_sk = [] + ro = [] + H_diag = paddle.to_tensor(1.0, dtype=orig_loss.dtype) + else: + # do lbfgs update (update memory) + y = flat_grad.subtract(prev_flat_grad) + s = d.multiply(paddle.to_tensor(alpha, dtype=d.dtype)) + ys = y.dot(s) + if ys > 1e-10: + # updating memory + if len(old_yk) == history_size: + # shift history by one (limited-memory) + old_yk.pop(0) + old_sk.pop(0) + ro.pop(0) + + # store new direction/step + old_yk.append(y) + 
old_sk.append(s) + ro.append(1.0 / ys) + + # update scale of initial Hessian approximation + H_diag = ys / y.dot(y) # (y*y) + + # compute the approximate (L-BFGS) inverse Hessian + # multiplied by the gradient + num_old = len(old_yk) + + if 'al' not in state: + state['al'] = [None] * history_size + al = state['al'] + + # iteration in L-BFGS loop collapsed to use just one buffer + q = flat_grad.neg() + for i in range(num_old - 1, -1, -1): + al[i] = old_sk[i].dot(q) * ro[i] + paddle.assign(q.add(old_yk[i] * (-al[i])), q) + + # multiply by initial Hessian + # r/d is the final direction + d = r = paddle.multiply(q, H_diag) + for i in range(num_old): + be_i = old_yk[i].dot(r) * ro[i] + paddle.assign(r.add(old_sk[i] * (al[i] - be_i)), r) + + if prev_flat_grad is None: + prev_flat_grad = flat_grad.clone() + else: + paddle.assign(flat_grad, prev_flat_grad) + prev_loss = loss + + ############################################################ + # compute step length + ############################################################ + # reset initial guess for step size + if state['n_iter'] == 1: + alpha = ( + min(1.0, 1.0 / flat_grad.abs().sum()) * learning_rate + ) + else: + alpha = learning_rate + + # directional derivative + gtd = flat_grad.dot(d) + + # directional derivative is below tolerance + if gtd > -tolerance_change: + break + + # optional line search: user function + ls_func_evals = 0 + if line_search_fn is not None: + # perform line search, using user function + if line_search_fn != "strong_wolfe": + raise RuntimeError("only 'strong_wolfe' is supported") + else: + x_init = self._clone_param() + + def obj_func(x, alpha, d): + return self._directional_evaluate( + closure, x, alpha, d + ) + + loss, flat_grad, alpha, ls_func_evals = _strong_wolfe( + obj_func, x_init, alpha, d, loss, flat_grad, gtd + ) + self._add_grad(alpha, d) + opt_cond = flat_grad.abs().max() <= tolerance_grad + else: + # no line search, simply move with fixed-step + self._add_grad(alpha, d) + if n_iter != max_iter: + with paddle.enable_grad(): + loss = float(closure()) + flat_grad = self._gather_flat_grad() + opt_cond = flat_grad.abs().max() <= tolerance_grad + ls_func_evals = 1 + + # update func eval + current_evals += ls_func_evals + state['func_evals'] += ls_func_evals + + # optimal condition + if opt_cond: + break + + # lack of progress + if (d * alpha).abs().max() <= tolerance_change: + break + + if abs(loss - prev_loss) < tolerance_change: + break + + # check conditions + if current_evals >= max_eval: + break + + if n_iter == max_iter: + break + + state['d'] = d + state['alpha'] = alpha + state['old_yk'] = old_yk + state['old_sk'] = old_sk + state['ro'] = ro + state['H_diag'] = H_diag + state['prev_flat_grad'] = prev_flat_grad + state['prev_loss'] = prev_loss + + return orig_loss -- GitLab
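For reviewers who want to try the new public API end to end, below is a minimal usage sketch of paddle.optimizer.LBFGS, adapted from the docstring example added in this patch. The single-weight linear model and the y = 2x data are illustrative only; note the import comes from the new paddle.optimizer module rather than paddle.incubate.optimizer.

import numpy as np
import paddle
from paddle.optimizer import LBFGS  # new API introduced by this patch

paddle.disable_static()
np.random.seed(0)

# Toy data: y = 2 * x, to be fitted by a single scalar weight.
inputs = [np.random.rand(1).astype(np.float32) for _ in range(10)]
targets = [2 * x for x in inputs]


class Net(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        w = paddle.to_tensor(np.random.rand(1).astype(np.float32))
        self.w = paddle.create_parameter(
            shape=w.shape,
            dtype=w.dtype,
            default_initializer=paddle.nn.initializer.Assign(w),
        )

    def forward(self, x):
        return self.w * x


net = Net()
opt = LBFGS(
    learning_rate=1,
    max_iter=10,
    history_size=100,
    line_search_fn='strong_wolfe',
    parameters=net.parameters(),
)


def train_step(x, y):
    # LBFGS.step() expects a closure that re-evaluates the loss and its gradients.
    def closure():
        loss = paddle.nn.functional.mse_loss(net(x), y)
        opt.clear_grad()
        loss.backward()
        return loss

    return opt.step(closure)


for x, y in zip(inputs, targets):
    loss = train_step(paddle.to_tensor(x), paddle.to_tensor(y))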