Unverified commit 8386417e authored by lijialin03, committed by GitHub

add paddle.optimizer.LBFGS API and modify its test case test=develop (#51912)

* modify numel in lbfgs and add a new test case. test=develop

* change param 'lr' to 'learning_rate' in lbfgs and its test

* add opt LBFGS and change test
Parent: f9e5072b
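For orientation, a minimal usage sketch of the API this commit adds; the toy layer, data, and training loop below are illustrative only, and the keyword names follow the diff and docstring further down.

import numpy as np
import paddle

# toy model and data, purely for illustration
net = paddle.nn.Linear(1, 1)
x = paddle.to_tensor(np.random.rand(4, 1).astype('float32'))
y = 3.0 * x

# the new public optimizer; note the keyword is 'learning_rate'
# (the incubate version previously accepted 'lr')
opt = paddle.optimizer.LBFGS(
    learning_rate=1.0,
    max_iter=20,
    line_search_fn='strong_wolfe',
    parameters=net.parameters(),
)

def closure():
    loss = paddle.nn.functional.mse_loss(net(x), y)
    opt.clear_grad()
    loss.backward()
    return loss

for _ in range(5):
    opt.step(closure)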
@@ -17,11 +17,9 @@ import unittest
import numpy as np
import paddle
from paddle.incubate.optimizer import LBFGS
from paddle.incubate.optimizer.line_search_dygraph import (
_cubic_interpolate,
_strong_wolfe,
)
from paddle.incubate.optimizer import lbfgs as incubate_lbfgs
from paddle.incubate.optimizer import line_search_dygraph
from paddle.optimizer import lbfgs
np.random.seed(123)
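# The test methods below drive LBFGS through a small Net layer and a
# train_step(inputs, targets, net, opt) helper that are defined earlier in this
# file and elided from the diff. A minimal, hypothetical reconstruction based on
# how the tests call them (the real bodies may differ):

class Net(paddle.nn.Layer):
    def __init__(self, np_w, func):
        super().__init__()
        self.w = paddle.create_parameter(
            shape=np_w.shape,
            dtype='float32',
            default_initializer=paddle.nn.initializer.Assign(np_w),
        )
        self.func = func

    def forward(self, x):
        # self.w is what the tests compare against the target weight
        return self.func(self.w, x)


def train_step(inputs, targets, net, opt):
    def closure():
        outputs = net(inputs)
        loss = paddle.nn.functional.mse_loss(outputs, targets)
        opt.clear_grad()
        loss.backward()
        return loss

    # LBFGS.step(closure) re-evaluates the closure as often as the line search needs
    return opt.step(closure)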
@@ -57,6 +55,256 @@ def train_step(inputs, targets, net, opt):
class TestLbfgs(unittest.TestCase):
def test_function_fix_incubate(self):
paddle.disable_static()
np_w = np.random.rand(1).astype(np.float32)
input = np.random.rand(1).astype(np.float32)
weights = [np.random.rand(1).astype(np.float32) for i in range(5)]
targets = [weights[i] * input for i in range(5)]
def func(w, x):
return w * x
net = Net(np_w, func)
opt = incubate_lbfgs.LBFGS(
learning_rate=1,
max_iter=10,
max_eval=None,
tolerance_grad=1e-07,
tolerance_change=1e-09,
history_size=5,
line_search_fn='strong_wolfe',
parameters=net.parameters(),
)
for weight, target in zip(weights, targets):
input = paddle.to_tensor(input)
target = paddle.to_tensor(target)
loss = 1
while loss > 1e-4:
loss = train_step(input, target, net, opt)
np.testing.assert_allclose(net.w, weight, rtol=1e-05)
def test_inf_minima_incubate(self):
# not converge
input = np.random.rand(1).astype(np.float32)
def outputs1(x):
# weight[0] = 1.01 weight[1] = 0.99
return x * x * x - 3 * x * x + 3 * 1.01 * 0.99 * x
def outputs2(x):
# weight[0] = 4 weight[1] = 2
return pow(x, 4) + 5 * pow(x, 2)
targets = [outputs1(input), outputs2(input)]
input = paddle.to_tensor(input)
def func1(extream_point, x):
return (
x * x * x
- 3 * x * x
+ 3 * extream_point[0] * extream_point[1] * x
)
def func2(extream_point, x):
return pow(x, extream_point[0]) + 5 * pow(x, extream_point[1])
extream_point = np.array([-2.34, 1.45]).astype('float32')
net1 = Net(extream_point, func1)
# coverage of old_sk.pop()
opt1 = incubate_lbfgs.LBFGS(
learning_rate=1,
max_iter=10,
max_eval=None,
tolerance_grad=1e-07,
tolerance_change=1e-09,
history_size=1,
line_search_fn='strong_wolfe',
parameters=net1.parameters(),
)
net2 = Net(extream_point, func2)
# coverage of line_search = None
opt2 = incubate_lbfgs.LBFGS(
learning_rate=1,
max_iter=50,
max_eval=None,
tolerance_grad=1e-07,
tolerance_change=1e-09,
history_size=10,
line_search_fn=None,
parameters=net2.parameters(),
)
n_iter = 0
while n_iter < 20:
loss = train_step(input, paddle.to_tensor(targets[0]), net1, opt1)
n_iter = opt1.state_dict()["state"]["func_evals"]
n_iter = 0
while n_iter < 10:
loss = train_step(input, paddle.to_tensor(targets[1]), net2, opt2)
n_iter = opt1.state_dict()["state"]["func_evals"]
def test_error_incubate(self):
# test parameter is not a Paddle Tensor
def error_func1():
extream_point = np.array([-1, 2]).astype('float32')
extream_point = paddle.to_tensor(extream_point)
return incubate_lbfgs.LBFGS(
learning_rate=1,
max_iter=10,
max_eval=None,
tolerance_grad=1e-07,
tolerance_change=1e-09,
history_size=3,
line_search_fn='strong_wolfe',
parameters=extream_point,
)
self.assertRaises(TypeError, error_func1)
def test_error2_incubate(self):
# not converge
input = np.random.rand(1).astype(np.float32)
def outputs2(x):
# weight[0] = 4 weight[1] = 2
return pow(x, 4) + 5 * pow(x, 2)
targets = [outputs2(input)]
input = paddle.to_tensor(input)
def func2(extream_point, x):
return pow(x, extream_point[0]) + 5 * pow(x, extream_point[1])
extream_point = np.array([-2.34, 1.45]).astype('float32')
net2 = Net(extream_point, func2)
# coverage of line_search = None
opt2 = incubate_lbfgs.LBFGS(
learning_rate=1,
max_iter=50,
max_eval=None,
tolerance_grad=1e-07,
tolerance_change=1e-09,
history_size=10,
line_search_fn='None',
parameters=net2.parameters(),
)
def error_func():
n_iter = 0
while n_iter < 10:
loss = train_step(
input, paddle.to_tensor(targets[0]), net2, opt2
)
n_iter = opt2.state_dict()["state"]["func_evals"]
self.assertRaises(RuntimeError, error_func)
def test_line_search_incubate(self):
def func1(x, alpha, d):
return paddle.to_tensor(x + alpha * d), paddle.to_tensor([0.0])
def func2(x, alpha, d):
return paddle.to_tensor(x + alpha * d), paddle.to_tensor([1.0])
def func3(x, alpha, d):
return paddle.to_tensor(x + alpha * d), paddle.to_tensor([-1.0])
line_search_dygraph._strong_wolfe(
func1,
paddle.to_tensor([1.0]),
paddle.to_tensor([0.001]),
paddle.to_tensor([1.0]),
paddle.to_tensor([1.0]),
paddle.to_tensor([0.0]),
paddle.to_tensor([0.0]),
max_ls=1,
)
line_search_dygraph._strong_wolfe(
func1,
paddle.to_tensor([1.0]),
paddle.to_tensor([0.001]),
paddle.to_tensor([0.0]),
paddle.to_tensor([1.0]),
paddle.to_tensor([0.0]),
paddle.to_tensor([0.0]),
max_ls=0,
)
line_search_dygraph._strong_wolfe(
func2,
paddle.to_tensor([1.0]),
paddle.to_tensor([-0.001]),
paddle.to_tensor([1.0]),
paddle.to_tensor([1.0]),
paddle.to_tensor([1.0]),
paddle.to_tensor([1.0]),
max_ls=1,
)
line_search_dygraph._strong_wolfe(
func3,
paddle.to_tensor([1.0]),
paddle.to_tensor([-0.001]),
paddle.to_tensor([1.0]),
paddle.to_tensor([1.0]),
paddle.to_tensor([1.0]),
paddle.to_tensor([1.0]),
max_ls=1,
)
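# For reference, the strong Wolfe conditions that _strong_wolfe enforces on a
# trial step alpha along direction d; a plain-float sketch with the usual
# sufficient-decrease (c1) and curvature (c2) constants. The exact defaults
# used by the implementation are an assumption here.
def satisfies_strong_wolfe(f0, gtd0, f_new, gtd_new, alpha, c1=1e-4, c2=0.9):
    # f0, gtd0: loss and directional derivative at the starting point
    # f_new, gtd_new: loss and directional derivative at x + alpha * d
    sufficient_decrease = f_new <= f0 + c1 * alpha * gtd0
    curvature = abs(gtd_new) <= c2 * abs(gtd0)
    return sufficient_decrease and curvature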
line_search_dygraph._cubic_interpolate(
paddle.to_tensor([2.0]),
paddle.to_tensor([1.0]),
paddle.to_tensor([0.0]),
paddle.to_tensor([1.0]),
paddle.to_tensor([2.0]),
paddle.to_tensor([0.0]),
[0.1, 0.5],
)
line_search_dygraph._cubic_interpolate(
paddle.to_tensor([2.0]),
paddle.to_tensor([0.0]),
paddle.to_tensor([-3.0]),
paddle.to_tensor([1.0]),
paddle.to_tensor([1.0]),
paddle.to_tensor([-0.1]),
[0.1, 0.5],
)
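# _cubic_interpolate picks a trial step inside the given bounds by fitting a
# cubic to two (position, value, derivative) triples. A minimal plain-float
# sketch of the standard formula (Nocedal & Wright, Numerical Optimization),
# assuming the same argument order as the calls above:
def cubic_interpolate(x1, f1, g1, x2, f2, g2, bounds):
    xmin_bound, xmax_bound = bounds
    d1 = g1 + g2 - 3 * (f1 - f2) / (x1 - x2)
    d2_square = d1 * d1 - g1 * g2
    if d2_square < 0:
        # the cubic has no real minimizer: fall back to the midpoint
        return (xmin_bound + xmax_bound) / 2.0
    d2 = d2_square ** 0.5
    if x1 <= x2:
        min_pos = x2 - (x2 - x1) * ((g2 + d2 - d1) / (g2 - g1 + 2 * d2))
    else:
        min_pos = x1 - (x1 - x2) * ((g1 + d2 - d1) / (g1 - g2 + 2 * d2))
    return min(max(min_pos, xmin_bound), xmax_bound)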
def test_error3_incubate(self):
# test parameter shape size <= 0
def error_func3():
extream_point = np.array([-1, 2]).astype('float32')
extream_point = paddle.to_tensor(extream_point)
def func(w, x):
return w * x
net = Net(extream_point, func)
net.w = paddle.create_parameter(
shape=[-1, 2],
dtype=net.w.dtype,
)
opt = incubate_lbfgs.LBFGS(
learning_rate=1,
max_iter=10,
max_eval=None,
tolerance_grad=1e-07,
tolerance_change=1e-09,
history_size=5,
line_search_fn='strong_wolfe',
parameters=net.parameters(),
)
self.assertRaises(AssertionError, error_func3)
def test_function_fix(self):
paddle.disable_static()
np_w = np.random.rand(1).astype(np.float32)
@@ -69,8 +317,8 @@ class TestLbfgs(unittest.TestCase):
return w * x
net = Net(np_w, func)
opt = LBFGS(
lr=1,
opt = lbfgs.LBFGS(
learning_rate=1,
max_iter=10,
max_eval=None,
tolerance_grad=1e-07,
@@ -116,8 +364,8 @@ class TestLbfgs(unittest.TestCase):
extream_point = np.array([-2.34, 1.45]).astype('float32')
net1 = Net(extream_point, func1)
# coverage of old_sk.pop()
opt1 = LBFGS(
lr=1,
opt1 = lbfgs.LBFGS(
learning_rate=1,
max_iter=10,
max_eval=None,
tolerance_grad=1e-07,
@@ -129,8 +377,8 @@ class TestLbfgs(unittest.TestCase):
net2 = Net(extream_point, func2)
# coverage of line_search = None
opt2 = LBFGS(
lr=1,
opt2 = lbfgs.LBFGS(
learning_rate=1,
max_iter=50,
max_eval=None,
tolerance_grad=1e-07,
@@ -155,8 +403,8 @@ class TestLbfgs(unittest.TestCase):
def error_func1():
extream_point = np.array([-1, 2]).astype('float32')
extream_point = paddle.to_tensor(extream_point)
return LBFGS(
lr=1,
return lbfgs.LBFGS(
learning_rate=1,
max_iter=10,
max_eval=None,
tolerance_grad=1e-07,
@@ -185,8 +433,8 @@ class TestLbfgs(unittest.TestCase):
extream_point = np.array([-2.34, 1.45]).astype('float32')
net2 = Net(extream_point, func2)
# coverage of line_search = None
opt2 = LBFGS(
lr=1,
opt2 = lbfgs.LBFGS(
learning_rate=1,
max_iter=50,
max_eval=None,
tolerance_grad=1e-07,
@@ -216,7 +464,18 @@ class TestLbfgs(unittest.TestCase):
def func3(x, alpha, d):
return paddle.to_tensor(x + alpha * d), paddle.to_tensor([-1.0])
_strong_wolfe(
lbfgs._strong_wolfe(
func1,
paddle.to_tensor([1.0]),
paddle.to_tensor([0.001]),
paddle.to_tensor([1.0]),
paddle.to_tensor([1.0]),
paddle.to_tensor([0.0]),
paddle.to_tensor([0.0]),
max_ls=1,
)
lbfgs._strong_wolfe(
func1,
paddle.to_tensor([1.0]),
paddle.to_tensor([0.001]),
@@ -227,7 +486,7 @@ class TestLbfgs(unittest.TestCase):
max_ls=0,
)
_strong_wolfe(
lbfgs._strong_wolfe(
func2,
paddle.to_tensor([1.0]),
paddle.to_tensor([-0.001]),
@@ -238,7 +497,7 @@ class TestLbfgs(unittest.TestCase):
max_ls=1,
)
_strong_wolfe(
lbfgs._strong_wolfe(
func3,
paddle.to_tensor([1.0]),
paddle.to_tensor([-0.001]),
@@ -249,7 +508,7 @@ class TestLbfgs(unittest.TestCase):
max_ls=1,
)
_cubic_interpolate(
lbfgs._cubic_interpolate(
paddle.to_tensor([2.0]),
paddle.to_tensor([1.0]),
paddle.to_tensor([0.0]),
@@ -259,7 +518,7 @@ class TestLbfgs(unittest.TestCase):
[0.1, 0.5],
)
_cubic_interpolate(
lbfgs._cubic_interpolate(
paddle.to_tensor([2.0]),
paddle.to_tensor([0.0]),
paddle.to_tensor([-3.0]),
@@ -269,6 +528,33 @@ class TestLbfgs(unittest.TestCase):
[0.1, 0.5],
)
def test_error3(self):
# test parameter shape size <= 0
def error_func3():
extream_point = np.array([-1, 2]).astype('float32')
extream_point = paddle.to_tensor(extream_point)
def func(w, x):
return w * x
net = Net(extream_point, func)
net.w = paddle.create_parameter(
shape=[-1, 2],
dtype=net.w.dtype,
)
opt = lbfgs.LBFGS(
learning_rate=1,
max_iter=10,
max_eval=None,
tolerance_grad=1e-07,
tolerance_change=1e-09,
history_size=5,
line_search_fn='strong_wolfe',
parameters=net.parameters(),
)
self.assertRaises(AssertionError, error_func3)
if __name__ == '__main__':
unittest.main()
@@ -18,10 +18,12 @@ from functools import reduce
import paddle
from paddle.optimizer import Optimizer
from paddle.utils import deprecated
from .line_search_dygraph import _strong_wolfe
@deprecated(since="2.5.0", update_to="paddle.optimizer.LBFGS", level=1)
class LBFGS(Optimizer):
r"""
The L-BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function.
@@ -40,7 +42,7 @@ class LBFGS(Optimizer):
Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS).
Args:
lr (float, optional): learning rate. The default value is 1.
learning_rate (float, optional): learning rate. The default value is 1.
max_iter (int, optional): maximal number of iterations per optimization step.
The default value is 20.
max_eval (int, optional): maximal number of function evaluations per optimization
@@ -97,7 +99,7 @@ class LBFGS(Optimizer):
return self.w * x
net = Net()
opt = LBFGS(lr=1, max_iter=1, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn='strong_wolfe', parameters=net.parameters())
opt = LBFGS(learning_rate=1, max_iter=1, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn='strong_wolfe', parameters=net.parameters())
def train_step(inputs, targets):
def closure():
outputs = net(inputs)
@@ -118,7 +120,7 @@ class LBFGS(Optimizer):
def __init__(
self,
lr=1.0,
learning_rate=1.0,
max_iter=20,
max_eval=None,
tolerance_grad=1e-7,
@@ -133,7 +135,7 @@ class LBFGS(Optimizer):
if max_eval is None:
max_eval = max_iter * 5 // 4
self.lr = lr
self.learning_rate = learning_rate
self.max_iter = max_iter
self.max_eval = max_eval
self.tolerance_grad = tolerance_grad
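# With the defaults shown in the signature above, an unset max_eval resolves to
# max_iter * 5 // 4 = 20 * 5 // 4 = 25 closure evaluations per optimizer step.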
@@ -202,7 +204,7 @@ class LBFGS(Optimizer):
def _add_grad(self, alpha, direction):
offset = 0
for p in self._params:
numel = p.numel().item()
numel = reduce(lambda x, y: x * y, p.shape)
p = paddle.assign(
p.add(
direction[offset : offset + numel].reshape(p.shape) * alpha
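# The new numel computation folds the parameter's static shape instead of
# calling p.numel().item(); a quick equivalence check with a hypothetical shape:
#
#     from functools import reduce
#     shape = [3, 4, 5]
#     assert reduce(lambda a, b: a * b, shape) == 60  # same count p.numel() returns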
@@ -234,11 +236,10 @@ class LBFGS(Optimizer):
"""
with paddle.no_grad():
# Make sure the closure is always called with grad enabled
closure = paddle.enable_grad()(closure)
lr = self.lr
learning_rate = self.learning_rate
max_iter = self.max_iter
max_eval = self.max_eval
tolerance_grad = self.tolerance_grad
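# step() runs under paddle.no_grad(), so the user closure is re-wrapped with
# paddle.enable_grad() to keep its forward/backward pass differentiable.
# A standalone sketch of the pattern (toy tensor, illustrative only):
#
#     x = paddle.to_tensor([2.0], stop_gradient=False)
#
#     def closure():
#         loss = (x * x).sum()
#         loss.backward()
#         return loss
#
#     with paddle.no_grad():
#         loss = paddle.enable_grad()(closure)()  # gradients are still recorded
#     # x.grad is now [4.], even though the call happened inside no_grad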
@@ -342,9 +343,11 @@ class LBFGS(Optimizer):
############################################################
# reset initial guess for step size
if state['n_iter'] == 1:
alpha = min(1.0, 1.0 / flat_grad.abs().sum()) * lr
alpha = (
min(1.0, 1.0 / flat_grad.abs().sum()) * learning_rate
)
else:
alpha = lr
alpha = learning_rate
# directional derivative
gtd = flat_grad.dot(d)
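# On the first iteration the trial step is damped by the L1 norm of the
# flattened gradient, so a large initial gradient yields a small first step.
# A quick numeric illustration with a made-up gradient:
#
#     flat_grad = paddle.to_tensor([3.0, -4.0, 1.0])
#     alpha = min(1.0, 1.0 / flat_grad.abs().sum()) * 1.0  # learning_rate = 1.0
#     # alpha == 0.125 == 1 / (3 + 4 + 1)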
......
@@ -22,6 +22,7 @@ from .adadelta import Adadelta # noqa: F401
from .sgd import SGD # noqa: F401
from .momentum import Momentum # noqa: F401
from .lamb import Lamb # noqa: F401
from .lbfgs import LBFGS # noqa: F401
from . import lr # noqa: F401
__all__ = [ # noqa
@@ -35,4 +36,5 @@ __all__ = [ # noqa
'SGD',
'Momentum',
'Lamb',
'LBFGS',
]
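# With the new export above, the optimizer can also be imported directly:
#
#     from paddle.optimizer import LBFGS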
This diff is collapsed.