Unverified. Commit 8386417e authored by lijialin03 and committed by GitHub.

add paddle.optimizer.LBFGS API and modify its test case test=develop (#51912)

* modify numel in lbfgs and add a new test case. test=develop

* change param 'lr' to 'learning_rate' in lbfgs and its test

* add opt LBFGS and change test
Parent commit: f9e5072b
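In short, this commit promotes LBFGS from `paddle.incubate.optimizer` to `paddle.optimizer` and renames its `lr` argument to `learning_rate`. A minimal usage sketch of the new public API (the toy model and data here are illustrative; the closure-based `step()` follows the docstring example updated further down in this diff):

```python
import paddle

# Toy model with a single learnable weight.
class Net(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.w = paddle.create_parameter(shape=[1], dtype='float32')

    def forward(self, x):
        return self.w * x

net = Net()
opt = paddle.optimizer.LBFGS(
    learning_rate=1,  # renamed from 'lr' by this commit
    max_iter=10,
    line_search_fn='strong_wolfe',
    parameters=net.parameters(),
)

inputs = paddle.rand([4, 1])
targets = 3.0 * inputs

def closure():
    # L-BFGS may re-evaluate the objective during line search,
    # so step() takes a closure that recomputes loss and gradients.
    opt.clear_grad()
    loss = paddle.nn.functional.mse_loss(net(inputs), targets)
    loss.backward()
    return loss

for _ in range(5):
    opt.step(closure)
```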
@@ -17,11 +17,9 @@ import unittest
 import numpy as np
 import paddle
-from paddle.incubate.optimizer import LBFGS
-from paddle.incubate.optimizer.line_search_dygraph import (
-    _cubic_interpolate,
-    _strong_wolfe,
-)
+from paddle.incubate.optimizer import lbfgs as incubate_lbfgs
+from paddle.incubate.optimizer import line_search_dygraph
+from paddle.optimizer import lbfgs
 
 np.random.seed(123)
@@ -57,6 +55,256 @@ def train_step(inputs, targets, net, opt):
 
 class TestLbfgs(unittest.TestCase):
+    def test_function_fix_incubate(self):
+        paddle.disable_static()
+        np_w = np.random.rand(1).astype(np.float32)
+
+        input = np.random.rand(1).astype(np.float32)
+        weights = [np.random.rand(1).astype(np.float32) for i in range(5)]
+        targets = [weights[i] * input for i in range(5)]
+
+        def func(w, x):
+            return w * x
+
+        net = Net(np_w, func)
+        opt = incubate_lbfgs.LBFGS(
+            learning_rate=1,
+            max_iter=10,
+            max_eval=None,
+            tolerance_grad=1e-07,
+            tolerance_change=1e-09,
+            history_size=5,
+            line_search_fn='strong_wolfe',
+            parameters=net.parameters(),
+        )
+
+        for weight, target in zip(weights, targets):
+            input = paddle.to_tensor(input)
+            target = paddle.to_tensor(target)
+            loss = 1
+            while loss > 1e-4:
+                loss = train_step(input, target, net, opt)
+            np.testing.assert_allclose(net.w, weight, rtol=1e-05)
+
+    def test_inf_minima_incubate(self):
+        # does not converge
+        input = np.random.rand(1).astype(np.float32)
+
+        def outputs1(x):
+            # weight[0] = 1.01, weight[1] = 0.99
+            return x * x * x - 3 * x * x + 3 * 1.01 * 0.99 * x
+
+        def outputs2(x):
+            # weight[0] = 4, weight[1] = 2
+            return pow(x, 4) + 5 * pow(x, 2)
+
+        targets = [outputs1(input), outputs2(input)]
+        input = paddle.to_tensor(input)
+
+        def func1(extream_point, x):
+            return (
+                x * x * x
+                - 3 * x * x
+                + 3 * extream_point[0] * extream_point[1] * x
+            )
+
+        def func2(extream_point, x):
+            return pow(x, extream_point[0]) + 5 * pow(x, extream_point[1])
+
+        extream_point = np.array([-2.34, 1.45]).astype('float32')
+        net1 = Net(extream_point, func1)
+        # coverage of old_sk.pop()
+        opt1 = incubate_lbfgs.LBFGS(
+            learning_rate=1,
+            max_iter=10,
+            max_eval=None,
+            tolerance_grad=1e-07,
+            tolerance_change=1e-09,
+            history_size=1,
+            line_search_fn='strong_wolfe',
+            parameters=net1.parameters(),
+        )
+
+        net2 = Net(extream_point, func2)
+        # coverage of line_search = None
+        opt2 = incubate_lbfgs.LBFGS(
+            learning_rate=1,
+            max_iter=50,
+            max_eval=None,
+            tolerance_grad=1e-07,
+            tolerance_change=1e-09,
+            history_size=10,
+            line_search_fn=None,
+            parameters=net2.parameters(),
+        )
+
+        n_iter = 0
+        while n_iter < 20:
+            loss = train_step(input, paddle.to_tensor(targets[0]), net1, opt1)
+            n_iter = opt1.state_dict()["state"]["func_evals"]
+
+        n_iter = 0
+        while n_iter < 10:
+            loss = train_step(input, paddle.to_tensor(targets[1]), net2, opt2)
+            n_iter = opt2.state_dict()["state"]["func_evals"]
+
+    def test_error_incubate(self):
+        # parameters is not a list of Paddle Tensors
+        def error_func1():
+            extream_point = np.array([-1, 2]).astype('float32')
+            extream_point = paddle.to_tensor(extream_point)
+            return incubate_lbfgs.LBFGS(
+                learning_rate=1,
+                max_iter=10,
+                max_eval=None,
+                tolerance_grad=1e-07,
+                tolerance_change=1e-09,
+                history_size=3,
+                line_search_fn='strong_wolfe',
+                parameters=extream_point,
+            )
+
+        self.assertRaises(TypeError, error_func1)
+
+    def test_error2_incubate(self):
+        # does not converge
+        input = np.random.rand(1).astype(np.float32)
+
+        def outputs2(x):
+            # weight[0] = 4, weight[1] = 2
+            return pow(x, 4) + 5 * pow(x, 2)
+
+        targets = [outputs2(input)]
+        input = paddle.to_tensor(input)
+
+        def func2(extream_point, x):
+            return pow(x, extream_point[0]) + 5 * pow(x, extream_point[1])
+
+        extream_point = np.array([-2.34, 1.45]).astype('float32')
+        net2 = Net(extream_point, func2)
+        # the string 'None' is not a valid line_search_fn and should raise
+        opt2 = incubate_lbfgs.LBFGS(
+            learning_rate=1,
+            max_iter=50,
+            max_eval=None,
+            tolerance_grad=1e-07,
+            tolerance_change=1e-09,
+            history_size=10,
+            line_search_fn='None',
+            parameters=net2.parameters(),
+        )
+
+        def error_func():
+            n_iter = 0
+            while n_iter < 10:
+                loss = train_step(
+                    input, paddle.to_tensor(targets[0]), net2, opt2
+                )
+                n_iter = opt2.state_dict()["state"]["func_evals"]
+
+        self.assertRaises(RuntimeError, error_func)
+
+    def test_line_search_incubate(self):
+        def func1(x, alpha, d):
+            return paddle.to_tensor(x + alpha * d), paddle.to_tensor([0.0])
+
+        def func2(x, alpha, d):
+            return paddle.to_tensor(x + alpha * d), paddle.to_tensor([1.0])
+
+        def func3(x, alpha, d):
+            return paddle.to_tensor(x + alpha * d), paddle.to_tensor([-1.0])
+
+        line_search_dygraph._strong_wolfe(
+            func1,
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([0.001]),
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([0.0]),
+            paddle.to_tensor([0.0]),
+            max_ls=1,
+        )
+
+        line_search_dygraph._strong_wolfe(
+            func1,
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([0.001]),
+            paddle.to_tensor([0.0]),
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([0.0]),
+            paddle.to_tensor([0.0]),
+            max_ls=0,
+        )
+
+        line_search_dygraph._strong_wolfe(
+            func2,
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([-0.001]),
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([1.0]),
+            max_ls=1,
+        )
+
+        line_search_dygraph._strong_wolfe(
+            func3,
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([-0.001]),
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([1.0]),
+            max_ls=1,
+        )
+
+        line_search_dygraph._cubic_interpolate(
+            paddle.to_tensor([2.0]),
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([0.0]),
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([2.0]),
+            paddle.to_tensor([0.0]),
+            [0.1, 0.5],
+        )
+
+        line_search_dygraph._cubic_interpolate(
+            paddle.to_tensor([2.0]),
+            paddle.to_tensor([0.0]),
+            paddle.to_tensor([-3.0]),
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([-0.1]),
+            [0.1, 0.5],
+        )
+
+    def test_error3_incubate(self):
+        # test parameter shape size <= 0
+        def error_func3():
+            extream_point = np.array([-1, 2]).astype('float32')
+            extream_point = paddle.to_tensor(extream_point)
+
+            def func(w, x):
+                return w * x
+
+            net = Net(extream_point, func)
+            net.w = paddle.create_parameter(
+                shape=[-1, 2],
+                dtype=net.w.dtype,
+            )
+            opt = incubate_lbfgs.LBFGS(
+                learning_rate=1,
+                max_iter=10,
+                max_eval=None,
+                tolerance_grad=1e-07,
+                tolerance_change=1e-09,
+                history_size=5,
+                line_search_fn='strong_wolfe',
+                parameters=net.parameters(),
+            )
+
+        self.assertRaises(AssertionError, error_func3)
+
     def test_function_fix(self):
         paddle.disable_static()
         np_w = np.random.rand(1).astype(np.float32)
@@ -69,8 +317,8 @@ class TestLbfgs(unittest.TestCase):
             return w * x
 
         net = Net(np_w, func)
-        opt = LBFGS(
-            lr=1,
+        opt = lbfgs.LBFGS(
+            learning_rate=1,
             max_iter=10,
             max_eval=None,
             tolerance_grad=1e-07,
@@ -116,8 +364,8 @@ class TestLbfgs(unittest.TestCase):
         extream_point = np.array([-2.34, 1.45]).astype('float32')
         net1 = Net(extream_point, func1)
         # coverage of old_sk.pop()
-        opt1 = LBFGS(
-            lr=1,
+        opt1 = lbfgs.LBFGS(
+            learning_rate=1,
             max_iter=10,
             max_eval=None,
             tolerance_grad=1e-07,
...@@ -129,8 +377,8 @@ class TestLbfgs(unittest.TestCase): ...@@ -129,8 +377,8 @@ class TestLbfgs(unittest.TestCase):
net2 = Net(extream_point, func2) net2 = Net(extream_point, func2)
# converge of line_search = None # converge of line_search = None
opt2 = LBFGS( opt2 = lbfgs.LBFGS(
lr=1, learning_rate=1,
max_iter=50, max_iter=50,
max_eval=None, max_eval=None,
tolerance_grad=1e-07, tolerance_grad=1e-07,
@@ -155,8 +403,8 @@ class TestLbfgs(unittest.TestCase):
         def error_func1():
             extream_point = np.array([-1, 2]).astype('float32')
             extream_point = paddle.to_tensor(extream_point)
-            return LBFGS(
-                lr=1,
+            return lbfgs.LBFGS(
+                learning_rate=1,
                 max_iter=10,
                 max_eval=None,
                 tolerance_grad=1e-07,
@@ -185,8 +433,8 @@ class TestLbfgs(unittest.TestCase):
         extream_point = np.array([-2.34, 1.45]).astype('float32')
         net2 = Net(extream_point, func2)
         # coverage of line_search = None
-        opt2 = LBFGS(
-            lr=1,
+        opt2 = lbfgs.LBFGS(
+            learning_rate=1,
             max_iter=50,
             max_eval=None,
             tolerance_grad=1e-07,
@@ -216,7 +464,18 @@ class TestLbfgs(unittest.TestCase):
         def func3(x, alpha, d):
             return paddle.to_tensor(x + alpha * d), paddle.to_tensor([-1.0])
 
-        _strong_wolfe(
+        lbfgs._strong_wolfe(
+            func1,
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([0.001]),
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([1.0]),
+            paddle.to_tensor([0.0]),
+            paddle.to_tensor([0.0]),
+            max_ls=1,
+        )
+
+        lbfgs._strong_wolfe(
             func1,
             paddle.to_tensor([1.0]),
             paddle.to_tensor([0.001]),
@@ -227,7 +486,7 @@ class TestLbfgs(unittest.TestCase):
             max_ls=0,
         )
 
-        _strong_wolfe(
+        lbfgs._strong_wolfe(
             func2,
             paddle.to_tensor([1.0]),
             paddle.to_tensor([-0.001]),
@@ -238,7 +497,7 @@ class TestLbfgs(unittest.TestCase):
             max_ls=1,
         )
 
-        _strong_wolfe(
+        lbfgs._strong_wolfe(
             func3,
             paddle.to_tensor([1.0]),
             paddle.to_tensor([-0.001]),
@@ -249,7 +508,7 @@ class TestLbfgs(unittest.TestCase):
             max_ls=1,
         )
 
-        _cubic_interpolate(
+        lbfgs._cubic_interpolate(
             paddle.to_tensor([2.0]),
             paddle.to_tensor([1.0]),
             paddle.to_tensor([0.0]),
@@ -259,7 +518,7 @@ class TestLbfgs(unittest.TestCase):
             [0.1, 0.5],
         )
 
-        _cubic_interpolate(
+        lbfgs._cubic_interpolate(
             paddle.to_tensor([2.0]),
             paddle.to_tensor([0.0]),
             paddle.to_tensor([-3.0]),
@@ -269,6 +528,33 @@ class TestLbfgs(unittest.TestCase):
             [0.1, 0.5],
         )
 
+    def test_error3(self):
+        # test parameter shape size <= 0
+        def error_func3():
+            extream_point = np.array([-1, 2]).astype('float32')
+            extream_point = paddle.to_tensor(extream_point)
+
+            def func(w, x):
+                return w * x
+
+            net = Net(extream_point, func)
+            net.w = paddle.create_parameter(
+                shape=[-1, 2],
+                dtype=net.w.dtype,
+            )
+            opt = lbfgs.LBFGS(
+                learning_rate=1,
+                max_iter=10,
+                max_eval=None,
+                tolerance_grad=1e-07,
+                tolerance_change=1e-09,
+                history_size=5,
+                line_search_fn='strong_wolfe',
+                parameters=net.parameters(),
+            )
+
+        self.assertRaises(AssertionError, error_func3)
+
 
 if __name__ == '__main__':
     unittest.main()
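The tests above call a `Net` layer and a `train_step(inputs, targets, net, opt)` helper defined earlier in the test file, outside the hunks shown. Judging from the call sites, they presumably look roughly like the following sketch (a hedged reconstruction, not the file's actual code):

```python
import paddle


class Net(paddle.nn.Layer):
    # Wraps a callable func(w, x), with w a learnable parameter
    # initialized from the numpy array np_w.
    def __init__(self, np_w, func):
        super().__init__()
        self.func = func
        w = paddle.to_tensor(np_w)
        self.w = paddle.create_parameter(
            shape=w.shape,
            dtype=w.dtype,
            default_initializer=paddle.nn.initializer.Assign(w),
        )

    def forward(self, x):
        return self.func(self.w, x)


def train_step(inputs, targets, net, opt):
    # L-BFGS consumes a closure so it can re-evaluate the loss
    # during its internal line search.
    def closure():
        opt.clear_grad()
        loss = paddle.nn.functional.mse_loss(net(inputs), targets)
        loss.backward()
        return loss

    return opt.step(closure)
```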
@@ -18,10 +18,12 @@ from functools import reduce
 
 import paddle
 from paddle.optimizer import Optimizer
+from paddle.utils import deprecated
 
 from .line_search_dygraph import _strong_wolfe
 
 
+@deprecated(since="2.5.0", update_to="paddle.optimizer.LBFGS", level=1)
 class LBFGS(Optimizer):
     r"""
     The L-BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function.
@@ -40,7 +42,7 @@ class LBFGS(Optimizer):
     Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS).
 
     Args:
-        lr (float, optional): learning rate. The default value is 1.
+        learning_rate (float, optional): learning rate. The default value is 1.
         max_iter (int, optional): maximal number of iterations per optimization step.
             The default value is 20.
         max_eval (int, optional): maximal number of function evaluations per optimization
@@ -97,7 +99,7 @@ class LBFGS(Optimizer):
                     return self.w * x
 
             net = Net()
-            opt = LBFGS(lr=1, max_iter=1, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn='strong_wolfe', parameters=net.parameters())
+            opt = LBFGS(learning_rate=1, max_iter=1, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn='strong_wolfe', parameters=net.parameters())
             def train_step(inputs, targets):
                 def closure():
                     outputs = net(inputs)
@@ -118,7 +120,7 @@ class LBFGS(Optimizer):
 
     def __init__(
         self,
-        lr=1.0,
+        learning_rate=1.0,
         max_iter=20,
         max_eval=None,
         tolerance_grad=1e-7,
@@ -133,7 +135,7 @@ class LBFGS(Optimizer):
         if max_eval is None:
             max_eval = max_iter * 5 // 4
 
-        self.lr = lr
+        self.learning_rate = learning_rate
         self.max_iter = max_iter
         self.max_eval = max_eval
         self.tolerance_grad = tolerance_grad
@@ -202,7 +204,7 @@ class LBFGS(Optimizer):
     def _add_grad(self, alpha, direction):
         offset = 0
         for p in self._params:
-            numel = p.numel().item()
+            numel = reduce(lambda x, y: x * y, p.shape)
             p = paddle.assign(
                 p.add(
                     direction[offset : offset + numel].reshape(p.shape) * alpha
@@ -234,11 +236,10 @@ class LBFGS(Optimizer):
         """
         with paddle.no_grad():
             # Make sure the closure is always called with grad enabled
             closure = paddle.enable_grad()(closure)
-
-            lr = self.lr
+            learning_rate = self.learning_rate
             max_iter = self.max_iter
             max_eval = self.max_eval
             tolerance_grad = self.tolerance_grad
@@ -342,9 +343,11 @@ class LBFGS(Optimizer):
             ############################################################
             # reset initial guess for step size
             if state['n_iter'] == 1:
-                alpha = min(1.0, 1.0 / flat_grad.abs().sum()) * lr
+                alpha = (
+                    min(1.0, 1.0 / flat_grad.abs().sum()) * learning_rate
+                )
             else:
-                alpha = lr
+                alpha = learning_rate
 
             # directional derivative
             gtd = flat_grad.dot(d)
......
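A note on the `_add_grad` hunk above: in dygraph mode `p.numel()` returns a Tensor, so the old `p.numel().item()` paid a device-to-host copy for every parameter, while multiplying the plain Python ints in `p.shape` yields the same element count without that sync. That rationale is inferred; the commit message only says "modify numel in lbfgs". A standalone check of the equivalence:

```python
from functools import reduce

import paddle

p = paddle.rand([3, 4, 5])

# Old approach: numel() yields a Tensor; .item() syncs it back to the host.
n_old = p.numel().item()

# New approach: p.shape is a Python list of ints, so this stays on the host.
n_new = reduce(lambda x, y: x * y, p.shape)

assert n_old == n_new == 60
```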
@@ -22,6 +22,7 @@ from .adadelta import Adadelta  # noqa: F401
 from .sgd import SGD  # noqa: F401
 from .momentum import Momentum  # noqa: F401
 from .lamb import Lamb  # noqa: F401
+from .lbfgs import LBFGS  # noqa: F401
 from . import lr  # noqa: F401
 
 __all__ = [  # noqa
@@ -35,4 +36,5 @@ __all__ = [  # noqa
     'SGD',
     'Momentum',
     'Lamb',
+    'LBFGS',
 ]
(This diff has been collapsed.)