Unverified commit 1f36d3cd, authored by MRXLT, committed by GitHub

update optimizer (#26711)

* update doc

* update doc

* fix optimizer sample code

* add default value for adamw weight_decay

* fix adamw

* change LearningRateDecay to _LRScheduler

* fix adamw;notest

* fix load;notest

* remove file

* bug fix

* fix code style

* bug fix

* add ut

* adamw support weight_decay=0

* fix ut

* fix set_lr doc

* fix doc

* change parameters place
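A minimal dygraph sketch of the API after the changes listed above (illustrative only; data and shapes are arbitrary, and it assumes only what the diffs below introduce: `parameters` directly after `learning_rate`, and an AdamW `weight_decay` default of 0.01):

    import numpy as np
    import paddle

    paddle.disable_static()
    inp = paddle.to_tensor(np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32"))
    linear = paddle.nn.Linear(10, 10)
    loss = paddle.mean(linear(inp))

    # parameters now follows learning_rate; weight_decay defaults to 0.01
    opt = paddle.optimizer.AdamW(learning_rate=0.001,
                                 parameters=linear.parameters())
    loss.backward()
    opt.step()
    opt.clear_grad()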
Parent 8e674976
......@@ -207,6 +207,7 @@ def load_dygraph(model_path, keep_name_table=False):
# NOTE: `jit.save` doesn't save optimizer state
else:
# Load state dict by `save_dygraph` save format
para_dict = {}
if os.path.exists(params_file_path):
with open(params_file_path, 'rb') as f:
para_dict = pickle.load(f) if six.PY2 else pickle.load(
......
......@@ -504,6 +504,19 @@ class TestAdamOpV2(unittest.TestCase):
shape=[1], value=lr, dtype='float32')
adam.set_lr(lr_var)
def test_adam_op_invalid_input(self):
paddle.disable_static()
linear = paddle.nn.Linear(10, 10)
with self.assertRaises(ValueError):
adam = paddle.optimizer.Adam(
0.1, beta1=-1, parameters=linear.parameters())
with self.assertRaises(ValueError):
adam = paddle.optimizer.Adam(
0.1, beta2=-1, parameters=linear.parameters())
with self.assertRaises(ValueError):
adam = paddle.optimizer.Adam(
0.1, epsilon=-1, parameters=linear.parameters())
if __name__ == "__main__":
unittest.main()
......@@ -184,5 +184,21 @@ def adamax_step(inputs, attributes):
return param_out, moment_out, inf_norm_out
class TestAdamaxOpV2(unittest.TestCase):
def test_adamax_op_invalid_input(self):
import paddle
paddle.disable_static()
linear = paddle.nn.Linear(10, 10)
with self.assertRaises(ValueError):
adam = paddle.optimizer.Adamax(
0.1, beta1=-1, parameters=linear.parameters())
with self.assertRaises(ValueError):
adam = paddle.optimizer.Adamax(
0.1, beta2=-1, parameters=linear.parameters())
with self.assertRaises(ValueError):
adam = paddle.optimizer.Adamax(
0.1, epsilon=-1, parameters=linear.parameters())
if __name__ == "__main__":
unittest.main()
......@@ -76,6 +76,19 @@ class TestAdamWOp(unittest.TestCase):
rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
assert rets[0] is not None
def test_adamw_op_invalid_input(self):
paddle.disable_static()
linear = paddle.nn.Linear(10, 10)
with self.assertRaises(ValueError):
adam = paddle.optimizer.AdamW(
0.1, beta1=-1, parameters=linear.parameters())
with self.assertRaises(ValueError):
adam = paddle.optimizer.AdamW(
0.1, beta2=-1, parameters=linear.parameters())
with self.assertRaises(ValueError):
adam = paddle.optimizer.AdamW(
0.1, epsilon=-1, parameters=linear.parameters())
if __name__ == "__main__":
unittest.main()
......@@ -401,9 +401,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = fluid.dygraph.nn.Linear(10, 10)
a = fluid.dygraph.to_variable(a)
b = linear(a)
loss = fluid.layers.reduce_mean(b)
......
......@@ -276,6 +276,19 @@ class TestRMSPropV2(unittest.TestCase):
learning_rate=0.1,
momentum=None)
def test_rmsprop_op_invalid_input(self):
paddle.disable_static()
linear = paddle.nn.Linear(10, 10)
with self.assertRaises(ValueError):
adam = paddle.optimizer.RMSProp(
0.1, epsilon=-1, parameters=linear.parameters())
with self.assertRaises(ValueError):
adam = paddle.optimizer.RMSProp(
0.1, momentum=-1, parameters=linear.parameters())
with self.assertRaises(ValueError):
adam = paddle.optimizer.RMSProp(
0.1, rho=-1, parameters=linear.parameters())
if __name__ == "__main__":
unittest.main()
......@@ -45,8 +45,8 @@ class Adam(Optimizer):
Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
Args:
- learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
- It can be a float value or a LearningRateDecay. The default value is 0.001.
learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a _LRScheduler. The default value is 0.001.
beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
It should be a float number or a Tensor with shape [1] and data type as float32.
The default value is 0.9.
......@@ -55,7 +55,7 @@ class Adam(Optimizer):
The default value is 0.999.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-08.
- parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
......@@ -143,6 +143,12 @@ class Adam(Optimizer):
assert beta1 is not None
assert beta2 is not None
assert epsilon is not None
if not 0 <= beta1 < 1:
raise ValueError("Invalid value of beta1, expect beta1 in [0,1).")
if not 0 <= beta2 < 1:
raise ValueError("Invalid value of beta2, expect beta2 in [0,1).")
if not 0 <= epsilon:
raise ValueError("Invalid value of epsilon, expect epsilon >= 0.")
super(Adam, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
......
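To make the ``Args`` above concrete, a short assumed sketch constructing Adam with a shape-[1] float32 Tensor for ``beta1``, as the docstring permits:

    import paddle

    paddle.disable_static()
    linear = paddle.nn.Linear(10, 10)
    # beta1/beta2 may be floats or shape-[1] float32 Tensors
    beta1 = paddle.to_tensor([0.9], dtype="float32")
    adam = paddle.optimizer.Adam(learning_rate=0.001,
                                 parameters=linear.parameters(),
                                 beta1=beta1)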
......@@ -47,15 +47,15 @@ class Adamax(Optimizer):
it is added here for numerical stability to prevent the division by 0 error.
Args:
- learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
- It can be a float value or a LearningRateDecay. The default value is 0.001.
learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a _LRScheduler. The default value is 0.001.
beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
The default value is 0.9.
beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
The default value is 0.999.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-08.
- parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
......@@ -118,6 +118,12 @@ class Adamax(Optimizer):
assert beta1 is not None
assert beta2 is not None
assert epsilon is not None
if not 0 <= beta1 < 1:
raise ValueError("Invalid value of beta1, expect beta1 in [0,1).")
if not 0 <= beta2 < 1:
raise ValueError("Invalid value of beta2, expect beta2 in [0,1).")
if not 0 <= epsilon:
raise ValueError("Invalid value of epsilon, expect epsilon >= 0.")
super(Adamax, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
......
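With the range checks added above, an out-of-range hyper-parameter now fails fast at construction time; a small assumed demonstration:

    import paddle

    paddle.disable_static()
    linear = paddle.nn.Linear(10, 10)
    try:
        paddle.optimizer.Adamax(0.1, beta1=-1, parameters=linear.parameters())
    except ValueError as e:
        print(e)  # Invalid value of beta1, expect beta1 in [0,1).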
......@@ -19,112 +19,7 @@ import paddle
__all__ = ['AdamW']
- class DecoupledWeightDecay(object):
- def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs):
- if not isinstance(coeff, float) and \
- not isinstance(coeff, framework.Variable):
- raise TypeError("coeff should be float or Tensor.")
- self._params_name = set()
- self._apply_decay_param_fun = apply_decay_param_fun
- self._coeff = coeff
- super(DecoupledWeightDecay, self).__init__(**kwargs)
- def _scale_parameters(self, params_and_grads):
- """
- Adds weight decay ops.
- scaled_parameter = parameter * coeff
- Args:
- params_and_grads: A list of (parameters, gradients) pairs,
- the parameters need to decay.
- Raises:
- Exception: The type of coeff and parameter is not consistent.
- """
- if isinstance(self._coeff, float) and self._coeff == 0.0:
- return
- scaled_params = []
- for param, grad in params_and_grads:
- # If no gradient then we don't need to do anything
- if grad is None:
- continue
- if self._apply_decay_param_fun is not None \
- and not self._apply_decay_param_fun(param.name):
- continue
- if isinstance(self._coeff, float):
- assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \
- "the type of coeff(float) and parameter(%s) is not consistent."%(self._coeff.dtype)
- else:
- assert self._coeff.dtype == param.dtype, \
- "the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype)
- with param.block.program._optimized_guard(
- [param, grad]), framework.name_scope('weight decay'):
- assert param.name not in self._params_name
- scaled_params.append((param, grad, param * self._coeff))
- self._params_name.add(param.name)
- return scaled_params
- def backward(self, **kargs):
- return super(DecoupledWeightDecay, self).backward(**kargs)
- def _apply_optimize(self, **kargs):
- return super(DecoupledWeightDecay, self)._apply_optimize(**kargs)
- def minimize(self,
- loss,
- startup_program=None,
- parameters=None,
- no_grad_set=None):
- params_grads = self.backward(
- loss=loss,
- startup_program=startup_program,
- parameters=parameters,
- no_grad_set=no_grad_set)
- scaled_params = self._scale_parameters(params_grads)
- for p_grad_sgrad in scaled_params:
- param, grad, scaled_param = p_grad_sgrad
- with param.block.program._optimized_guard(
- [param, grad]), framework.name_scope('weight decay'):
- updated_param = paddle.fluid.layers.elementwise_sub(
- x=param, y=scaled_param)
- paddle.fluid.layers.assign(input=updated_param, output=param)
- optimize_ops = self._apply_optimize(
- loss=loss,
- params_grads=params_grads,
- startup_program=startup_program)
- return optimize_ops, params_grads
- @framework.dygraph_only
- def step(self):
- parameter_list = self._parameter_list
- self._dtype = None
- params_grads = []
- for param in self._parameter_list:
- if not param.trainable:
- continue
- if param._grad_ivar() is not None:
- grad_var = param._grad_ivar()
- params_grads.append((param, grad_var))
- scaled_params = self._scale_parameters(params_grads)
- for p_grad_sgrad in scaled_params:
- param, grad, scaled_param = p_grad_sgrad
- with param.block.program._optimized_guard(
- [param, grad]), framework.name_scope('weight decay'):
- updated_param = paddle.fluid.layers.elementwise_sub(
- x=param, y=scaled_param)
- paddle.fluid.layers.assign(input=updated_param, output=param)
- optimize_ops = self._apply_optimize(
- loss=None, startup_program=None, params_grads=params_grads)
- def __str__(self):
- return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
- class AdamW(DecoupledWeightDecay, Adam):
class AdamW(Adam):
"""
The AdamW optimizer is implemented based on the AdamW Optimization
in paper `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.
......@@ -145,8 +40,8 @@ class AdamW(DecoupledWeightDecay, Adam):
Args:
- learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
- It can be a float value or a LearningRateDecay. The default value is 0.001.
learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a _LRScheduler. The default value is 0.001.
parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
......@@ -157,9 +52,9 @@ class AdamW(DecoupledWeightDecay, Adam):
It should be a float number or a Tensor with shape [1] and data type as float32.
The default value is 0.999.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-08.
- weight_decay (float|Tensor): The weight decay coefficient, it can be float or Tensor. The default value is 0.0.
- apply_decay_param_fun (function|None): If it is not None,
weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01.
apply_decay_param_fun (function|None, optional): If it is not None,
only tensors for which apply_decay_param_fun(tensor.name)==True
will have weight decay applied. It only works when we want to decay specified tensors.
Default: None.
......@@ -208,26 +103,129 @@ class AdamW(DecoupledWeightDecay, Adam):
def __init__(self,
learning_rate=0.001,
parameters=None,
beta1=0.9,
beta2=0.999,
epsilon=1e-8,
- weight_decay=0.0,
- parameters=None,
weight_decay=0.01,
apply_decay_param_fun=None,
grad_clip=None,
name=None,
lazy_mode=False):
- args_dict = {
- "learning_rate": learning_rate,
- "parameters": parameters,
- "beta1": beta1,
- "beta2": beta2,
- "epsilon": epsilon,
- "grad_clip": grad_clip,
- "name": name,
- "lazy_mode": lazy_mode
- }
assert learning_rate is not None
assert beta1 is not None
assert beta2 is not None
assert epsilon is not None
if not 0 <= beta1 < 1:
raise ValueError("Invalid value of beta1, expect beta1 in [0,1).")
if not 0 <= beta2 < 1:
raise ValueError("Invalid value of beta2, expect beta2 in [0,1).")
if not 0 <= epsilon:
raise ValueError("Invalid value of epsilon, expect epsilon >= 0.")
coeff = weight_decay
if not isinstance(coeff, float) and \
not isinstance(coeff, framework.Variable):
raise TypeError("coeff should be float or Tensor.")
self._params_name = set()
self._apply_decay_param_fun = apply_decay_param_fun
self._coeff = coeff
super(AdamW, self).__init__(
- weight_decay,
- apply_decay_param_fun=apply_decay_param_fun,
- **args_dict)
learning_rate=learning_rate,
parameters=parameters,
beta1=beta1,
beta2=beta2,
epsilon=epsilon,
grad_clip=grad_clip,
name=name,
lazy_mode=lazy_mode)
def _scale_parameters(self, params_and_grads):
"""
Adds weight decay ops.
scaled_parameter = parameter * coeff
Args:
params_and_grads: A list of (parameters, gradients) pairs,
the parameters need to decay.
Raises:
Exception: The type of coeff and parameter is not consistent.
"""
scaled_params = []
for param, grad in params_and_grads:
# If no gradient then we don't need to do anything
if grad is None:
continue
if self._apply_decay_param_fun is not None \
and not self._apply_decay_param_fun(param.name):
continue
if isinstance(self._coeff, float):
assert param.dtype is paddle.fluid.core.VarDesc.VarType.FP32, \
"the type of coeff(float) and parameter(%s) is not consistent." % param.dtype
else:
assert self._coeff.dtype == param.dtype, \
"the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype)
if isinstance(self._learning_rate, float):
learning_rate = self._learning_rate
else:
learning_rate = self._learning_rate()
with param.block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
if param.name not in self._params_name:
scaled_params.append(
(param, grad, param * self._coeff * learning_rate))
self._params_name.add(param.name)
return scaled_params
def minimize(self,
loss,
startup_program=None,
parameters=None,
no_grad_set=None):
params_grads = self.backward(
loss=loss,
startup_program=startup_program,
parameters=parameters,
no_grad_set=no_grad_set)
scaled_params = self._scale_parameters(params_grads)
for p_grad_sgrad in scaled_params:
param, grad, scaled_param = p_grad_sgrad
with param.block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
updated_param = paddle.fluid.layers.elementwise_sub(
x=param, y=scaled_param)
paddle.fluid.layers.assign(input=updated_param, output=param)
optimize_ops = self._apply_optimize(
loss=loss,
params_grads=params_grads,
startup_program=startup_program)
return optimize_ops, params_grads
@framework.dygraph_only
def step(self):
parameter_list = self._parameter_list
self._dtype = None
params_grads = []
for param in self._parameter_list:
if not param.trainable:
continue
if param._grad_ivar() is not None:
grad_var = param._grad_ivar()
params_grads.append((param, grad_var))
scaled_params = self._scale_parameters(params_grads)
for p_grad_sgrad in scaled_params:
param, grad, scaled_param = p_grad_sgrad
with param.block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
updated_param = paddle.fluid.layers.elementwise_sub(
x=param, y=scaled_param)
param.set_value(updated_param.numpy())
optimize_ops = self._apply_optimize(
loss=None, startup_program=None, params_grads=params_grads)
def __str__(self):
return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
......@@ -80,7 +80,6 @@ class Optimizer(object):
.. code-block:: python
#Take the subclass adam as an example
- #Optimizer
import paddle
import numpy as np
......@@ -215,6 +214,8 @@ class Optimizer(object):
adam.set_state_dict(opti_state_dict)
'''
- if isinstance(self._learning_rate, _LRScheduler):
- self._learning_rate.set_dict(state_dict["LR_Scheduler"])
if isinstance(self._learning_rate, _LRScheduler):
self._learning_rate.set_state_dict(state_dict["LR_Scheduler"])
......@@ -270,6 +271,7 @@ class Optimizer(object):
main_prog = framework.default_main_program()
main_prog.lr_sheduler = self._learning_rate
main_prog.lr_var = lr_var
self._learning_rate_map[framework.default_main_program(
)] = lr_var
......@@ -300,7 +302,7 @@ class Optimizer(object):
this API cannot be invoked, because it will lead to conflict.
Args:
- value (float|Tensor): the value of learning rate
value (float): the value of learning rate
Returns:
None
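A small assumed sketch of manual learning-rate control, valid only when no _LRScheduler is in use (per the note above):

    import paddle

    paddle.disable_static()
    linear = paddle.nn.Linear(10, 10)
    adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())
    adam.set_lr(0.01)
    print(adam.get_lr())  # 0.01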
......@@ -358,6 +360,7 @@ class Optimizer(object):
Get the learning rate of the current step. When an _LRScheduler is not used,
the return value is the same for every step; otherwise it is the scheduled learning rate of the current step.
Returns:
float: The learning rate of the current step.
......@@ -655,7 +658,7 @@ class Optimizer(object):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
- linear = paddle.nn.Linear(13, 5, dtype="float32")
linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate = 0.01,
parameters = linear.parameters())
......@@ -798,7 +801,7 @@ class Optimizer(object):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
- linear = paddle.nn.Linear(13, 5, dtype="float32")
linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate = 0.01,
parameters = linear.parameters())
......@@ -836,36 +839,33 @@ class Optimizer(object):
tuple: tuple (optimize_ops, params_grads), A list of operators appended
by minimize and a list of (param, grad) tensor pairs, param is
``Parameter``, grad is the gradient value corresponding to the parameter.
- The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
indicate program pruning. If so, the program will be pruned by ``feed`` and
``fetch_list`` before run, see details in ``Executor``.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
place = fluid.CPUPlace()
main = fluid.Program()
with fluid.program_guard(main):
x = fluid.data(name='x', shape=[None, 13], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
adam_optimizer = paddle.optimizer.Adam(0.01)
adam_optimizer.minimize(avg_cost)
fetch_list = [avg_cost]
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=1)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for data in train_reader():
exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
out = linear(inp)
loss = paddle.mean(out)
beta1 = paddle.to_tensor([0.9], dtype="float32")
beta2 = paddle.to_tensor([0.99], dtype="float32")
adam = paddle.optimizer.Adam(learning_rate=0.1,
parameters=linear.parameters(),
beta1=beta1,
beta2=beta2,
weight_decay=0.01)
loss.backward()
adam.minimize(loss)
adam.clear_grad()
"""
assert isinstance(loss, Variable), "The loss should be a Tensor."
......@@ -885,7 +885,7 @@ class Optimizer(object):
@framework.dygraph_only
def step(self):
"""
- Execute the optimizer once.
Execute the optimizer and update parameters once.
Returns:
None
......@@ -898,7 +898,7 @@ class Optimizer(object):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
- linear = paddle.nn.Linear(13, 5, dtype="float32")
linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate = 0.01,
parameters = linear.parameters())
......
......@@ -69,8 +69,8 @@ class RMSProp(Optimizer):
Parameters:
- learning_rate (float|LearningRateDecay): The learning rate used to update ``Parameter``.
- It can be a float value or a LearningRateDecay.
learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``.
It can be a float value or a _LRScheduler.
rho(float): rho is :math: `\\rho` in equation, default is 0.95.
epsilon(float): :math: `\\epsilon` in equation is smoothing term to
avoid division by zero, default is 1e-6.
......@@ -80,7 +80,7 @@ class RMSProp(Optimizer):
the gradient; if False, by the uncentered second moment. Setting this to
True may help with training, but is slightly more expensive in terms of
computation and memory. Defaults to False.
- parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
......@@ -147,6 +147,12 @@ class RMSProp(Optimizer):
raise ValueError("epsilon is not set.")
if momentum is None:
raise ValueError("momentum is not set.")
if not 0.0 <= epsilon:
raise ValueError("Invalid value of epsilon, expect epsilon >= 0.")
if not 0.0 <= momentum:
raise ValueError("Invalid value of momentum, expect momentum >= 0.")
if not 0.0 <= rho:
raise ValueError("Invalid value of rho, expect rho >= 0.")
super(RMSProp, self).__init__(
learning_rate=learning_rate,
......
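Finally, an assumed construction sketch exercising the RMSProp options documented above:

    import paddle

    paddle.disable_static()
    linear = paddle.nn.Linear(10, 10)
    # centered=True normalizes by the centered second moment (see docstring above)
    rmsprop = paddle.optimizer.RMSProp(learning_rate=0.1,
                                       rho=0.95,
                                       epsilon=1e-6,
                                       momentum=0.9,
                                       centered=True,
                                       parameters=linear.parameters())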