Unverified commit cbb0f59d, authored by MRXLT, committed by GitHub

[cherry pick to 2.0-beta]update optimizer (#26711) (#26943)

* update optimizer (#26711)

* update doc

* update doc

* fix optimizer sample code

* add default value for adamw weight_decay

* fix adamw

* change LearningRateDecay to _LRScheduler

* fix adamw;notest

* fix load;notest

* remove file

* bug fix

* fix code style

* bug fix

* add ut

* adamw support weight_decay=0

* fix ut

* fix set_lr doc

* fix doc

* change parameters place

* fix sample code
Parent commit 5f239a19
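Taken together, these changes mean the `paddle.optimizer` classes now take a `_LRScheduler` (instead of `LearningRateDecay`) for `learning_rate`, reject out-of-range hyper-parameters with `ValueError`, and `AdamW` defaults `weight_decay` to 0.01 while also accepting 0. A minimal usage sketch against the updated 2.0-beta dygraph API; the shapes and values are illustrative and not part of this patch:

```python
import numpy as np
import paddle

paddle.disable_static()

inp = paddle.to_tensor(np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32"))
linear = paddle.nn.Linear(10, 10)
loss = paddle.mean(linear(inp))

# weight_decay now defaults to 0.01; weight_decay=0 is accepted as well.
opt = paddle.optimizer.AdamW(learning_rate=0.1,
                             parameters=linear.parameters())
loss.backward()
opt.step()
opt.clear_grad()

# Hyper-parameters outside the documented ranges now fail fast.
try:
    paddle.optimizer.Adam(0.1, beta1=-1, parameters=linear.parameters())
except ValueError as e:
    print(e)  # beta1 is expected to lie in [0, 1)
```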
...@@ -207,6 +207,7 @@ def load_dygraph(model_path, keep_name_table=False):
# NOTE: `jit.save` doesn't save optimizer state
else:
# Load state dict by `save_dygraph` save format
+ para_dict = {}
if os.path.exists(params_file_path):
with open(params_file_path, 'rb') as f:
para_dict = pickle.load(f) if six.PY2 else pickle.load(
......
...@@ -504,6 +504,19 @@ class TestAdamOpV2(unittest.TestCase):
shape=[1], value=lr, dtype='float32')
adam.set_lr(lr_var)

+ def test_adam_op_invalid_input(self):
+ paddle.disable_static()
+ linear = paddle.nn.Linear(10, 10)
+ with self.assertRaises(ValueError):
+ adam = paddle.optimizer.Adam(
+ 0.1, beta1=-1, parameters=linear.parameters())
+ with self.assertRaises(ValueError):
+ adam = paddle.optimizer.Adam(
+ 0.1, beta2=-1, parameters=linear.parameters())
+ with self.assertRaises(ValueError):
+ adam = paddle.optimizer.Adam(
+ 0.1, epsilon=-1, parameters=linear.parameters())

if __name__ == "__main__":
unittest.main()
...@@ -184,5 +184,21 @@ def adamax_step(inputs, attributes):
return param_out, moment_out, inf_norm_out

+ class TestAdamaxOpV2(unittest.TestCase):
+ def test_adamax_op_invalid_input(self):
+ import paddle
+ paddle.disable_static()
+ linear = paddle.nn.Linear(10, 10)
+ with self.assertRaises(ValueError):
+ adam = paddle.optimizer.Adamax(
+ 0.1, beta1=-1, parameters=linear.parameters())
+ with self.assertRaises(ValueError):
+ adam = paddle.optimizer.Adamax(
+ 0.1, beta2=-1, parameters=linear.parameters())
+ with self.assertRaises(ValueError):
+ adam = paddle.optimizer.Adamax(
+ 0.1, epsilon=-1, parameters=linear.parameters())

if __name__ == "__main__":
unittest.main()
...@@ -76,6 +76,19 @@ class TestAdamWOp(unittest.TestCase):
rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
assert rets[0] is not None

+ def test_adamw_op_invalid_input(self):
+ paddle.disable_static()
+ linear = paddle.nn.Linear(10, 10)
+ with self.assertRaises(ValueError):
+ adam = paddle.optimizer.AdamW(
+ 0.1, beta1=-1, parameters=linear.parameters())
+ with self.assertRaises(ValueError):
+ adam = paddle.optimizer.AdamW(
+ 0.1, beta2=-1, parameters=linear.parameters())
+ with self.assertRaises(ValueError):
+ adam = paddle.optimizer.AdamW(
+ 0.1, epsilon=-1, parameters=linear.parameters())

if __name__ == "__main__":
unittest.main()
...@@ -401,9 +401,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = fluid.dygraph.nn.Linear(10, 10)
a = fluid.dygraph.to_variable(a)
b = linear(a)
loss = fluid.layers.reduce_mean(b)
......
...@@ -276,6 +276,19 @@ class TestRMSPropV2(unittest.TestCase):
learning_rate=0.1,
momentum=None)

+ def test_rmsprop_op_invalid_input(self):
+ paddle.disable_static()
+ linear = paddle.nn.Linear(10, 10)
+ with self.assertRaises(ValueError):
+ adam = paddle.optimizer.RMSProp(
+ 0.1, epsilon=-1, parameters=linear.parameters())
+ with self.assertRaises(ValueError):
+ adam = paddle.optimizer.RMSProp(
+ 0.1, momentum=-1, parameters=linear.parameters())
+ with self.assertRaises(ValueError):
+ adam = paddle.optimizer.RMSProp(
+ 0.1, rho=-1, parameters=linear.parameters())

if __name__ == "__main__":
unittest.main()
...@@ -45,8 +45,8 @@ class Adam(Optimizer):
Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
Args:
- learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
- It can be a float value or a LearningRateDecay. The default value is 0.001.
+ learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``.
+ It can be a float value or a _LRScheduler. The default value is 0.001.
beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
It should be a float number or a Tensor with shape [1] and data type as float32.
The default value is 0.9.
...@@ -55,7 +55,7 @@ class Adam(Optimizer):
The default value is 0.999.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-08.
- parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+ parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
...@@ -143,6 +143,12 @@ class Adam(Optimizer):
assert beta1 is not None
assert beta2 is not None
assert epsilon is not None
+ if not 0 <= beta1 < 1:
+ raise ValueError("Invalid value of beta1, expect beta1 in [0,1).")
+ if not 0 <= beta2 < 1:
+ raise ValueError("Invalid value of beta2, expect beta2 in [0,1).")
+ if not 0 <= epsilon:
+ raise ValueError("Invalid value of epsilon, expect epsilon >= 0.")
super(Adam, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
......
...@@ -47,15 +47,15 @@ class Adamax(Optimizer):
it is added here for numerical stability to prevent the division by 0 error.
Args:
- learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
- It can be a float value or a LearningRateDecay. The default value is 0.001.
+ learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``.
+ It can be a float value or a _LRScheduler. The default value is 0.001.
beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
The default value is 0.9.
beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
The default value is 0.999.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-08.
- parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+ parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
...@@ -118,6 +118,12 @@ class Adamax(Optimizer):
assert beta1 is not None
assert beta2 is not None
assert epsilon is not None
+ if not 0 <= beta1 < 1:
+ raise ValueError("Invalid value of beta1, expect beta1 in [0,1).")
+ if not 0 <= beta2 < 1:
+ raise ValueError("Invalid value of beta2, expect beta2 in [0,1).")
+ if not 0 <= epsilon:
+ raise ValueError("Invalid value of epsilon, expect epsilon >= 0.")
super(Adamax, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
......
...@@ -19,15 +19,125 @@ import paddle
__all__ = ['AdamW']

- class DecoupledWeightDecay(object):
- def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs):
+ class AdamW(Adam):
+ """
+ The AdamW optimizer is implemented based on the AdamW Optimization
+ in paper `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.
+ It can resolve the problem of L2 regularization failure in the Adam optimizer.
+ .. math::
+ t & = t + 1
+ moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
+ moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
+ learning\_rate & = learning\_rate * \\
+ \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}
+ param\_out & = param - learning\_rate * (\\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param)
+ Args:
+ learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``.
+ It can be a float value or a _LRScheduler. The default value is 0.001.
+ parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+ This parameter is required in dygraph mode. \
+ The default value is None in static mode, at this time all parameters will be updated.
+ beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
+ It should be a float number or a Tensor with shape [1] and data type as float32.
+ The default value is 0.9.
+ beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
+ It should be a float number or a Tensor with shape [1] and data type as float32.
+ The default value is 0.999.
+ epsilon (float, optional): A small float value for numerical stability.
+ The default value is 1e-08.
+ weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01.
+ apply_decay_param_fun (function|None, optional): If it is not None,
+ only tensors that make apply_decay_param_fun(Tensor)==True
+ will be updated. It only works when we want to specify tensors.
+ Default: None.
+ grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+ some derived class of ``GradientClipBase`` . There are three clipping strategies
+ ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+ :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+ name (str, optional): Normally there is no need for user to set this property.
+ For more information, please refer to :ref:`api_guide_Name`.
+ The default value is None.
+ lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
+ The accumulators are updated at every step. Every element of the two moving-average
+ is updated in both dense mode and sparse mode. If the size of the parameter is very large,
+ then the update may be very slow. The lazy mode only updates the element that has
+ a gradient in the current mini-batch, so it will be much faster. But this mode has
+ different semantics with the original Adam algorithm and may lead to different results.
+ The default value is False.
+ **Notes**:
+ **Currently, AdamW doesn't support sparse parameter optimization.**
+ Examples:
+ .. code-block:: python
+ import paddle
+ import numpy as np
+ paddle.disable_static()
+ inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+ linear = paddle.nn.Linear(10, 10)
+ inp = paddle.to_tensor(inp)
+ out = linear(inp)
+ loss = paddle.mean(out)
+ beta1 = paddle.to_tensor([0.9], dtype="float32")
+ beta2 = paddle.to_tensor([0.99], dtype="float32")
+ adam = paddle.optimizer.AdamW(learning_rate=0.1,
+ parameters=linear.parameters(),
+ beta1=beta1,
+ beta2=beta2,
+ weight_decay=0.01)
+ out.backward()
+ adam.step()
+ adam.clear_grad()
+ """
+ def __init__(self,
+ learning_rate=0.001,
+ beta1=0.9,
+ beta2=0.999,
+ epsilon=1e-8,
+ parameters=None,
+ weight_decay=0.01,
+ apply_decay_param_fun=None,
+ grad_clip=None,
+ name=None,
+ lazy_mode=False):
+ assert learning_rate is not None
+ assert beta1 is not None
+ assert beta2 is not None
+ assert epsilon is not None
+ if not 0 <= beta1 < 1:
+ raise ValueError("Invalid value of beta1, expect beta1 in [0,1).")
+ if not 0 <= beta2 < 1:
+ raise ValueError("Invalid value of beta2, expect beta2 in [0,1).")
+ if not 0 <= epsilon:
+ raise ValueError("Invalid value of epsilon, expect epsilon >= 0.")
+ coeff = weight_decay
if not isinstance(coeff, float) and \
not isinstance(coeff, framework.Variable):
raise TypeError("coeff should be float or Tensor.")
self._params_name = set()
self._apply_decay_param_fun = apply_decay_param_fun
self._coeff = coeff
- super(DecoupledWeightDecay, self).__init__(**kwargs)
+ super(AdamW, self).__init__(
+ learning_rate=learning_rate,
+ parameters=parameters,
+ beta1=beta1,
+ beta2=beta2,
+ epsilon=epsilon,
+ grad_clip=grad_clip,
+ name=name,
+ lazy_mode=lazy_mode)
def _scale_parameters(self, params_and_grads):
"""
...@@ -40,8 +150,6 @@ class DecoupledWeightDecay(object):
Raises:
Exception: The type of coeff and parameter is not consistent.
"""
- if isinstance(self._coeff, float) and self._coeff == 0.0:
- return
scaled_params = []
for param, grad in params_and_grads:
...@@ -58,20 +166,19 @@ class DecoupledWeightDecay(object):
else:
assert self._coeff.dtype == param.dtype, \
"the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype)
+ if isinstance(self._learning_rate, float):
+ learning_rate = self._learning_rate
+ else:
+ self._learning_rate()
with param.block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
- assert param.name not in self._params_name
- scaled_params.append((param, grad, param * self._coeff))
+ if param.name not in self._params_name:
+ scaled_params.append(
+ (param, grad, param * self._coeff * learning_rate))
self._params_name.add(param.name)
- param = param * self._coeff
return scaled_params
- def backward(self, **kargs):
- return super(DecoupledWeightDecay, self).backward(**kargs)
- def _apply_optimize(self, **kargs):
- return super(DecoupledWeightDecay, self)._apply_optimize(**kargs)
def minimize(self,
loss,
startup_program=None,
...@@ -116,118 +223,9 @@ class DecoupledWeightDecay(object):
[param, grad]), framework.name_scope('weight decay'):
updated_param = paddle.fluid.layers.elementwise_sub(
x=param, y=scaled_param)
- paddle.fluid.layers.assign(input=updated_param, output=param)
+ param.set_value(updated_param.numpy())
optimize_ops = self._apply_optimize(
loss=None, startup_program=None, params_grads=params_grads)
def __str__(self):
return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
- class AdamW(DecoupledWeightDecay, Adam):
- """
- The AdamW optimizer is implemented based on the AdamW Optimization
- in paper `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.
- it can resolves the problem of L2 regularization failure in the Adam optimizer.
- .. math::
- t & = t + 1
- moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
- moemnt\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
- learning\_rate & = learning\_rate * \\
- \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {beta}_1^t}
- param\_out & = param - learning\_rate * (\\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param)
- Args:
- learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
- It can be a float value or a LearningRateDecay. The default value is 0.001.
- parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
- This parameter is required in dygraph mode. \
- The default value is None in static mode, at this time all parameters will be updated.
- beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
- It should be a float number or a Tensor with shape [1] and data type as float32.
- The default value is 0.9.
- beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
- It should be a float number or a Tensor with shape [1] and data type as float32.
- The default value is 0.999.
- epsilon (float, optional): A small float value for numerical stability.
- weight_decay (float|Tensor): The weight decay coefficient, it can be float or Tensor. The default value is 0.0.
- The default value is 1e-08.
- apply_decay_param_fun (function|None): If it is not None,
- only tensors that makes apply_decay_param_fun(Tensor)==True
- will be updated. It only works when we want to specify tensors.
- Default: None.
- grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
- some derived class of ``GradientClipBase`` . There are three cliping strategies
- ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
- :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
- name (str, optional): Normally there is no need for user to set this property.
- For more information, please refer to :ref:`api_guide_Name`.
- The default value is None.
- lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
- The accumulators are updated at every step. Every element of the two moving-average
- is updated in both dense mode and sparse mode. If the size of parameter is very large,
- then the update may be very slow. The lazy mode only update the element that has
- gradient in current mini-batch, so it will be much more faster. But this mode has
- different semantics with the original Adam algorithm and may lead to different result.
- The default value is False.
- **Notes**:
- **Currently, AdamW doesn't support sparse parameter optimization.**
- Examples:
- .. code-block:: python
- import paddle
- import numpy as np
- paddle.disable_static()
- inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
- linear = paddle.nn.Linear(10, 10)
- inp = paddle.to_tensor(inp)
- out = linear(inp)
- loss = paddle.mean(out)
- beta1 = paddle.to_tensor([0.9], dtype="float32")
- beta2 = paddle.to_tensor([0.99], dtype="float32")
- adam = paddle.optimizer.AdamW(learning_rate=0.1,
- parameters=linear.parameters(),
- beta1=beta1,
- beta2=beta2,
- weight_decay=0.01)
- out.backward()
- adam.step()
- adam.clear_grad()
- """
- def __init__(self,
- learning_rate=0.001,
- parameters=None,
- beta1=0.9,
- beta2=0.999,
- epsilon=1e-8,
- weight_decay=0.0,
- apply_decay_param_fun=None,
- grad_clip=None,
- name=None,
- lazy_mode=False):
- args_dict = {
- "learning_rate": learning_rate,
- "parameters": parameters,
- "beta1": beta1,
- "beta2": beta2,
- "epsilon": epsilon,
- "grad_clip": grad_clip,
- "name": name,
- "lazy_mode": lazy_mode
- }
- super(AdamW, self).__init__(
- weight_decay,
- apply_decay_param_fun=apply_decay_param_fun,
- **args_dict)
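The `_scale_parameters`/`minimize` path above realizes the decoupled decay by scaling each parameter by `coeff * learning_rate` and subtracting that term alongside the regular Adam update. A small NumPy sketch of one update, following the docstring formula rather than the actual Paddle kernel; all numbers are illustrative:

```python
import numpy as np

def adamw_step(param, grad, m, v, t, lr=0.1, beta1=0.9, beta2=0.999,
               eps=1e-8, coeff=0.01):
    # Moment estimates, as in the .. math:: block above.
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad * grad
    # Bias-corrected step size.
    lr_t = lr * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
    # Decoupled weight decay: the coeff * param term is added to the update
    # itself instead of being folded into the gradient (plain L2 regularization).
    param = param - lr_t * (m / (np.sqrt(v) + eps) + coeff * param)
    return param, m, v

p, m, v = np.array([1.0]), np.zeros(1), np.zeros(1)
p, m, v = adamw_step(p, np.array([0.5]), m, v, t=1)
print(p)  # the parameter shrinks by lr_t * coeff * p on top of the Adam step
```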
...@@ -80,7 +80,6 @@ class Optimizer(object):
.. code-block:: python
#Take the subclass adam as an example
- #Optimizer
import paddle
import numpy as np
...@@ -170,7 +169,7 @@ class Optimizer(object):
import paddle
paddle.disable_static()
- emb = paddle.nn.Embedding([10, 10])
+ emb = paddle.nn.Embedding(10, 10)
adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
state_dict = adam.state_dict()
...@@ -200,7 +199,7 @@ class Optimizer(object):
import paddle
paddle.disable_static()
- emb = paddle.nn.Embedding([10, 10])
+ emb = paddle.nn.Embedding(10, 10)
state_dict = emb.state_dict()
paddle.framework.save(state_dict, "paddle_dy")
...@@ -215,6 +214,8 @@ class Optimizer(object):
adam.set_state_dict(opti_state_dict)
'''
+ if isinstance(self._learning_rate, _LRScheduler):
+ self._learning_rate.set_dict(state_dict["LR_Scheduler"])
if isinstance(self._learning_rate, _LRScheduler):
self._learning_rate.set_state_dict(state_dict["LR_Scheduler"])
...@@ -270,6 +271,7 @@ class Optimizer(object):
main_prog = framework.default_main_program()
main_prog.lr_sheduler = self._learning_rate
main_prog.lr_var = lr_var
self._learning_rate_map[framework.default_main_program(
)] = lr_var
...@@ -300,7 +302,7 @@ class Optimizer(object):
this API cannot be invoked, because it will lead to conflict.
Args:
- value (float|Tensor): the value of learning rate
+ value (float): the value of learning rate
Returns:
None
...@@ -358,6 +360,7 @@ class Optimizer(object):
Get current step learning rate. The return value is all the same When _LRScheduler is not used,
otherwise return the current step learning rate.
Returns:
float: The learning rate of the current step.
...@@ -368,7 +371,7 @@ class Optimizer(object):
import paddle
# example1: _LRScheduler is not used, return value is all the same
paddle.disable_static()
- emb = paddle.nn.Embedding([10, 10])
+ emb = paddle.nn.Embedding(10, 10)
adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters())
lr = adam.get_lr()
print(lr) # 0.001
...@@ -655,7 +658,7 @@ class Optimizer(object):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
- linear = paddle.nn.Linear(13, 5, dtype="float32")
+ linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate = 0.01,
parameters = linear.parameters())
...@@ -798,7 +801,7 @@ class Optimizer(object):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
- linear = paddle.nn.Linear(13, 5, dtype="float32")
+ linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate = 0.01,
parameters = linear.parameters())
...@@ -836,7 +839,7 @@ class Optimizer(object):
tuple: tuple (optimize_ops, params_grads), A list of operators appended
by minimize and a list of (param, grad) tensor pairs, param is
``Parameter``, grad is the gradient value corresponding to the parameter.
- The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
+ In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
indicate program pruning. If so, the program will be pruned by ``feed`` and
``fetch_list`` before run, see details in ``Executor``.
...@@ -844,28 +847,25 @@ class Optimizer(object):
.. code-block:: python
import paddle
- import paddle.fluid as fluid
- place = fluid.CPUPlace()
- main = fluid.Program()
- with fluid.program_guard(main):
- x = fluid.data(name='x', shape=[None, 13], dtype='float32')
- y = fluid.data(name='y', shape=[None, 1], dtype='float32')
- y_predict = fluid.layers.fc(input=x, size=1, act=None)
- cost = fluid.layers.square_error_cost(input=y_predict, label=y)
- avg_cost = fluid.layers.mean(cost)
- adam_optimizer = paddle.optimizer.Adam(0.01)
- adam_optimizer.minimize(avg_cost)
- fetch_list = [avg_cost]
- train_reader = paddle.batch(
- paddle.dataset.uci_housing.train(), batch_size=1)
- feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
- exe = fluid.Executor(place)
- exe.run(fluid.default_startup_program())
- for data in train_reader():
- exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
+ import numpy as np
+ paddle.disable_static()
+ inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+ linear = paddle.nn.Linear(10, 10)
+ inp = paddle.to_tensor(inp)
+ out = linear(inp)
+ loss = paddle.mean(out)
+ beta1 = paddle.to_tensor([0.9], dtype="float32")
+ beta2 = paddle.to_tensor([0.99], dtype="float32")
+ adam = paddle.optimizer.Adam(learning_rate=0.1,
+ parameters=linear.parameters(),
+ weight_decay=0.01)
+ out.backward()
+ adam.minimize(loss)
+ adam.clear_grad()
"""
assert isinstance(loss, Variable), "The loss should be an Tensor."
...@@ -885,7 +885,7 @@ class Optimizer(object):
@framework.dygraph_only
def step(self):
"""
- Execute the optimizer once.
+ Execute the optimizer and update parameters once.
Returns:
None
...@@ -898,7 +898,7 @@ class Optimizer(object):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
- linear = paddle.nn.Linear(13, 5, dtype="float32")
+ linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate = 0.01,
parameters = linear.parameters())
......
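The `set_lr`/`get_lr` docstrings above now document `value` as a plain float and describe `get_lr` as returning the rate of the current step when an `_LRScheduler` drives training. A short sketch with a constant float learning rate only (scheduler classes are outside this patch, so none is used here):

```python
import paddle

paddle.disable_static()
linear = paddle.nn.Linear(10, 10)
adam = paddle.optimizer.Adam(0.001, parameters=linear.parameters())

print(adam.get_lr())  # 0.001, the constant rate passed at construction

# Per the updated docstring, set_lr takes a float; it overrides the rate
# used by subsequent optimizer steps.
adam.set_lr(0.01)
print(adam.get_lr())  # 0.01
```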
...@@ -69,8 +69,8 @@ class RMSProp(Optimizer):
Parameters:
- learning_rate (float|LearningRateDecay): The learning rate used to update ``Parameter``.
- It can be a float value or a LearningRateDecay.
+ learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``.
+ It can be a float value or a _LRScheduler.
rho(float): rho is :math: `\\rho` in equation, default is 0.95.
epsilon(float): :math: `\\epsilon` in equation is smoothing term to
avoid division by zero, default is 1e-6.
...@@ -80,7 +80,7 @@ class RMSProp(Optimizer):
the gradient; if False, by the uncentered second moment. Setting this to
True may help with training, but is slightly more expensive in terms of
computation and memory. Defaults to False.
- parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+ parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
...@@ -147,6 +147,12 @@ class RMSProp(Optimizer):
raise ValueError("epsilon is not set.")
if momentum is None:
raise ValueError("momentum is not set.")
+ if not 0.0 <= epsilon:
+ raise ValueError("Invalid value of epsilon, expect epsilon >= 0.")
+ if not 0.0 <= momentum:
+ raise ValueError("Invalid value of momentum, expect momentum >= 0.")
+ if not 0.0 <= rho:
+ raise ValueError("Invalid value of rho, expect rho >= 0.")
super(RMSProp, self).__init__(
learning_rate=learning_rate,
......