Unverified commit 2a771c06 authored by W wangguanzhong, committed by GitHub

support params groups, test=develop (#32830)

* support params groups, test=develop

* simplify updating opt attr

* update according to review
Parent cf9a4bd0
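Before the per-file hunks, here is a minimal sketch of the parameter-groups usage this commit enables, adapted from the tests and docstring examples below; it assumes dygraph mode and a Paddle build that includes this change. Keys other than 'params' in a group dict override the optimizer-level defaults for that group, and a per-group 'learning_rate' scales the base learning rate.

# Minimal sketch of the parameter-groups API added in this commit (dygraph mode assumed).
import numpy as np
import paddle

linear_1 = paddle.nn.Linear(13, 5)
linear_2 = paddle.nn.Linear(5, 3)
inp = paddle.to_tensor(np.arange(26).reshape(2, 13).astype("float32"))

# Each dict is one parameter group; keys other than 'params' override the
# optimizer-level settings for that group only.
adam = paddle.optimizer.Adam(
    learning_rate=0.01,
    parameters=[{
        'params': linear_1.parameters()
    }, {
        'params': linear_2.parameters(),
        'weight_decay': 0.001,  # group-specific weight decay
        'beta1': 0.8            # group-specific beta1
    }],
    weight_decay=0.1)

out = linear_2(linear_1(inp))
out.backward()
adam.step()
adam.clear_grad()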
@@ -127,6 +127,7 @@ class TestAdadeltaV2(unittest.TestCase):
adam.clear_gradients()
def test_adadelta(self):
paddle.enable_static()
place = fluid.CPUPlace()
main = fluid.Program()
with fluid.program_guard(main):
@@ -159,5 +160,29 @@ class TestAdadeltaV2(unittest.TestCase):
epsilon=None)
class TestAdadeltaV2Group(TestAdadeltaV2):
def test_adadelta_dygraph(self):
paddle.disable_static(paddle.CPUPlace())
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear_1 = paddle.nn.Linear(13, 5)
linear_2 = paddle.nn.Linear(5, 5)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adadelta(
learning_rate=0.01,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
}],
weight_decay=0.1)
out = linear_1(a)
out = linear_2(out)
out.backward()
adam.step()
adam.clear_gradients()
if __name__ == "__main__":
unittest.main()
@@ -37,5 +37,28 @@ class TestAdagradOpV2(unittest.TestCase):
adagrad.clear_grad()
class TestAdagradOpV2Group(TestAdagradOpV2):
def test_v20_coverage(self):
paddle.disable_static()
inp = paddle.rand(shape=[10, 10])
linear_1 = paddle.nn.Linear(10, 10)
linear_2 = paddle.nn.Linear(10, 10)
out = linear_1(inp)
out = linear_2(out)
loss = paddle.mean(out)
adagrad = paddle.optimizer.Adagrad(
learning_rate=0.01,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
}],
weight_decay=0.1)
out.backward()
adagrad.step()
adagrad.clear_grad()
if __name__ == "__main__":
unittest.main()
@@ -810,5 +810,31 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
paddle.enable_static()
class TestAdamOpV2Group(TestAdamOpV2):
def test_adam_op(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear_1 = paddle.nn.Linear(13, 5)
linear_2 = paddle.nn.Linear(5, 3)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(
learning_rate=0.01,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
'beta1': 0.1,
'beta2': 0.99
}],
weight_decay=0.1)
out = linear_1(a)
out = linear_2(out)
out.backward()
adam.step()
adam.clear_gradients()
if __name__ == "__main__":
unittest.main()
@@ -37,6 +37,7 @@ class TestAdamaxAPI(unittest.TestCase):
adam.clear_gradients()
def test_adamax_api(self):
paddle.enable_static()
place = fluid.CPUPlace()
shape = [2, 3, 8, 8]
exe = fluid.Executor(place)
@@ -63,5 +64,31 @@ class TestAdamaxAPI(unittest.TestCase):
assert rets[0] is not None
class TestAdamaxAPIGroup(TestAdamaxAPI):
def test_adamax_api_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear_1 = paddle.nn.Linear(13, 5)
linear_2 = paddle.nn.Linear(5, 3)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adamax(
learning_rate=0.01,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
'beta1': 0.1,
'beta2': 0.99
}],
weight_decay=0.1)
out = linear_1(a)
out = linear_2(out)
out.backward()
adam.step()
adam.clear_gradients()
if __name__ == "__main__":
unittest.main()
@@ -121,5 +121,31 @@ class TestAdamWOp(unittest.TestCase):
adam.clear_gradients()
class TestAdamWOpGroup(TestAdamWOp):
def test_adamw_op_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear_1 = paddle.nn.Linear(13, 5)
linear_2 = paddle.nn.Linear(5, 3)
adam = paddle.optimizer.AdamW(
learning_rate=0.01,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001
}],
apply_decay_param_fun=lambda name: True,
weight_decay=0.01)
for _ in range(2):
out = linear_1(a)
out = linear_2(out)
out.backward()
adam.step()
adam.clear_gradients()
if __name__ == "__main__":
unittest.main()
@@ -155,5 +155,31 @@ class TestLambOpWithCombinedOp(unittest.TestCase):
self.assertTrue(np.allclose(out, output))
class TestLambOpV2Group(TestLambOpV2):
def test_lamb_op(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear_1 = paddle.nn.Linear(13, 5)
linear_2 = paddle.nn.Linear(5, 3)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Lamb(
learning_rate=0.01,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'lamb_weight_decay': 0.001,
'beta1': 0.9,
'beta2': 0.99
}],
lamb_weight_decay=0.01)
out = linear_1(a)
out = linear_2(out)
out.backward()
adam.step()
adam.clear_gradients()
if __name__ == "__main__":
unittest.main()
@@ -610,5 +610,32 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
self.__test_vs(place=place)
class TestMomentumV2Group(TestMomentumV2):
def test_momentum_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear_1 = paddle.nn.Linear(13, 5)
linear_2 = paddle.nn.Linear(5, 3)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Momentum(
learning_rate=0.01,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
'learning_rate': 0.1,
'momentum': 0.99
}],
weight_decay=0.1,
momentum=0.9)
out = linear_1(a)
out = linear_2(out)
out.backward()
adam.step()
adam.clear_gradients()
if __name__ == "__main__":
unittest.main()
@@ -240,6 +240,7 @@ class TestRMSPropV2(unittest.TestCase):
adam.clear_gradients()
def test_rmsprop(self):
paddle.enable_static()
place = fluid.CPUPlace()
main = fluid.Program()
with fluid.program_guard(main):
@@ -290,5 +291,29 @@ class TestRMSPropV2(unittest.TestCase):
0.1, rho=-1, parameters=linear.parameters())
class TestRMSPropV2Group(TestRMSPropV2):
def test_rmsprop_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear_1 = paddle.nn.Linear(13, 5)
linear_2 = paddle.nn.Linear(5, 3)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.RMSProp(
learning_rate=0.01,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001
}],
weight_decay=0.01)
out = linear_1(a)
out = linear_2(out)
out.backward()
adam.step()
adam.clear_gradients()
if __name__ == "__main__":
unittest.main()
@@ -225,6 +225,7 @@ class TestSGDV2(unittest.TestCase):
adam.clear_gradients()
def test_sgd(self):
paddle.enable_static()
place = fluid.CPUPlace()
main = fluid.Program()
with fluid.program_guard(main):
@@ -250,5 +251,29 @@ class TestSGDV2(unittest.TestCase):
self.assertRaises(ValueError, paddle.optimizer.SGD, learning_rate=None)
class TestSGDV2Group(TestSGDV2):
def test_sgd_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear_1 = paddle.nn.Linear(13, 5)
linear_2 = paddle.nn.Linear(5, 3)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.SGD(learning_rate=0.01,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
'learning_rate': 0.1
}],
weight_decay=0.01)
out = linear_1(a)
out = linear_2(out)
out.backward()
adam.step()
adam.clear_gradients()
if __name__ == "__main__":
unittest.main()
@@ -43,7 +43,10 @@ class Adadelta(Optimizer):
epsilon (float): a small float number for numeric stability. Default 1.0e-6.
rho (float): a floating point value indicating the decay rate. Default 0.95.
parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. And you can specify different options for \
different parameter groups such as the learning rate, weight decay, etc., \
then the parameters are a list of dict. Note that the learning_rate in parameter groups \
represents the scale of base learning_rate. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
It can be a float value as coeff of L2 regularization or \
@@ -77,6 +80,27 @@ class Adadelta(Optimizer):
adadelta.step()
adadelta.clear_grad()
#Note that the learning_rate of linear_2 is 0.01.
linear_1 = paddle.nn.Linear(10, 10)
linear_2 = paddle.nn.Linear(10, 10)
inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
out = linear_1(inp)
out = linear_2(out)
loss = paddle.mean(out)
adadelta = paddle.optimizer.Adadelta(
learning_rate=0.1,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
'learning_rate': 0.1,
}],
weight_decay=0.01)
out.backward()
adadelta.step()
adadelta.clear_grad()
""" """
_avg_squared_grad_acc_str = "_avg_squared_grad" _avg_squared_grad_acc_str = "_avg_squared_grad"
...@@ -105,10 +129,16 @@ class Adadelta(Optimizer): ...@@ -105,10 +129,16 @@ class Adadelta(Optimizer):
self.type = "adadelta" self.type = "adadelta"
self._epsilon = epsilon self._epsilon = epsilon
self._rho = rho self._rho = rho
self._default_dict = {
'epsilon': epsilon,
'rho': rho,
}
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block): if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.") raise TypeError("block is not instance of framework.Block.")
if isinstance(parameters, dict):
parameters = parameters.get('params')
for p in parameters: for p in parameters:
self._add_accumulator(self._avg_squared_grad_acc_str, p) self._add_accumulator(self._avg_squared_grad_acc_str, p)
...@@ -118,6 +148,9 @@ class Adadelta(Optimizer): ...@@ -118,6 +148,9 @@ class Adadelta(Optimizer):
if not isinstance(block, framework.Block): if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.") raise TypeError("block is not instance of framework.Block.")
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
avg_squared_grad_acc = self._get_accumulator( avg_squared_grad_acc = self._get_accumulator(
self._avg_squared_grad_acc_str, param_and_grad[0]) self._avg_squared_grad_acc_str, param_and_grad[0])
avg_squared_update_acc = self._get_accumulator( avg_squared_update_acc = self._get_accumulator(
...@@ -142,3 +175,9 @@ class Adadelta(Optimizer): ...@@ -142,3 +175,9 @@ class Adadelta(Optimizer):
stop_gradient=True) stop_gradient=True)
return adadelta_op return adadelta_op
def _update_param_group(self, parameters):
self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
self._rho = parameters.get('rho', self._default_dict['rho'])
parameters = parameters.get('params')
return parameters
@@ -46,7 +46,10 @@ class Adagrad(Optimizer):
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-06.
parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. And you can specify different options for \
different parameter groups such as the learning rate, weight decay, etc., \
then the parameters are a list of dict. Note that the learning_rate in parameter groups \
represents the scale of base learning_rate. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
It can be a float value as coeff of L2 regularization or \
@@ -81,6 +84,27 @@ class Adagrad(Optimizer):
adagrad.step()
adagrad.clear_grad()
#Note that the learning_rate of linear_2 is 0.01.
linear_1 = paddle.nn.Linear(10, 10)
linear_2 = paddle.nn.Linear(10, 10)
inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
out = linear_1(inp)
out = linear_2(out)
loss = paddle.mean(out)
adagrad = paddle.optimizer.Adagrad(
learning_rate=0.1,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
'learning_rate': 0.1,
}],
weight_decay=0.01)
out.backward()
adagrad.step()
adagrad.clear_grad()
""" """
_moment_acc_str = "moment" _moment_acc_str = "moment"
...@@ -103,10 +127,17 @@ class Adagrad(Optimizer): ...@@ -103,10 +127,17 @@ class Adagrad(Optimizer):
self.type = "adagrad" self.type = "adagrad"
self._epsilon = epsilon self._epsilon = epsilon
self.initial_accumulator_value = initial_accumulator_value self.initial_accumulator_value = initial_accumulator_value
self._default_dict = {
'epsilon': epsilon,
'initial_accumulator_value': initial_accumulator_value,
}
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
if isinstance(parameters, dict):
parameters = self._update_param_group(parameters)
for p in parameters: for p in parameters:
self._add_accumulator( self._add_accumulator(
self._moment_acc_str, self._moment_acc_str,
...@@ -116,6 +147,9 @@ class Adagrad(Optimizer): ...@@ -116,6 +147,9 @@ class Adagrad(Optimizer):
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
moment_acc = self._get_accumulator(self._moment_acc_str, moment_acc = self._get_accumulator(self._moment_acc_str,
param_and_grad[0]) param_and_grad[0])
# Create the adagrad optimizer op # Create the adagrad optimizer op
...@@ -133,3 +167,11 @@ class Adagrad(Optimizer): ...@@ -133,3 +167,11 @@ class Adagrad(Optimizer):
stop_gradient=True) stop_gradient=True)
return adagrad_op return adagrad_op
def _update_param_group(self, parameters):
self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
self.initial_accumulator_value = parameters.get(
'initial_accumulator_value',
self._default_dict['initial_accumulator_value'])
parameters = parameters.get('params')
return parameters
@@ -21,6 +21,7 @@ from ..fluid import unique_name
from ..fluid.layer_helper import LayerHelper
import warnings
from ..fluid.dygraph import base as imperative_base
from collections import defaultdict
import paddle
@@ -64,7 +65,10 @@ class Adam(Optimizer):
It should be a float number or a Tensor with shape [1] and data type as float32.
The default value is 1e-08.
parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. And you can specify different options for \
different parameter groups such as the learning rate, weight decay, etc., \
then the parameters are a list of dict. Note that the learning_rate in parameter groups \
represents the scale of base learning_rate. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
It can be a float value as coeff of L2 regularization or \
@@ -126,6 +130,29 @@ class Adam(Optimizer):
adam.step()
adam.clear_grad()
#Note that the learning_rate of linear_2 is 0.01.
linear_1 = paddle.nn.Linear(10, 10)
linear_2 = paddle.nn.Linear(10, 10)
inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
out = linear_1(inp)
out = linear_2(out)
loss = paddle.mean(out)
adam = paddle.optimizer.Adam(
learning_rate=0.1,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
'learning_rate': 0.1,
'beta1': 0.8
}],
weight_decay=0.01,
beta1=0.9)
out.backward()
adam.step()
adam.clear_grad()
""" """
_moment1_acc_str = "moment1" _moment1_acc_str = "moment1"
_moment2_acc_str = "moment2" _moment2_acc_str = "moment2"
...@@ -172,6 +199,12 @@ class Adam(Optimizer): ...@@ -172,6 +199,12 @@ class Adam(Optimizer):
self._lazy_mode = lazy_mode self._lazy_mode = lazy_mode
self._multi_precision = multi_precision self._multi_precision = multi_precision
self._master_weights = {} self._master_weights = {}
self._default_dict = {
'beta1': beta1,
'beta2': beta2,
'epsilon': epsilon,
'lazy_mode': lazy_mode,
}
def _create_master_weight(self, param): def _create_master_weight(self, param):
assert isinstance(self.helper, LayerHelper) assert isinstance(self.helper, LayerHelper)
...@@ -241,6 +274,8 @@ class Adam(Optimizer): ...@@ -241,6 +274,8 @@ class Adam(Optimizer):
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
if isinstance(parameters, dict):
parameters = self._update_param_group(parameters)
# Create accumulator tensors for first and second moments # Create accumulator tensors for first and second moments
for p in parameters: for p in parameters:
...@@ -257,6 +292,8 @@ class Adam(Optimizer): ...@@ -257,6 +292,8 @@ class Adam(Optimizer):
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
moment1 = self._get_accumulator(self._moment1_acc_str, moment1 = self._get_accumulator(self._moment1_acc_str,
param_and_grad[0]) param_and_grad[0])
...@@ -274,6 +311,7 @@ class Adam(Optimizer): ...@@ -274,6 +311,7 @@ class Adam(Optimizer):
# create the adam optimize op # create the adam optimize op
if framework.in_dygraph_mode(): if framework.in_dygraph_mode():
_beta1 = self._beta1 if not isinstance( _beta1 = self._beta1 if not isinstance(
self._beta1, Variable) else self._beta1.numpy().item(0) self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance( _beta2 = self._beta2 if not isinstance(
...@@ -359,6 +397,7 @@ class Adam(Optimizer): ...@@ -359,6 +397,7 @@ class Adam(Optimizer):
adam.step() adam.step()
adam.clear_grad() adam.clear_grad()
""" """
if not isinstance(self._parameter_list[0], dict):
params_grads = [] params_grads = []
for param in self._parameter_list: for param in self._parameter_list:
if param.stop_gradient: if param.stop_gradient:
...@@ -374,3 +413,27 @@ class Adam(Optimizer): ...@@ -374,3 +413,27 @@ class Adam(Optimizer):
optimize_ops = self._apply_optimize( optimize_ops = self._apply_optimize(
loss=None, startup_program=None, params_grads=params_grads) loss=None, startup_program=None, params_grads=params_grads)
else:
# optimize parameters in groups
for param_group in self._param_groups:
params_grads = defaultdict(lambda: list())
for param in param_group['params']:
if param.stop_gradient:
continue
if param._grad_ivar() is not None:
grad_var = param._grad_ivar()
params_grads['params'].append((param, grad_var))
params_grads.update(
{k: v
for k, v in param_group.items() if k != 'params'})
self._apply_optimize(
loss=None, startup_program=None, params_grads=params_grads)
def _update_param_group(self, parameters):
self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
self._beta2 = parameters.get('beta2', self._default_dict['beta2'])
self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
self._lazy_mode = parameters.get('lazy_mode',
self._default_dict['lazy_mode'])
parameters = parameters.get('params')
return parameters
@@ -56,7 +56,10 @@ class Adamax(Optimizer):
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-08.
parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. And you can specify different options for \
different parameter groups such as the learning rate, weight decay, etc., \
then the parameters are a list of dict. Note that the learning_rate in parameter groups \
represents the scale of base learning_rate. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
It can be a float value as coeff of L2 regularization or \
@@ -100,6 +103,29 @@ class Adamax(Optimizer):
adam.step()
adam.clear_grad()
#Note that the learning_rate of linear_2 is 0.01.
linear_1 = paddle.nn.Linear(10, 10)
linear_2 = paddle.nn.Linear(10, 10)
inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
out = linear_1(inp)
out = linear_2(out)
loss = paddle.mean(out)
adam = paddle.optimizer.Adamax(
learning_rate=0.1,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
'learning_rate': 0.1,
'beta1': 0.8
}],
weight_decay=0.01,
beta1=0.9)
out.backward()
adam.step()
adam.clear_grad()
""" """
_moment_acc_str = "moment" _moment_acc_str = "moment"
_inf_norm_acc_str = "inf_norm" _inf_norm_acc_str = "inf_norm"
...@@ -134,8 +160,16 @@ class Adamax(Optimizer): ...@@ -134,8 +160,16 @@ class Adamax(Optimizer):
self._beta1 = beta1 self._beta1 = beta1
self._beta2 = beta2 self._beta2 = beta2
self._epsilon = epsilon self._epsilon = epsilon
self._default_dict = {
'beta1': beta1,
'beta2': beta2,
'epsilon': epsilon
}
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
if isinstance(parameters, dict):
parameters = self._update_param_group(parameters)
# Create accumulator tensors for first moment and infinity norm # Create accumulator tensors for first moment and infinity norm
for p in parameters: for p in parameters:
self._add_accumulator(self._moment_acc_str, p) self._add_accumulator(self._moment_acc_str, p)
...@@ -148,6 +182,8 @@ class Adamax(Optimizer): ...@@ -148,6 +182,8 @@ class Adamax(Optimizer):
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0]) moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
inf_norm = self._get_accumulator(self._inf_norm_acc_str, inf_norm = self._get_accumulator(self._inf_norm_acc_str,
...@@ -183,16 +219,40 @@ class Adamax(Optimizer): ...@@ -183,16 +219,40 @@ class Adamax(Optimizer):
"""Update Beta1 Power accumulator """Update Beta1 Power accumulator
""" """
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
if isinstance(parameters_and_grads, list):
for param, grad in parameters_and_grads: for param, grad in parameters_and_grads:
if grad is None or param.stop_gradient is True: if grad is None or param.stop_gradient is True:
continue continue
with param.block.program._optimized_guard( with param.block.program._optimized_guard(
[param, grad]), name_scope('adamax'): [param, grad]), name_scope('adamax'):
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, beta1_pow_acc = self._get_accumulator(
param) self._beta1_pow_acc_str, param)
block.append_op(
type="scale",
inputs={"X": beta1_pow_acc},
outputs={"Out": beta1_pow_acc},
attrs={"scale": self._beta1},
stop_gradient=True)
else:
for param, grad in parameters_and_grads['params']:
if grad is None or param.stop_gradient is True:
continue
with param.block.program._optimized_guard(
[param, grad]), name_scope('adamax'):
beta1_pow_acc = self._get_accumulator(
self._beta1_pow_acc_str, param)
self._beta1 = parameters_and_grads.get(
'beta1', self._default_dict['beta1'])
block.append_op(
type="scale",
inputs={"X": beta1_pow_acc},
outputs={"Out": beta1_pow_acc},
attrs={"scale": self._beta1},
stop_gradient=True)
def _update_param_group(self, parameters):
self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
self._beta2 = parameters.get('beta2', self._default_dict['beta2'])
self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
parameters = parameters.get('params')
return parameters
@@ -46,7 +46,10 @@ class AdamW(Adam):
learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a LRScheduler. The default value is 0.001.
parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. And you can specify different options for \
different parameter groups such as the learning rate, weight decay, etc., \
then the parameters are a list of dict. Note that the learning_rate in parameter groups \
represents the scale of base learning_rate. \
The default value is None in static mode, at this time all parameters will be updated.
beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
It should be a float number or a Tensor with shape [1] and data type as float32.
@@ -101,6 +104,30 @@ class AdamW(Adam):
adam.step()
adam.clear_grad()
#Note that the learning_rate of linear_2 is 0.01.
linear_1 = paddle.nn.Linear(10, 10)
linear_2 = paddle.nn.Linear(10, 10)
inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
out = linear_1(inp)
out = linear_2(out)
loss = paddle.mean(out)
adam = paddle.optimizer.AdamW(
learning_rate=0.1,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
'learning_rate': 0.1,
'beta1': 0.8
}],
weight_decay=0.01,
beta1=0.9)
out.backward()
adam.step()
adam.clear_grad()
""" """
def __init__(self, def __init__(self,
...@@ -143,6 +170,7 @@ class AdamW(Adam): ...@@ -143,6 +170,7 @@ class AdamW(Adam):
name=name, name=name,
lazy_mode=lazy_mode, lazy_mode=lazy_mode,
multi_precision=multi_precision) multi_precision=multi_precision)
self._default_dict = {'coeff': coeff}
def _append_decoupled_weight_decay(self, block, param_and_grad): def _append_decoupled_weight_decay(self, block, param_and_grad):
""" """
...@@ -156,7 +184,10 @@ class AdamW(Adam): ...@@ -156,7 +184,10 @@ class AdamW(Adam):
Raises: Raises:
Exception: The type of coeff and parameter is not consistent. Exception: The type of coeff and parameter is not consistent.
""" """
if not isinstance(param_and_grad, dict):
param, grad = param_and_grad param, grad = param_and_grad
else:
param, grad = self._update_param_group(param_and_grad)
if self._apply_decay_param_fun is not None \ if self._apply_decay_param_fun is not None \
and not self._apply_decay_param_fun(param.name): and not self._apply_decay_param_fun(param.name):
...@@ -207,3 +238,8 @@ class AdamW(Adam): ...@@ -207,3 +238,8 @@ class AdamW(Adam):
def __str__(self): def __str__(self):
return " ".join(["Weight Decay, params:", ",".join(self._params_name)]) return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
def _update_param_group(self, parameters):
self._coeff = parameters.get('coeff', self._default_dict['coeff'])
parameters = parameters.get('params')
return parameters
@@ -59,7 +59,10 @@ class Lamb(Optimizer):
Default 0.999.
epsilon (float, optional): A small float value for numerical stability. Default 1e-6.
parameters (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. And you can specify different options for \
different parameter groups such as the learning rate, weight decay, etc., \
then the parameters are a list of dict. Note that the learning_rate in parameter groups \
represents the scale of base learning_rate. \
The default value is None in static mode, at this time all parameters will be updated.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
@@ -83,6 +86,31 @@ class Lamb(Optimizer):
back = out.backward()
lamb.step()
lamb.clear_grad()
#Note that the learning_rate of linear_2 is 0.01.
linear_1 = paddle.nn.Linear(10, 10)
linear_2 = paddle.nn.Linear(10, 10)
inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
out = linear_1(inp)
out = linear_2(out)
loss = paddle.mean(out)
lamb = paddle.optimizer.Lamb(
learning_rate=0.1,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
'learning_rate': 0.1,
'lamb_weight_decay': 0.02
}],
weight_decay=0.01,
lamb_weight_decay=0.01)
out.backward()
lamb.step()
lamb.clear_grad()
""" """
_moment1_acc_str = "moment1" _moment1_acc_str = "moment1"
_moment2_acc_str = "moment2" _moment2_acc_str = "moment2"
...@@ -115,9 +143,18 @@ class Lamb(Optimizer): ...@@ -115,9 +143,18 @@ class Lamb(Optimizer):
self._epsilon = epsilon self._epsilon = epsilon
self._lamb_weight_decay = lamb_weight_decay self._lamb_weight_decay = lamb_weight_decay
self._exclude_from_weight_decay_fn = exclude_from_weight_decay_fn self._exclude_from_weight_decay_fn = exclude_from_weight_decay_fn
self._default_dict = {
'beta1': beta1,
'beta2': beta2,
'epsilon': epsilon,
'lamb_weight_decay': lamb_weight_decay,
'exclude_from_weight_decay_fn': exclude_from_weight_decay_fn,
}
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
if isinstance(parameters, dict):
parameters = self._update_param_group(parameters)
# Create accumulator tensors for first and second moments # Create accumulator tensors for first and second moments
for p in parameters: for p in parameters:
...@@ -140,6 +177,9 @@ class Lamb(Optimizer): ...@@ -140,6 +177,9 @@ class Lamb(Optimizer):
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
block.program._use_lamb = True block.program._use_lamb = True
moment1 = self._get_accumulator(self._moment1_acc_str, moment1 = self._get_accumulator(self._moment1_acc_str,
...@@ -199,3 +239,15 @@ class Lamb(Optimizer): ...@@ -199,3 +239,15 @@ class Lamb(Optimizer):
stop_gradient=True) stop_gradient=True)
return lamb_op return lamb_op
def _update_param_group(self, parameters):
self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
self._beta2 = parameters.get('beta2', self._default_dict['beta2'])
self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
self._lamb_weight_decay = parameters.get(
'lamb_weight_decay', self._default_dict['lamb_weight_decay'])
self._exclude_from_weight_decay_fn = parameters.get(
'exclude_from_weight_decay_fn',
self._default_dict['exclude_from_weight_decay_fn'])
parameters = parameters.get('params')
return parameters
@@ -51,8 +51,11 @@ class Momentum(Optimizer):
learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
momentum (float): Momentum factor. The default value is 0.9.
parameters (list|tuple, optional): List|Tuple of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. And you can specify different options for \
different parameter groups such as the learning rate, weight decay, etc., \
then the parameters are a list of dict. Note that the learning_rate in parameter groups \
represents the scale of base learning_rate. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
It can be a float value as coeff of L2 regularization or \
@@ -88,6 +91,29 @@ class Momentum(Optimizer):
back = out.backward()
momentum.step()
momentum.clear_grad()
#Note that the learning_rate of linear_2 is 0.01.
linear_1 = paddle.nn.Linear(10, 10)
linear_2 = paddle.nn.Linear(10, 10)
inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
out = linear_1(inp)
out = linear_2(out)
loss = paddle.mean(out)
momentum = paddle.optimizer.Momentum(
learning_rate=0.1,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
'learning_rate': 0.1
}],
weight_decay=0.01,
momentum=0.9)
out.backward()
momentum.step()
momentum.clear_grad()
""" """
_velocity_acc_str = "velocity" _velocity_acc_str = "velocity"
...@@ -105,7 +131,19 @@ class Momentum(Optimizer): ...@@ -105,7 +131,19 @@ class Momentum(Optimizer):
raise ValueError("learning_rate is not set") raise ValueError("learning_rate is not set")
if momentum is None: if momentum is None:
raise ValueError("momentum is not set") raise ValueError("momentum is not set")
predicate = lambda regular: isinstance(regular, (L2DecayRegularizer, float)) predicate = lambda regular: isinstance(regular, (L2DecayRegularizer, float))
if isinstance(parameters, list):
if isinstance(parameters[0], dict):
for param_group in parameters:
decay = param_group[
'weight_decay'] if 'weight_decay' in param_group else weight_decay
reg_method, reg_coeff = self._update_regularization(decay)
param_group['regularization_method'] = reg_method
param_group['regularization_coeff'] = reg_coeff
py_regular = None if predicate(decay) else decay
param_group['weight_decay'] = py_regular
py_regular = None if predicate(weight_decay) else weight_decay
super(Momentum, self).__init__(
learning_rate=learning_rate,
@@ -116,23 +154,42 @@ class Momentum(Optimizer):
self.type = "momentum"
self._momentum = momentum
self._use_nesterov = bool(use_nesterov)
self._regularization_method, self._regularization_coeff = self._update_regularization(
weight_decay)
self._multi_precision = multi_precision
self._rescale_grad = rescale_grad
self._master_weights = {}
self._default_dict = {
'momentum': momentum,
'use_nesterov': use_nesterov,
'rescale_grad': rescale_grad,
'regularization_method': self._regularization_method,
'regularization_coeff': self._regularization_coeff,
}
if framework.in_dygraph_mode():
self.helper = LayerHelper(self.__class__.__name__)
if isinstance(self._parameter_list[0], dict):
for parameters in self._param_groups:
for p in parameters['params']:
self._add_accumulator(self._velocity_acc_str, p)
else:
for p in parameters:
self._add_accumulator(self._velocity_acc_str, p)
def _update_regularization(self, weight_decay):
reg_method = ""
reg_coeff = 0
if (isinstance(weight_decay, L2DecayRegularizer)):
reg_method = "l2_decay"
reg_coeff = weight_decay._regularization_coeff
if (isinstance(weight_decay, float)):
reg_method = "l2_decay"
reg_coeff = weight_decay
return reg_method, reg_coeff
def _create_master_weight(self, param):
assert isinstance(self.helper, LayerHelper)
@@ -197,12 +254,16 @@ class Momentum(Optimizer):
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
velocity_acc = self._get_accumulator(self._velocity_acc_str,
param_and_grad[0])
lr = self._create_param_lr(param_and_grad)
if framework.in_dygraph_mode():
if isinstance(param_and_grad, dict):
self._update_regularization(param_and_grad['weight_decay'])
_, _ = core.ops.momentum(
param_and_grad[0], param_and_grad[1], velocity_acc, lr,
param_and_grad[0], velocity_acc, 'mu', self._momentum,
@@ -250,3 +311,18 @@ class Momentum(Optimizer):
stop_gradient=True)
return momentum_op
def _update_param_group(self, parameters):
self._momentum = parameters.get('momentum',
self._default_dict['momentum'])
self._use_nesterov = parameters.get('use_nesterov',
self._default_dict['use_nesterov'])
self._rescale_grad = parameters.get('rescale_grad',
self._default_dict['rescale_grad'])
self._regularization_method = parameters.get(
'regularization_method',
self._default_dict['regularization_method'])
self._regularization_coeff = parameters.get(
'regularization_coeff', self._default_dict['regularization_coeff'])
parameters = parameters.get('params')
return parameters
@@ -28,7 +28,7 @@ from ..fluid import layers
from ..fluid import unique_name
from ..fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
from ..fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops
from ..fluid.framework import program_guard, Parameter
from ..fluid.initializer import Constant
from ..fluid.layer_helper import LayerHelper
from ..fluid.layers import ops
@@ -41,6 +41,7 @@ from functools import reduce
from ..fluid.wrapped_decorator import signature_safe_contextmanager
from .. import compat as cpt
from .lr import LRScheduler
import copy
__all__ = []
@@ -56,7 +57,10 @@ class Optimizer(object):
learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``.
It can be a float value or any subclass of ``LRScheduler`` .
parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. And you can specify different options for \
different parameter groups such as the learning rate, weight decay, etc., \
then the parameters are a list of dict. Note that the learning_rate in parameter groups \
represents the scale of base learning_rate. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
It can be a float value as coeff of L2 regularization or \
@@ -91,6 +95,29 @@ class Optimizer(object):
adam.step()
adam.clear_grad()
#Take the subclass sgd as an example
#optimize parameters in linear_1 and linear2 in different options.
#Note that the learning_rate of linear_2 is 0.01.
linear_1 = paddle.nn.Linear(10, 10)
linear_2 = paddle.nn.Linear(10, 10)
inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
out = linear_1(inp)
out = linear_2(out)
loss = paddle.mean(out)
sgd = paddle.optimizer.SGD(
learning_rate=0.1,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
'learning_rate': 0.1
}],
weight_decay=0.01)
out.backward()
sgd.step()
sgd.clear_grad()
""" """
@imperative_base.no_grad @imperative_base.no_grad
...@@ -100,6 +127,7 @@ class Optimizer(object): ...@@ -100,6 +127,7 @@ class Optimizer(object):
weight_decay=None, weight_decay=None,
grad_clip=None, grad_clip=None,
name=None): name=None):
if parameters is not None: if parameters is not None:
# paddle.Tensor is also iterable, so here we don't check whether # paddle.Tensor is also iterable, so here we don't check whether
# the input is iterable, if the input is paddle.Tensor, the # the input is iterable, if the input is paddle.Tensor, the
...@@ -109,6 +137,11 @@ class Optimizer(object): ...@@ -109,6 +137,11 @@ class Optimizer(object):
"`parameters` argument given to the optimizer should be " "`parameters` argument given to the optimizer should be "
"an iterable of paddle Tensors, but got argument type is `{}`.". "an iterable of paddle Tensors, but got argument type is `{}`.".
format(type(parameters))) format(type(parameters)))
if isinstance(parameters, dict):
raise TypeError(
"`parameters` argument should not get dict type, "
"if parameter groups is needed, please set `parameters`"
" as list of dict")
self._parameter_list = list(parameters) self._parameter_list = list(parameters)
else: else:
self._parameter_list = None self._parameter_list = None
...@@ -120,14 +153,17 @@ class Optimizer(object): ...@@ -120,14 +153,17 @@ class Optimizer(object):
"parameters argument given to the Optimizer should not be None in dygraph mode." "parameters argument given to the Optimizer should not be None in dygraph mode."
) )
if weight_decay is not None: if weight_decay is not None:
if not isinstance(self._parameter_list[0], dict):
for param in self._parameter_list: for param in self._parameter_list:
if hasattr(param, if hasattr(
param,
'regularizer') and param.regularizer is not None: 'regularizer') and param.regularizer is not None:
logging.info( logging.info(
"If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. " "If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. "
"The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
% weight_decay.__str__()) % weight_decay.__str__())
break break
if not isinstance(learning_rate, (float, LRScheduler)): if not isinstance(learning_rate, (float, LRScheduler)):
raise TypeError( raise TypeError(
"learning rate should be float or LRScheduler, got %s here" % "learning rate should be float or LRScheduler, got %s here" %
...@@ -148,6 +184,12 @@ class Optimizer(object): ...@@ -148,6 +184,12 @@ class Optimizer(object):
self._dtype = None self._dtype = None
# Infer the dtype form parameter # Infer the dtype form parameter
if self._parameter_list: if self._parameter_list:
if isinstance(self._parameter_list[0], dict):
for param_group in self._parameter_list:
assert 'params' in param_group, \
'params should be set in parameters if parameter groups are optimized in different options'
self._dtype = self._parameter_list[0]['params'][0].dtype
else:
self._dtype = self._parameter_list[0].dtype
# each program should have a independent learning rate
@@ -163,6 +205,18 @@ class Optimizer(object):
self._accumulators_holder = {}
self._param_device_map = dict()
self.clear_gradients = self.clear_grad
self._default_dict = {
'learning_rate': self._learning_rate,
'weight_decay': self.regularization,
'grad_clip': self._grad_clip
}
self._param_groups = []
if self._parameter_list and isinstance(self._parameter_list[0], dict):
for param_group in self._parameter_list:
self._add_param_group(param_group.copy())
else:
self._param_groups = self._parameter_list
@framework.dygraph_only
def state_dict(self):
@@ -610,18 +664,45 @@ class Optimizer(object):
start = len(target_block.ops)
self.helper = LayerHelper(self.__class__.__name__)
params_grads_device_map = parameters_and_grads['params'] if isinstance(
parameters_and_grads, dict) else parameters_and_grads
self._update_param_device_map(params_grads_device_map, target_block)
if isinstance(parameters_and_grads, list):
self._create_accumulators(
target_block,
[p[0] for p in parameters_and_grads if not p[0].stop_gradient])
else:
params_acc_dict = parameters_and_grads.copy()
params_acc_dict['params'] = [
p[0] for p in params_acc_dict['params']
if not p[0].stop_gradient
]
self._create_accumulators(target_block, params_acc_dict)
self._create_global_learning_rate()
if framework.in_dygraph_mode():
if isinstance(parameters_and_grads, list):
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
continue
if param_and_grad[0].stop_gradient is False:
self._append_optimize_op(target_block, param_and_grad)
else:
for param_and_grad in parameters_and_grads['params']:
if param_and_grad[1] is None:
continue
if param_and_grad[0].stop_gradient is False:
param_grad_dict = dict()
param_grad_dict['params'] = param_and_grad
param_grad_dict.update({
k: v
for k, v in parameters_and_grads.items()
if k != 'params'
})
self._append_optimize_op(target_block, param_grad_dict)
else:
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
@@ -790,10 +871,19 @@ class Optimizer(object):
if framework.in_dygraph_mode():
with program_guard(framework.default_main_program(),
framework.default_startup_program()):
if isinstance(params_grads, list):
if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads)
params_grads = append_regularization_ops(
params_grads, self.regularization)
else:
grad_clip = params_grads['grad_clip']
if grad_clip is not None:
params_grads['params'] = grad_clip(params_grads[
'params'])
params_grads['params'] = append_regularization_ops(
params_grads['params'], self.regularization)
optimize_ops = self._create_optimization_pass(params_grads)
else:
program = loss.block.program
@@ -840,9 +930,16 @@ class Optimizer(object):
adam.clear_grad()
"""
if self._parameter_list is None or not isinstance(
self._parameter_list[0], dict):
for p in self._parameter_list:
if not p.stop_gradient:
p.clear_gradient()
else:
for param_group in self._param_groups:
for p in param_group['params']:
if not p.stop_gradient:
p.clear_gradient()
@imperative_base.no_grad
def minimize(self,
@@ -934,8 +1031,10 @@ class Optimizer(object):
adam.step()
adam.clear_grad()
"""
if not isinstance(self._param_groups[0], dict):
params_grads = []
for param in self._param_groups:
if param.stop_gradient:
continue
if param._grad_ivar() is not None:
@@ -944,3 +1043,70 @@ class Optimizer(object):
self._apply_optimize(
loss=None, startup_program=None, params_grads=params_grads)
else:
# optimize parameters in groups
for param_group in self._param_groups:
params_grads = defaultdict(lambda: list())
for param in param_group['params']:
if param.stop_gradient:
continue
if param._grad_ivar() is not None:
grad_var = param._grad_ivar()
params_grads['params'].append((param, grad_var))
params_grads.update(
{k: v
for k, v in param_group.items() if k != 'params'})
self._apply_optimize(
loss=None, startup_program=None, params_grads=params_grads)
def _add_param_group(self, param_group):
"""
Add a param group to parameter_list.
Args:
param_group (dict): The group of Tensors to be optimized with
different optimization options.
"""
params = param_group['params']
if isinstance(params, Parameter):
param_group['params'] = [params]
elif isinstance(params, set):
raise TypeError(
"optimizer parameters should be in ordered collections,"
"but received set, please use list instead.")
else:
param_group['params'] = list(params)
# Update optimization options for each group
for k, v in self._default_dict.items():
param_group.setdefault(k, v)
param_set = set()
for group in self._param_groups:
param_set.update(set(group['params']))
if not param_set.isdisjoint(set(param_group['params'])):
raise ValueError(
"some parameters appear in more than one parameter group")
for param in param_group['params']:
weight_decay = param_group['weight_decay']
if isinstance(weight_decay, float):
from ..fluid.regularizer import L2Decay
regularization = L2Decay(weight_decay)
else:
regularization = weight_decay
param.regularizer = regularization
param.optimize_attr['learning_rate'] = param_group['learning_rate']
self._param_groups.append(param_group)
def _update_param_group(self, parameters):
"""
Update the param group with new entry
Args:
parameters (dict): The extra group of Tensors to be optimized with
different optimization options. Only used in child class.
"""
pass
@@ -81,7 +81,10 @@ class RMSProp(Optimizer):
True may help with training, but is slightly more expensive in terms of
computation and memory. Defaults to False.
parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. And you can specify different options for \
different parameter groups such as the learning rate, weight decay, etc., \
then the parameters are a list of dict. Note that the learning_rate in parameter groups \
represents the scale of base learning_rate. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
It can be a float value as coeff of L2 regularization or \
@@ -117,6 +120,26 @@ class RMSProp(Optimizer):
rmsprop.step()
rmsprop.clear_grad()
#Note that the learning_rate of linear_2 is 0.01.
linear_1 = paddle.nn.Linear(10, 10)
linear_2 = paddle.nn.Linear(10, 10)
inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
out = linear_1(inp)
out = linear_2(out)
loss = paddle.mean(out)
rmsprop = paddle.optimizer.RMSProp(
learning_rate=0.1,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
'learning_rate': 0.1
}],
weight_decay=0.01)
out.backward()
rmsprop.step()
rmsprop.clear_grad()
""" """
_momentum_acc_str = "momentum" _momentum_acc_str = "momentum"
...@@ -160,11 +183,20 @@ class RMSProp(Optimizer): ...@@ -160,11 +183,20 @@ class RMSProp(Optimizer):
self._epsilon = epsilon self._epsilon = epsilon
self._momentum = momentum self._momentum = momentum
self._centered = centered self._centered = centered
self._default_dict = {
'rho': rho,
'epsilon': epsilon,
'momentum': momentum,
'centered': centered,
}
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block): if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.") raise TypeError("block is not instance of framework.Block.")
if isinstance(parameters, dict):
parameters = parameters.get('params')
for p in parameters: for p in parameters:
self._add_accumulator(self._momentum_acc_str, p) self._add_accumulator(self._momentum_acc_str, p)
self._add_accumulator(self._mean_square_acc_str, p) self._add_accumulator(self._mean_square_acc_str, p)
...@@ -174,6 +206,9 @@ class RMSProp(Optimizer): ...@@ -174,6 +206,9 @@ class RMSProp(Optimizer):
if not isinstance(block, framework.Block): if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.") raise TypeError("block is not instance of framework.Block.")
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
momentum_acc = self._get_accumulator(self._momentum_acc_str, momentum_acc = self._get_accumulator(self._momentum_acc_str,
param_and_grad[0]) param_and_grad[0])
mean_square_acc = self._get_accumulator(self._mean_square_acc_str, mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
...@@ -205,3 +240,13 @@ class RMSProp(Optimizer): ...@@ -205,3 +240,13 @@ class RMSProp(Optimizer):
stop_gradient=True) stop_gradient=True)
return rmsprop_op return rmsprop_op
def _update_param_group(self, parameters):
self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
self._rho = parameters.get('rho', self._default_dict['rho'])
self._momentum = parameters.get('momentum',
self._default_dict['momentum'])
self._centered = parameters.get('centered',
self._default_dict['centered'])
parameters = parameters.get('params')
return parameters
@@ -87,6 +87,8 @@ class SGD(Optimizer):
@no_grad
def _append_optimize_op(self, block, param_and_grad):
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
lr = self._create_param_lr(param_and_grad)
if framework.in_dygraph_mode():
core.ops.sgd(param_and_grad[0], lr, param_and_grad[1],
@@ -106,3 +108,7 @@ class SGD(Optimizer):
stop_gradient=True)
return sgd_op
def _update_param_group(self, parameters):
parameters = parameters.get('params')
return parameters