diff --git a/mindspore/nn/optim/adam.py b/mindspore/nn/optim/adam.py
index f0688a9b47f179eb26d522f61168db6d0a0ec1d0..b73c284aab7a7e4c74f923c8fbf0cbc079784a9e 100755
--- a/mindspore/nn/optim/adam.py
+++ b/mindspore/nn/optim/adam.py
@@ -181,8 +181,7 @@ class Adam(Optimizer):
 
             - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
               the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
-              in the value of 'order_params' but not in any group will use default learning rate and default weight
-              decay.
+              are in the value of 'order_params' should be in one of the groups.
 
         learning_rate (Union[int, float, Tensor, Iterable]): A value for the learning rate.
             When the learning_rate is Iterable or a Tensor and the dims of the Tensor is 1,
@@ -220,16 +219,14 @@ class Adam(Optimizer):
         >>>
         >>> #2) Use parameter groups and set different values
         >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-        >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
         >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
-        >>>                 {'params': bias_params, 'lr': 0.01},
+        >>>                 {'params': no_conv_params, 'lr': 0.01},
         >>>                 {'order_params': net.trainable_params()}]
         >>> opt = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0)
         >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
-        >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+        >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
         >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
-        >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
-        >>> # of default value 0.1 and a weight decay of default value 0.0.
         >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
         >>> model = Model(net, loss_fn=loss, optimizer=optim)
diff --git a/mindspore/nn/optim/lazyadam.py b/mindspore/nn/optim/lazyadam.py
index 7d53aad4882832828b6c8762401efc4dd8ad1464..4b97d2eb20b84a37359923e3da394696338d7b63 100644
--- a/mindspore/nn/optim/lazyadam.py
+++ b/mindspore/nn/optim/lazyadam.py
@@ -109,6 +109,10 @@ class LazyAdam(Optimizer):
             - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
               will be used. If not, the `weight_decay` in the API will be used.
 
+            - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
+              the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
+              are in the value of 'order_params' should be in one of the groups.
+
         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate.
             When the learning_rate is Iterable or a Tensor and the dims of the Tensor is 1, use dynamic learning rate, then
             the i-th step will
@@ -146,12 +150,13 @@ class LazyAdam(Optimizer):
         >>> #2) Use parameter groups and set different values
         >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
         >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
-        >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01},
-        >>>                 {'params': no_conv_params}]
+        >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
+        >>>                 {'params': no_conv_params, 'lr': 0.01},
+        >>>                 {'order_params': net.trainable_params()}]
         >>> opt = nn.LazyAdam(group_params, learning_rate=0.1, weight_decay=0.0)
-        >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01
-        >>> # the no_cov_params's parameters don't set learning and weight decay. So they will use a
-        >>> # learning rate of 0.1 and a weight decay of 0.0.
+        >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
+        >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+        >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
         >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
         >>> model = Model(net, loss_fn=loss, optimizer=optim)
diff --git a/mindspore/nn/optim/momentum.py b/mindspore/nn/optim/momentum.py
index ebdc5d86bff70fdf77df00457510512a5f972882..1e8ce855707ed0750642a3677f1fd7c3ef003e74 100755
--- a/mindspore/nn/optim/momentum.py
+++ b/mindspore/nn/optim/momentum.py
@@ -64,8 +64,7 @@ class Momentum(Optimizer):
 
             - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
               the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
-              in the value of 'order_params' but not in any group will use default learning rate and default weight
-              decay.
+              are in the value of 'order_params' should be in one of the groups.
 
         learning_rate (Union[int, float, Tensor, Iterable]): A value for the learning rate.
             When the learning_rate is Iterable or a Tensor and the dims of the Tensor is 1,
@@ -97,16 +96,14 @@ class Momentum(Optimizer):
         >>>
         >>> #2) Use parameter groups and set different values
         >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-        >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
         >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
-        >>>                 {'params': bias_params, 'lr': 0.01},
+        >>>                 {'params': no_conv_params, 'lr': 0.01},
         >>>                 {'order_params': net.trainable_params()}]
         >>> opt = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9, weight_decay=0.0)
         >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
-        >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+        >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
         >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
-        >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
-        >>> # of default value 0.1 and a weight decay of default value 0.0.
         >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
         >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None)
diff --git a/mindspore/nn/optim/optimizer.py b/mindspore/nn/optim/optimizer.py
index 5b13d7cfbdf35ebdc9e96272d5677f10b152565b..16f252adff8fe803e3a6dcb797773cfc78b765bf 100755
--- a/mindspore/nn/optim/optimizer.py
+++ b/mindspore/nn/optim/optimizer.py
@@ -77,8 +77,7 @@ class Optimizer(Cell):
 
             - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
               the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
-              in the value of 'order_params' but not in any group will use default learning rate and default weight
-              decay.
+              are in the value of 'order_params' should be in one of the groups.
 
         weight_decay (float): A floating point value for the weight decay. It should be equal to or greater than 0.
             If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0.
@@ -351,16 +350,18 @@ class Optimizer(Cell):
                 self.group_weight_decay.append(weight_decay_)
 
         if self.is_group_params_ordered:
-            self._order_and_adjust_group_params(ordered_parameters, learning_rate, weight_decay)
+            self._order_and_adjust_group_params(ordered_parameters)
 
-    def _order_and_adjust_group_params(self, ordered_parameters, learning_rate, weight_decay):
+    def _order_and_adjust_group_params(self, ordered_parameters):
         """
-        Order group parameter, learning rate and weight decay in group params. And assign the parameters
-        which in the value of 'order_params' but not in any group to default value.
+        Order the group parameters, learning rates and weight decays according to the value of 'order_params'.
         """
-        params_length = len(ordered_parameters)
-        ordered_learning_rate = [Parameter(learning_rate, name="lr_" + param.name) for param in ordered_parameters]
-        ordered_weight_decay = [weight_decay * self.loss_scale] * params_length
+        params_length = len(self.group_params)
+        if len(ordered_parameters) != params_length:
+            raise ValueError("The value of 'order_params' should be the same as all the group parameters.")
+
+        ordered_learning_rate = [None] * params_length
+        ordered_weight_decay = [None] * params_length
 
         params_name = [param.name for param in ordered_parameters]
         for param, lr, wd in zip(self.group_params, self.group_lr, self.group_weight_decay):
diff --git a/mindspore/nn/optim/rmsprop.py b/mindspore/nn/optim/rmsprop.py
index 05c42fb4444536d85eb8e8850c0810caf3e8c298..8e8885aff777db5fcedcc14dd5f1e6610ae8c68f 100644
--- a/mindspore/nn/optim/rmsprop.py
+++ b/mindspore/nn/optim/rmsprop.py
@@ -107,8 +107,7 @@ class RMSProp(Optimizer):
 
             - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
               the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
-              in the value of 'order_params' but not in any group will use default learning rate and default weight
-              decay.
+              are in the value of 'order_params' should be in one of the groups.
 
         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate.
             When the learning_rate is Iterable or a Tensor and the dims of the Tensor is 1,
@@ -140,16 +139,14 @@ class RMSProp(Optimizer):
         >>>
         >>> #2) Use parameter groups and set different values
         >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-        >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
         >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
-        >>>                 {'params': bias_params, 'lr': 0.01},
+        >>>                 {'params': no_conv_params, 'lr': 0.01},
         >>>                 {'order_params': net.trainable_params()}]
         >>> opt = nn.RMSProp(group_params, learning_rate=0.1, weight_decay=0.0)
         >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
-        >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+        >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
         >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
-        >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
-        >>> # of default value 0.1 and a weight decay of default value 0.0.
         >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
         >>> model = Model(net, loss_fn=loss, optimizer=optim)
diff --git a/mindspore/nn/optim/sgd.py b/mindspore/nn/optim/sgd.py
index d2680a38e549ed265be467c786fec95c81c2c817..382f0956272860fca40eae427777a26d5bbc4433 100755
--- a/mindspore/nn/optim/sgd.py
+++ b/mindspore/nn/optim/sgd.py
@@ -64,8 +64,7 @@ class SGD(Optimizer):
 
             - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
               the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
-              in the value of 'order_params' but not in any group will use default learning rate and default weight
-              decay.
+              are in the value of 'order_params' should be in one of the groups.
 
         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate.
             When the learning_rate is Iterable or a Tensor and the dims of the Tensor is 1,
@@ -98,16 +97,14 @@ class SGD(Optimizer):
         >>>
         >>> #2) Use parameter groups and set different values
         >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-        >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
         >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
-        >>>                 {'params': bias_params, 'lr': 0.01},
+        >>>                 {'params': no_conv_params, 'lr': 0.01},
         >>>                 {'order_params': net.trainable_params()}]
         >>> opt = nn.SGD(group_params, learning_rate=0.1, weight_decay=0.0)
         >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
-        >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+        >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
         >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
-        >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
-        >>> # of default value 0.1 and a weight decay of default value 0.0.
         >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
         >>> model = Model(net, loss_fn=loss, optimizer=optim)
diff --git a/tests/ut/python/optimizer/test_optimizer_with_parameter_groups.py b/tests/ut/python/optimizer/test_optimizer_with_parameter_groups.py
index 05e58013fa6c35181b5128436cf2f406b255ab2c..0aef22284dfa0cf88444fc226fbf36737cd6496c 100644
--- a/tests/ut/python/optimizer/test_optimizer_with_parameter_groups.py
+++ b/tests/ut/python/optimizer/test_optimizer_with_parameter_groups.py
@@ -250,8 +250,9 @@ def test_get_lr_parameter_with_order_group():
     net = LeNet5()
     conv_lr = 0.1
     conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
     group_params = [{'params': conv_params, 'lr': conv_lr},
-                    {'order_params': net.trainable_params()}]
+                    {'params': no_conv_params}]
     opt = SGD(group_params)
     assert opt.is_group_lr is True
     for param in opt.parameters:
@@ -278,65 +279,19 @@ def test_get_lr_parameter_with_no_group():
         opt.get_lr_parameter(params_error)
 
 
-def test_order_params_lr():
-    net = LeNet5()
-    conv_lr = 0.01
-    default_lr = 0.1
-    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-    group_params = [{'params': conv_params, 'lr': conv_lr},
-                    {'order_params': net.trainable_params()}]
-    opt = SGD(group_params, learning_rate=default_lr)
-    assert opt.is_group is True
-    assert opt.is_group_lr is True
-    assert opt.is_group_params_ordered is True
-    for lr, param, order_param in zip(opt.learning_rate, opt.parameters, net.trainable_params()):
-        if param in conv_params:
-            assert np.all(lr.data.asnumpy() == Tensor(conv_lr, mstype.float32).asnumpy())
-        else:
-            assert np.all(lr.data.asnumpy() == Tensor(default_lr, mstype.float32).asnumpy())
-
-        assert param.name == order_param.name
-        assert lr.name == 'lr_' + param.name
-
-
-def test_order_params_weight_decay():
-    net = LeNet5()
-    conv_weight_decay = 0.01
-    default_wd = 0.0
-    default_lr = 0.1
-    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-    group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay},
-                    {'order_params': net.trainable_params()}]
-    opt = SGD(group_params, learning_rate=default_lr, weight_decay=default_wd)
-    assert opt.is_group is True
-    assert opt.is_group_lr is False
-    assert opt.is_group_params_ordered is True
-    assert opt.learning_rate.name == "learning_rate"
-    assert np.all(opt.learning_rate.data.asnumpy() == Tensor(default_lr, mstype.float32).asnumpy())
-    for weight_decay, decay_flags, param, order_param in zip(
-            opt.weight_decay, opt.decay_flags, opt.parameters, net.trainable_params()):
-        if param in conv_params:
-            assert weight_decay == conv_weight_decay
-            assert decay_flags is True
-        else:
-            assert weight_decay == default_wd
-            assert decay_flags is False
-        assert param.name == order_param.name
-
-
-def test_order_params_all_1():
+def test_order_params_1():
     net = LeNet5()
     conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
     bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
     group_params = [{'params': conv_params, 'weight_decay': 0.01},
                     {'params': bias_params, 'lr': 0.01},
-                    {'order_params': net.trainable_params()}]
+                    {'order_params': bias_params+conv_params}]
     opt = SGD(group_params, learning_rate=0.1, weight_decay=0.0)
     assert opt.is_group is True
     assert opt.is_group_lr is True
     assert opt.is_group_params_ordered is True
     for weight_decay, decay_flags, lr, param, order_param in zip(
-            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, net.trainable_params()):
+            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, bias_params+conv_params):
         if param in conv_params:
             assert np.all(lr.data.asnumpy() == Tensor(0.1, mstype.float32).asnumpy())
             assert weight_decay == 0.01
@@ -354,7 +309,7 @@ def test_order_params_all_1():
         assert lr.name == 'lr_' + param.name
 
 
-def test_order_params_all_2():
+def test_order_params_2():
     net = LeNet5()
     conv_weight_decay = 0.01
     fc1_lr = (0.5, 0.4, 0.3)
@@ -364,13 +319,13 @@ def test_order_params_all_2():
     fc1_params = list(filter(lambda x: 'fc1' in x.name, net.trainable_params()))
     group_params = [{'params': fc1_params, 'lr': fc1_lr},
                     {'params': conv_params, 'weight_decay': conv_weight_decay},
-                    {'order_params': net.trainable_params()}]
+                    {'order_params': fc1_params+conv_params}]
     opt = SGD(group_params, learning_rate=default_lr, weight_decay=default_wd)
     assert opt.is_group is True
     assert opt.is_group_lr is True
     assert opt.is_group_params_ordered is True
     for weight_decay, decay_flags, lr, param, order_param in zip(
-            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, net.trainable_params()):
+            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, fc1_params+conv_params):
         if param in conv_params:
             assert np.all(lr.data.asnumpy() == Tensor(np.array([default_lr] * 3), mstype.float32).asnumpy())
             assert weight_decay == conv_weight_decay
@@ -388,7 +343,7 @@ def test_get_order_params_with_not_include():
-def test_get_order_params_with_not_include():
+def test_get_order_params_with_not_same():
     net = LeNet5()
     conv_weight_decay = 0.8
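
Below is a minimal usage sketch of the grouped-parameter API after this change, mirroring the updated docstring examples. It is not part of the patch itself; it assumes a LeNet5-like network whose trainable parameter names contain 'conv', and nn.Momentum stands in for any of the optimizers touched here.

import mindspore.nn as nn

net = LeNet5()  # hypothetical network; any Cell with named trainable parameters works

conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))

# Every parameter listed in 'order_params' must also appear in one of the groups.
# With this patch, parameters left out of all groups no longer fall back to the
# default learning rate and weight decay; a mismatched 'order_params' raises ValueError.
group_params = [{'params': conv_params, 'weight_decay': 0.01},
                {'params': no_conv_params, 'lr': 0.01},
                {'order_params': net.trainable_params()}]

opt = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9, weight_decay=0.0)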