Commit d04f3b9a authored by mindspore-ci-bot, committed by Gitee

!2748 Change order param only equal to group param

Merge pull request !2748 from ghzl/change-order-params-only-equal-to-group-param
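In effect, the value of 'order_params' may now only reorder the parameters that already appear in the groups; parameters listed there but missing from every group no longer fall back to the default learning rate and weight decay, and a ValueError is raised instead. Below is a minimal sketch of the new contract; it assumes the LeNet5 network used by the existing tests is importable and is not part of the commit itself.

# Minimal sketch of the constraint this commit introduces (illustrative only).
from mindspore import nn

net = LeNet5()  # assumed model with both 'conv' and non-'conv' parameters
conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))

# Accepted: every trainable parameter belongs to a group, and 'order_params'
# lists exactly those parameters, so it only controls their order.
group_params = [{'params': conv_params, 'weight_decay': 0.01},
                {'params': no_conv_params, 'lr': 0.01},
                {'order_params': net.trainable_params()}]
opt = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9, weight_decay=0.0)

# Rejected after this change: 'order_params' names parameters that are in no
# group (only conv_params is grouped). Previously they silently used the default
# learning rate and weight decay; now the optimizer raises a ValueError.
bad_group_params = [{'params': conv_params, 'weight_decay': 0.01},
                    {'order_params': net.trainable_params()}]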
......@@ -181,8 +181,7 @@ class Adam(Optimizer):
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
- in the value of 'order_params' but not in any group will use default learning rate and default weight
- decay.
+ in the value of 'order_params' should be in one of group parameters.
learning_rate (Union[int, float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
Iterable or a Tensor and the dims of the Tensor is 1,
......@@ -220,16 +219,14 @@ class Adam(Optimizer):
>>>
>>> #2) Use parameter groups and set different values
>>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
- >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+ >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
>>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
- >>> {'params': bias_params, 'lr': 0.01},
+ >>> {'params': no_conv_params, 'lr': 0.01},
>>> {'order_params': net.trainable_params()}]
>>> opt = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0)
>>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
- >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+ >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
>>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
- >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
- >>> # of default value 0.1 and a weight decay of default value 0.0.
>>>
>>> loss = nn.SoftmaxCrossEntropyWithLogits()
>>> model = Model(net, loss_fn=loss, optimizer=optim)
......
......@@ -109,6 +109,10 @@ class LazyAdam(Optimizer):
- weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
will be used. If not, the `weight_decay` in the API will be used.
+ - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
+ the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
+ in the value of 'order_params' should be in one of group parameters.
learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
Iterable or a Tensor and the dims of the Tensor is 1,
use dynamic learning rate, then the i-th step will
......@@ -146,12 +150,13 @@ class LazyAdam(Optimizer):
>>> #2) Use parameter groups and set different values
>>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
>>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
- >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01},
- >>> {'params': no_conv_params}]
+ >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
+ >>> {'params': no_conv_params, 'lr': 0.01},
+ >>> {'order_params': net.trainable_params()}]
>>> opt = nn.LazyAdam(group_params, learning_rate=0.1, weight_decay=0.0)
- >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01
- >>> # the no_cov_params's parameters don't set learning and weight decay. So they will use a
- >>> # learning rate of 0.1 and a weight decay of 0.0.
+ >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
+ >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+ >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
>>>
>>> loss = nn.SoftmaxCrossEntropyWithLogits()
>>> model = Model(net, loss_fn=loss, optimizer=optim)
......
......@@ -64,8 +64,7 @@ class Momentum(Optimizer):
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
- in the value of 'order_params' but not in any group will use default learning rate and default weight
- decay.
+ in the value of 'order_params' should be in one of group parameters.
learning_rate (Union[int, float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
Iterable or a Tensor and the dims of the Tensor is 1,
......@@ -97,16 +96,14 @@ class Momentum(Optimizer):
>>>
>>> #2) Use parameter groups and set different values
>>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
- >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+ >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
>>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
- >>> {'params': bias_params, 'lr': 0.01},
+ >>> {'params': no_conv_params, 'lr': 0.01},
>>> {'order_params': net.trainable_params()}]
>>> opt = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9, weight_decay=0.0)
>>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
- >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+ >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
>>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
- >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
- >>> # of default value 0.1 and a weight decay of default value 0.0.
>>>
>>> loss = nn.SoftmaxCrossEntropyWithLogits()
>>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None)
......
......@@ -77,8 +77,7 @@ class Optimizer(Cell):
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
- in the value of 'order_params' but not in any group will use default learning rate and default weight
- decay.
+ in the value of 'order_params' should be in one of group parameters.
weight_decay (float): A floating point value for the weight decay. It should be equal to or greater than 0.
If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0.
......@@ -351,16 +350,18 @@ class Optimizer(Cell):
self.group_weight_decay.append(weight_decay_)
if self.is_group_params_ordered:
- self._order_and_adjust_group_params(ordered_parameters, learning_rate, weight_decay)
+ self._order_and_adjust_group_params(ordered_parameters)
- def _order_and_adjust_group_params(self, ordered_parameters, learning_rate, weight_decay):
+ def _order_and_adjust_group_params(self, ordered_parameters):
"""
- Order group parameter, learning rate and weight decay in group params. And assign the parameters
- which in the value of 'order_params' but not in any group to default value.
+ Order group parameter, learning rate and weight decay in group params.
"""
- params_length = len(ordered_parameters)
- ordered_learning_rate = [Parameter(learning_rate, name="lr_" + param.name) for param in ordered_parameters]
- ordered_weight_decay = [weight_decay * self.loss_scale] * params_length
+ params_length = len(self.group_params)
+ if len(ordered_parameters) != len(self.group_params):
+ raise ValueError(f"The value of 'order_params' should be same with all group parameters.")
+ ordered_learning_rate = [None] * params_length
+ ordered_weight_decay = [None] * params_length
params_name = [param.name for param in ordered_parameters]
for param, lr, wd in zip(self.group_params, self.group_lr, self.group_weight_decay):
......
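The reworked helper above boils down to the following plain-Python sketch. It is a simplified illustration, not the actual MindSpore implementation: Param stands in for mindspore's Parameter, and the function only mirrors the idea that each grouped parameter is re-slotted into the position its name holds in 'order_params', with a length mismatch now raising ValueError instead of filling the missing entries with defaults.

from collections import namedtuple

Param = namedtuple("Param", ["name"])  # stand-in for a framework Parameter

def order_group_settings(ordered_names, params, lrs, weight_decays):
    """Sketch: rebuild per-parameter lr/weight-decay lists in 'order_params' order."""
    if len(ordered_names) != len(params):
        raise ValueError("The value of 'order_params' should be same with all group parameters.")
    ordered_params = [None] * len(ordered_names)
    ordered_lr = [None] * len(ordered_names)
    ordered_wd = [None] * len(ordered_names)
    for param, lr, wd in zip(params, lrs, weight_decays):
        index = ordered_names.index(param.name)  # position dictated by 'order_params'
        ordered_params[index] = param
        ordered_lr[index] = lr
        ordered_wd[index] = wd
    return ordered_params, ordered_lr, ordered_wd

# Example: two grouped parameters re-slotted to match the requested order.
conv_w, fc_w = Param("conv.weight"), Param("fc.weight")
print(order_group_settings(["fc.weight", "conv.weight"], [conv_w, fc_w], [0.01, 0.1], [0.01, 0.0]))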
......@@ -107,8 +107,7 @@ class RMSProp(Optimizer):
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
- in the value of 'order_params' but not in any group will use default learning rate and default weight
- decay.
+ in the value of 'order_params' should be in one of group parameters.
learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
Iterable or a Tensor and the dims of the Tensor is 1,
......@@ -140,16 +139,14 @@ class RMSProp(Optimizer):
>>>
>>> #2) Use parameter groups and set different values
>>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
- >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+ >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
>>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
- >>> {'params': bias_params, 'lr': 0.01},
+ >>> {'params': no_conv_params, 'lr': 0.01},
>>> {'order_params': net.trainable_params()}]
>>> opt = nn.RMSProp(group_params, learning_rate=0.1, weight_decay=0.0)
>>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
- >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+ >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
>>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
- >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
- >>> # of default value 0.1 and a weight decay of default value 0.0.
>>>
>>> loss = nn.SoftmaxCrossEntropyWithLogits()
>>> model = Model(net, loss_fn=loss, optimizer=optim)
......
......@@ -64,8 +64,7 @@ class SGD(Optimizer):
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
- in the value of 'order_params' but not in any group will use default learning rate and default weight
- decay.
+ in the value of 'order_params' should be in one of group parameters.
learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
Iterable or a Tensor and the dims of the Tensor is 1,
......@@ -98,16 +97,14 @@ class SGD(Optimizer):
>>>
>>> #2) Use parameter groups and set different values
>>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
- >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+ >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
>>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
- >>> {'params': bias_params, 'lr': 0.01},
+ >>> {'params': no_conv_params, 'lr': 0.01},
>>> {'order_params': net.trainable_params()}]
>>> opt = nn.SGD(group_params, learning_rate=0.1, weight_decay=0.0)
>>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
- >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+ >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
>>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
- >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
- >>> # of default value 0.1 and a weight decay of default value 0.0.
>>>
>>> loss = nn.SoftmaxCrossEntropyWithLogits()
>>> model = Model(net, loss_fn=loss, optimizer=optim)
......
......@@ -250,8 +250,9 @@ def test_get_lr_parameter_with_order_group():
net = LeNet5()
conv_lr = 0.1
conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+ no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
group_params = [{'params': conv_params, 'lr': conv_lr},
- {'order_params': net.trainable_params()}]
+ {'params': no_conv_params}]
opt = SGD(group_params)
assert opt.is_group_lr is True
for param in opt.parameters:
......@@ -278,65 +279,19 @@ def test_get_lr_parameter_with_no_group():
opt.get_lr_parameter(params_error)
- def test_order_params_lr():
- net = LeNet5()
- conv_lr = 0.01
- default_lr = 0.1
- conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
- group_params = [{'params': conv_params, 'lr': conv_lr},
- {'order_params': net.trainable_params()}]
- opt = SGD(group_params, learning_rate=default_lr)
- assert opt.is_group is True
- assert opt.is_group_lr is True
- assert opt.is_group_params_ordered is True
- for lr, param, order_param in zip(opt.learning_rate, opt.parameters, net.trainable_params()):
- if param in conv_params:
- assert np.all(lr.data.asnumpy() == Tensor(conv_lr, mstype.float32).asnumpy())
- else:
- assert np.all(lr.data.asnumpy() == Tensor(default_lr, mstype.float32).asnumpy())
- assert param.name == order_param.name
- assert lr.name == 'lr_' + param.name
- def test_order_params_weight_decay():
- net = LeNet5()
- conv_weight_decay = 0.01
- default_wd = 0.0
- default_lr = 0.1
- conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
- group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay},
- {'order_params': net.trainable_params()}]
- opt = SGD(group_params, learning_rate=default_lr, weight_decay=default_wd)
- assert opt.is_group is True
- assert opt.is_group_lr is False
- assert opt.is_group_params_ordered is True
- assert opt.learning_rate.name == "learning_rate"
- assert np.all(opt.learning_rate.data.asnumpy() == Tensor(default_lr, mstype.float32).asnumpy())
- for weight_decay, decay_flags, param, order_param in zip(
- opt.weight_decay, opt.decay_flags, opt.parameters, net.trainable_params()):
- if param in conv_params:
- assert weight_decay == conv_weight_decay
- assert decay_flags is True
- else:
- assert weight_decay == default_wd
- assert decay_flags is False
- assert param.name == order_param.name
- def test_order_params_all_1():
+ def test_order_params_1():
net = LeNet5()
conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
group_params = [{'params': conv_params, 'weight_decay': 0.01},
{'params': bias_params, 'lr': 0.01},
- {'order_params': net.trainable_params()}]
+ {'order_params': bias_params+conv_params}]
opt = SGD(group_params, learning_rate=0.1, weight_decay=0.0)
assert opt.is_group is True
assert opt.is_group_lr is True
assert opt.is_group_params_ordered is True
for weight_decay, decay_flags, lr, param, order_param in zip(
- opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, net.trainable_params()):
+ opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, bias_params+conv_params):
if param in conv_params:
assert np.all(lr.data.asnumpy() == Tensor(0.1, mstype.float32).asnumpy())
assert weight_decay == 0.01
......@@ -354,7 +309,7 @@ def test_order_params_all_1():
assert lr.name == 'lr_' + param.name
- def test_order_params_all_2():
+ def test_order_params_2():
net = LeNet5()
conv_weight_decay = 0.01
fc1_lr = (0.5, 0.4, 0.3)
......@@ -364,13 +319,13 @@ def test_order_params_all_2():
fc1_params = list(filter(lambda x: 'fc1' in x.name, net.trainable_params()))
group_params = [{'params': fc1_params, 'lr': fc1_lr},
{'params': conv_params, 'weight_decay': conv_weight_decay},
- {'order_params': net.trainable_params()}]
+ {'order_params': fc1_params+conv_params}]
opt = SGD(group_params, learning_rate=default_lr, weight_decay=default_wd)
assert opt.is_group is True
assert opt.is_group_lr is True
assert opt.is_group_params_ordered is True
for weight_decay, decay_flags, lr, param, order_param in zip(
- opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, net.trainable_params()):
+ opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, fc1_params+conv_params):
if param in conv_params:
assert np.all(lr.data.asnumpy() == Tensor(np.array([default_lr] * 3), mstype.float32).asnumpy())
assert weight_decay == conv_weight_decay
......@@ -388,7 +343,7 @@ def test_order_params_all_2():
assert lr.name == 'lr_' + param.name
- def test_get_order_params_with_not_include():
+ def test_get_order_params_with_not_same():
net = LeNet5()
conv_weight_decay = 0.8
......
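The renamed test presumably exercises the mismatch case, but its body is not shown in this diff, so the following is only a hypothetical illustration of the error path, assuming the LeNet5 model and SGD optimizer used by the surrounding tests.

# Hypothetical illustration only; the real body of
# test_get_order_params_with_not_same is not shown in this diff.
import pytest

def check_order_params_must_match_groups():
    net = LeNet5()  # assumed, as in the surrounding tests
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    group_params = [{'params': conv_params, 'weight_decay': 0.8},
                    {'order_params': net.trainable_params()}]  # covers more parameters than the groups do
    with pytest.raises(ValueError):
        SGD(group_params)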