Commit eaaacfea authored by mindspore-ci-bot, committed by Gitee

!1941 Add order function in group params in r0.3

Merge pull request !1941 from ghzl/v3-add-oder-parameters-in-group-functions
@@ -144,10 +144,12 @@ class Adam(Optimizer):
    value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be
    applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.

    To improve the performance of parameter groups, a customized order of parameters is supported.

    Args:
        params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
            the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
            "lr", "weight_decay" and "order_params" are the keys that can be parsed.

            - params: Required. The value should be a list of `Parameter`.
@@ -157,6 +159,11 @@ class Adam(Optimizer):
            - weight_decay: Optional. If "weight_decay" is in the keys, the corresponding value of weight decay
              will be used. If not, the `weight_decay` in the API will be used.

            - order_params: Optional. If "order_params" is in the keys, the value should be the ordered list of
              parameters, and the optimizer will follow that order. The `dict` should contain no other keys.
              Parameters that are in the value of 'order_params' but not in any group will use the default
              learning rate and the default weight decay.

        learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
            Iterable or a Tensor and the dims of the Tensor is 1,
            use dynamic learning rate, then the i-th step will
@@ -193,13 +200,16 @@ class Adam(Optimizer):
        >>>
        >>> #2) Use parameter groups and set different values
        >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
        >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
        >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
        >>>                 {'params': bias_params, 'lr': 0.01},
        >>>                 {'order_params': net.trainable_params()}]
        >>> opt = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0)
        >>> # The conv_params will use the default learning rate of 0.1 and a weight decay of 0.01.
        >>> # The bias_params will use a learning rate of 0.01 and the default weight decay of 0.0.
        >>> # The optimizer will iterate over the parameters in the order given by the value of 'order_params'.
        >>> # Parameters that are in the value of 'order_params' but not in any group will use the default
        >>> # learning rate of 0.1 and the default weight decay of 0.0.
        >>>
        >>> loss = nn.SoftmaxCrossEntropyWithLogits()
        >>> model = Model(net, loss_fn=loss, optimizer=optim)
...
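The ordered-group behavior documented above can be checked directly on the optimizer instance. The following is a minimal sketch, assuming a constructed network `net` with 'conv' and 'bias' parameters as in the docstring example; it relies on `opt.parameters` exposing the flattened parameter list, as the tests later in this change do.

import mindspore.nn as nn

# Sketch only: `net` is assumed to be an already constructed MindSpore network.
conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
group_params = [{'params': conv_params, 'weight_decay': 0.01},
                {'params': bias_params, 'lr': 0.01},
                {'order_params': net.trainable_params()}]
opt = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0)
# The flattened parameter list is expected to follow the 'order_params' order.
assert [p.name for p in opt.parameters] == [p.name for p in net.trainable_params()]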
@@ -45,10 +45,12 @@ class Momentum(Optimizer):
    value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be
    applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.

    To improve the performance of parameter groups, a customized order of parameters is supported.

    Args:
        params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
            the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
            "lr", "weight_decay" and "order_params" are the keys that can be parsed.

            - params: Required. The value should be a list of `Parameter`.
@@ -58,6 +60,11 @@ class Momentum(Optimizer):
            - weight_decay: Optional. If "weight_decay" is in the keys, the corresponding value of weight decay
              will be used. If not, the `weight_decay` in the API will be used.

            - order_params: Optional. If "order_params" is in the keys, the value should be the ordered list of
              parameters, and the optimizer will follow that order. The `dict` should contain no other keys.
              Parameters that are in the value of 'order_params' but not in any group will use the default
              learning rate and the default weight decay.

        learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
            Iterable or a Tensor and the dims of the Tensor is 1,
            use dynamic learning rate, then the i-th step will
@@ -86,13 +93,16 @@ class Momentum(Optimizer):
        >>>
        >>> #2) Use parameter groups and set different values
        >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
        >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
        >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
        >>>                 {'params': bias_params, 'lr': 0.01},
        >>>                 {'order_params': net.trainable_params()}]
        >>> opt = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9, weight_decay=0.0)
        >>> # The conv_params will use the default learning rate of 0.1 and a weight decay of 0.01.
        >>> # The bias_params will use a learning rate of 0.01 and the default weight decay of 0.0.
        >>> # The optimizer will iterate over the parameters in the order given by the value of 'order_params'.
        >>> # Parameters that are in the value of 'order_params' but not in any group will use the default
        >>> # learning rate of 0.1 and the default weight decay of 0.0.
        >>>
        >>> loss = nn.SoftmaxCrossEntropyWithLogits()
        >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None)
...
@@ -48,11 +48,13 @@ class Optimizer(Cell):
    value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be
    applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.

    To improve the performance of parameter groups, a customized order of parameters is supported.

    Args:
        learning_rate (float): A floating point value for the learning rate. Should be greater than 0.
        parameters (Union[list[Parameter], list[dict]]): When the `parameters` is a list of `Parameter` which will be
            updated, the element in `parameters` should be class `Parameter`. When the `parameters` is a list of `dict`,
            the "params", "lr", "weight_decay" and "order_params" are the keys that can be parsed.

            - params: Required. The value should be a list of `Parameter`.
@@ -62,6 +64,11 @@ class Optimizer(Cell):
            - weight_decay: Optional. If "weight_decay" is in the keys, the corresponding value of weight decay
              will be used. If not, the `weight_decay` in the API will be used.

            - order_params: Optional. If "order_params" is in the keys, the value should be the ordered list of
              parameters, and the optimizer will follow that order. The `dict` should contain no other keys.
              Parameters that are in the value of 'order_params' but not in any group will use the default
              learning rate and the default weight decay.

        weight_decay (float): A floating point value for the weight decay. It should be equal to or greater than 0.
            If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0.
        loss_scale (float): A floating point value for the loss scale. It should be greater than 0. If the
@@ -95,6 +102,7 @@ class Optimizer(Cell):
        self.is_group = False
        self.is_group_lr = False
        self.is_group_params_ordered = False
        self.loss_scale = loss_scale
        if isinstance(learning_rate, float):
            self.dynamic_lr = False
@@ -200,9 +208,8 @@ class Optimizer(Cell):
            raise TypeError("Learning rate should be float, Tensor or Iterable.")
        return lr

    def _parse_group_params(self, parameters, learning_rate):
        """Parse group params."""
        if self.dynamic_lr:
            dynamic_lr_length = learning_rate.size()
        else:
@@ -210,6 +217,15 @@ class Optimizer(Cell):
        for group_param in parameters:
            lr_length = dynamic_lr_length
            if 'order_params' in group_param.keys():
                if len(group_param.keys()) > 1:
                    raise ValueError("The order params dict in group parameters should "
                                     "only include the 'order_params' key.")
                if not isinstance(group_param['order_params'], Iterable):
                    raise TypeError("The value of 'order_params' should be an Iterable type.")
                self.is_group_params_ordered = True
                continue
            if 'lr' in group_param.keys():
                self.is_group_lr = True
                self._get_single_lr(group_param['lr'])
@@ -219,10 +235,20 @@ class Optimizer(Cell):
            elif isinstance(group_param['lr'], Tensor):
                lr_length = group_param['lr'].size()
                self.dynamic_lr = True

            if dynamic_lr_length not in (lr_length, 0):
                raise ValueError("The dynamic learning rate in group should be the same size.")

            if not group_param['params']:
                raise ValueError("Optimizer got an empty group parameter list.")

            dynamic_lr_length = lr_length
        self.dynamic_lr_length = dynamic_lr_length
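For orientation, here is a hedged usage sketch of the parsing rules above, assuming a constructed network `net` (for example LeNet5, as in the tests further down): the 'order_params' entry must be the only key in its dict and its value must be an iterable of parameters, otherwise the checks above raise ValueError or TypeError respectively.

import mindspore.nn as nn

# Sketch only: `net` is assumed to be an already constructed MindSpore network.
group_params = [{'params': [p for p in net.trainable_params() if 'conv' in p.name], 'lr': 0.01},
                {'params': [p for p in net.trainable_params() if 'conv' not in p.name]},
                {'order_params': net.trainable_params()}]   # a lone key with an iterable value
opt = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9)
assert opt.is_group_params_ordered is True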
    def _init_group_params(self, parameters, learning_rate, weight_decay):
        """Init learning rate or weight decay in group params."""
        origin_dynamic_lr = self.dynamic_lr
        self._parse_group_params(parameters, learning_rate)
        if self.dynamic_lr and not origin_dynamic_lr:
            self.gather = P.GatherV2()
            self.assignadd = P.AssignAdd()
@@ -230,20 +256,20 @@ class Optimizer(Cell):
        params_store = []
        for group_param in parameters:
            if 'order_params' in group_param.keys():
                ordered_parameters = group_param['order_params']
                continue

            self.group_params += group_param['params']
            if 'lr' in group_param.keys():
                params_dynamic_lr = isinstance(group_param['lr'], (Iterable, Tensor))
                if self.dynamic_lr and not params_dynamic_lr:
                    lr = Tensor(np.array([group_param['lr']] * self.dynamic_lr_length).astype(np.float32))
                else:
                    lr = self._get_single_lr(group_param['lr'])
            else:
                if self.dynamic_lr and not origin_dynamic_lr:
                    lr = Tensor(np.array([self.scalar_lr] * self.dynamic_lr_length).astype(np.float32))
                else:
                    lr = learning_rate
@@ -263,10 +289,33 @@ class Optimizer(Cell):
                validator.check_value_type("parameter", param, [Parameter], self.cls_name)
                if param.name in params_store:
                    raise RuntimeError(f"The {param.name} parameter has appeared in parameter groups.")

                params_store.append(param.name)
                self.group_lr.append(Parameter(lr, name="lr_" + param.name))
                self.group_weight_decay.append(weight_decay_)

        if self.is_group_params_ordered:
            self._order_and_adjust_group_params(ordered_parameters, learning_rate, weight_decay)
    def _order_and_adjust_group_params(self, ordered_parameters, learning_rate, weight_decay):
        """
        Order the group parameters, learning rates and weight decays, and assign the parameters that are
        in the value of 'order_params' but not in any group to the default values.
        """
        params_length = len(ordered_parameters)
        ordered_learning_rate = [Parameter(learning_rate, name="lr_" + param.name) for param in ordered_parameters]
        ordered_weight_decay = [weight_decay * self.loss_scale] * params_length
        params_name = [param.name for param in ordered_parameters]
        for param, lr, wd in zip(self.group_params, self.group_lr, self.group_weight_decay):
            index = params_name.index(param.name)
            ordered_learning_rate[index] = lr
            ordered_weight_decay[index] = wd

        self.group_params = list(ordered_parameters)
        self.group_lr = ordered_learning_rate
        self.group_weight_decay = ordered_weight_decay
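The reordering above amounts to an index mapping from grouped settings into the order given by 'order_params'. A standalone sketch of the same idea with plain Python values (hypothetical parameter names, loss scaling ignored), not the MindSpore implementation itself:

# Hypothetical parameter names in the order given by 'order_params'.
ordered_names = ['conv1.weight', 'conv2.weight', 'fc1.weight', 'fc1.bias']
default_lr, default_wd = 0.1, 0.0

# Grouped parameters carry their own lr/weight decay; ungrouped ones are absent here.
grouped = {'conv1.weight': (0.01, 0.01), 'conv2.weight': (0.01, 0.01)}  # name -> (lr, wd)

# Start every slot at the defaults, then overwrite the slots that belong to a group.
ordered_lr = [default_lr] * len(ordered_names)
ordered_wd = [default_wd] * len(ordered_names)
for name, (lr, wd) in grouped.items():
    index = ordered_names.index(name)
    ordered_lr[index] = lr
    ordered_wd[index] = wd

assert ordered_lr == [0.01, 0.01, 0.1, 0.1]
assert ordered_wd == [0.01, 0.01, 0.0, 0.0]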
    def get_lr(self):
        """
        Get the learning rate of current step.
...
@@ -51,6 +51,8 @@ class RMSProp(Optimizer):
    value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be
    applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.

    To improve the performance of parameter groups, a customized order of parameters is supported.

    Update `params` according to the RMSProp algorithm.

    The equation is as follows:
@@ -93,7 +95,7 @@ class RMSProp(Optimizer):
    Args:
        params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
            the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
            "lr", "weight_decay" and "order_params" are the keys that can be parsed.

            - params: Required. The value should be a list of `Parameter`.
@@ -103,6 +105,11 @@ class RMSProp(Optimizer):
            - weight_decay: Optional. If "weight_decay" is in the keys, the corresponding value of weight decay
              will be used. If not, the `weight_decay` in the API will be used.

            - order_params: Optional. If "order_params" is in the keys, the value should be the ordered list of
              parameters, and the optimizer will follow that order. The `dict` should contain no other keys.
              Parameters that are in the value of 'order_params' but not in any group will use the default
              learning rate and the default weight decay.

        learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
            Iterable or a Tensor and the dims of the Tensor is 1,
            use dynamic learning rate, then the i-th step will
@@ -133,13 +140,16 @@ class RMSProp(Optimizer):
        >>>
        >>> #2) Use parameter groups and set different values
        >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
        >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
        >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
        >>>                 {'params': bias_params, 'lr': 0.01},
        >>>                 {'order_params': net.trainable_params()}]
        >>> opt = nn.RMSProp(group_params, learning_rate=0.1, weight_decay=0.0)
        >>> # The conv_params will use the default learning rate of 0.1 and a weight decay of 0.01.
        >>> # The bias_params will use a learning rate of 0.01 and the default weight decay of 0.0.
        >>> # The optimizer will iterate over the parameters in the order given by the value of 'order_params'.
        >>> # Parameters that are in the value of 'order_params' but not in any group will use the default
        >>> # learning rate of 0.1 and the default weight decay of 0.0.
        >>>
        >>> loss = nn.SoftmaxCrossEntropyWithLogits()
        >>> model = Model(net, loss_fn=loss, optimizer=optim)
...
@@ -47,10 +47,12 @@ class SGD(Optimizer):
    value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be
    applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.

    To improve the performance of parameter groups, a customized order of parameters is supported.

    Args:
        params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
            the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
            "lr", "weight_decay" and "order_params" are the keys that can be parsed.

            - params: Required. The value should be a list of `Parameter`.
@@ -60,6 +62,11 @@ class SGD(Optimizer):
            - weight_decay: Optional. If "weight_decay" is in the keys, the corresponding value of weight decay
              will be used. If not, the `weight_decay` in the API will be used.

            - order_params: Optional. If "order_params" is in the keys, the value should be the ordered list of
              parameters, and the optimizer will follow that order. The `dict` should contain no other keys.
              Parameters that are in the value of 'order_params' but not in any group will use the default
              learning rate and the default weight decay.

        learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
            Iterable or a Tensor and the dims of the Tensor is 1,
            use dynamic learning rate, then the i-th step will
@@ -90,13 +97,16 @@ class SGD(Optimizer):
        >>>
        >>> #2) Use parameter groups and set different values
        >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
        >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
        >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
        >>>                 {'params': bias_params, 'lr': 0.01},
        >>>                 {'order_params': net.trainable_params()}]
        >>> opt = nn.SGD(group_params, learning_rate=0.1, weight_decay=0.0)
        >>> # The conv_params will use the default learning rate of 0.1 and a weight decay of 0.01.
        >>> # The bias_params will use a learning rate of 0.01 and the default weight decay of 0.0.
        >>> # The optimizer will iterate over the parameters in the order given by the value of 'order_params'.
        >>> # Parameters that are in the value of 'order_params' but not in any group will use the default
        >>> # learning rate of 0.1 and the default weight decay of 0.0.
        >>>
        >>> loss = nn.SoftmaxCrossEntropyWithLogits()
        >>> model = Model(net, loss_fn=loss, optimizer=optim)
...
@@ -13,6 +13,8 @@
# limitations under the License.
# ============================================================================
"""Dataset help for minddata dataset"""
import math

from mindspore._checkparam import check_bool
from .. import context
from .parallel_utils import ParallelMode
@@ -104,10 +106,10 @@ class _DatasetIter:
        loop_count = 1
        if hasattr(dataset, '__loop_size__'):
            loop_size = dataset.__loop_size__
            if loop_size <= dataset.get_dataset_size() and dataset.get_dataset_size() % loop_size != 0:
                raise ValueError(f'Dataset size {dataset.get_dataset_size()} and '
                                 f'loop_size {loop_size} are not matched.')
            loop_count = math.ceil(dataset.get_dataset_size() / loop_size)
        return loop_count
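A small numeric sketch of the revised loop-count logic (plain Python, made-up sizes): with the added guard, a loop_size larger than the dataset no longer triggers the mismatch error, and math.ceil yields at least one loop.

import math

def loop_count_sketch(dataset_size, loop_size):
    # Mirrors the branch above: only treat a remainder as a mismatch when loop_size fits in the dataset.
    if loop_size <= dataset_size and dataset_size % loop_size != 0:
        raise ValueError(f'Dataset size {dataset_size} and loop_size {loop_size} are not matched.')
    return math.ceil(dataset_size / loop_size)

assert loop_count_sketch(100, 10) == 10   # evenly divisible
assert loop_count_sketch(10, 100) == 1    # loop_size exceeds the dataset -> a single loop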
...
@@ -60,8 +60,9 @@ def test_group_lr():
    default_lr = 0.1
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
    group_params = [{'params': no_conv_params},
                    {'params': conv_params, 'lr': conv_lr},
                    {'order_params': net.trainable_params()}]
    net.set_train()
    loss = nn.SoftmaxCrossEntropyWithLogits()
@@ -69,12 +70,15 @@ def test_group_lr():
    assert opt.is_group is True
    assert opt.is_group_lr is True
    assert opt.dynamic_lr is False
    assert opt.is_group_params_ordered is True
    for lr, param, order_param in zip(opt.learning_rate, opt.parameters, net.trainable_params()):
        if param in conv_params:
            assert np.all(lr.data.asnumpy() == Tensor(conv_lr, mstype.float32).asnumpy())
        else:
            assert np.all(lr.data.asnumpy() == Tensor(default_lr, mstype.float32).asnumpy())
        assert param.name == order_param.name

    net_with_loss = WithLossCell(net, loss)
    train_network = TrainOneStepCell(net_with_loss, opt)
    _executor.compile(train_network, inputs, label)
@@ -89,20 +93,24 @@ def test_group_dynamic_1():
    default_lr = (0.1, 0.2, 0.3)
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
    group_params = [{'params': no_conv_params},
                    {'params': conv_params, 'lr': conv_lr},
                    {'order_params': net.trainable_params()}]
    net.set_train()
    loss = nn.SoftmaxCrossEntropyWithLogits()

    opt = Momentum(group_params, learning_rate=default_lr, momentum=0.9)
    assert opt.is_group is True
    assert opt.dynamic_lr is True
    assert opt.is_group_params_ordered is True
    for lr, param, order_param in zip(opt.learning_rate, opt.parameters, net.trainable_params()):
        if param in conv_params:
            assert np.all(lr.data.asnumpy() == Tensor(np.array([conv_lr] * 3).astype(np.float32)).asnumpy())
        else:
            assert np.all(lr.data.asnumpy() == Tensor(np.array(list(default_lr)).astype(np.float32)).asnumpy())
        assert param.name == order_param.name

    net_with_loss = WithLossCell(net, loss)
    train_network = TrainOneStepCell(net_with_loss, opt)
    _executor.compile(train_network, inputs, label)
@@ -127,9 +135,9 @@ def test_group_dynamic_2():
    assert opt.dynamic_lr is True
    for lr, param in zip(opt.learning_rate, opt.parameters):
        if param in conv_params:
            assert np.all(lr.data.asnumpy() == Tensor(np.array(list(conv_lr)).astype(np.float32)).asnumpy())
        else:
            assert np.all(lr.data.asnumpy() == Tensor(np.array([default_lr] * 3).astype(np.float32)).asnumpy())

    net_with_loss = WithLossCell(net, loss)
    train_network = TrainOneStepCell(net_with_loss, opt)
@@ -180,15 +188,18 @@ def test_weight_decay():
    default_weight_decay = 0.0
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
    group_params = [{'params': no_conv_params},
                    {'params': conv_params, 'weight_decay': conv_weight_decay},
                    {'order_params': net.trainable_params()}]
    net.set_train()
    loss = nn.SoftmaxCrossEntropyWithLogits()

    opt = SGD(group_params, learning_rate=0.1, weight_decay=default_weight_decay)
    assert opt.is_group is True
    assert opt.is_group_lr is False
    assert opt.is_group_params_ordered is True
    for weight_decay, decay_flags, param, order_param in zip(
            opt.weight_decay, opt.decay_flags, opt.parameters, net.trainable_params()):
        if param in conv_params:
            assert weight_decay == conv_weight_decay
            assert decay_flags is True
@@ -196,6 +207,8 @@ def test_weight_decay():
            assert weight_decay == default_weight_decay
            assert decay_flags is False
        assert param.name == order_param.name

    net_with_loss = WithLossCell(net, loss)
    train_network = TrainOneStepCell(net_with_loss, opt)
    _executor.compile(train_network, inputs, label)
@@ -233,6 +246,19 @@ def test_get_lr_parameter_with_group():
        assert lr.name == 'lr_' + param.name


def test_get_lr_parameter_with_order_group():
    net = LeNet5()
    conv_lr = 0.1
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    group_params = [{'params': conv_params, 'lr': conv_lr},
                    {'order_params': net.trainable_params()}]
    opt = SGD(group_params)
    assert opt.is_group_lr is True
    for param in opt.parameters:
        lr = opt.get_lr_parameter(param)
        assert lr.name == 'lr_' + param.name


def test_get_lr_parameter_with_no_group():
    net = LeNet5()
    conv_weight_decay = 0.8
@@ -250,3 +276,125 @@ def test_get_lr_parameter_with_no_group():
    params_error = [1, 2, 3]
    with pytest.raises(TypeError):
        opt.get_lr_parameter(params_error)
def test_order_params_lr():
    net = LeNet5()
    conv_lr = 0.01
    default_lr = 0.1
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    group_params = [{'params': conv_params, 'lr': conv_lr},
                    {'order_params': net.trainable_params()}]
    opt = SGD(group_params, learning_rate=default_lr)
    assert opt.is_group is True
    assert opt.is_group_lr is True
    assert opt.is_group_params_ordered is True
    for lr, param, order_param in zip(opt.learning_rate, opt.parameters, net.trainable_params()):
        if param in conv_params:
            assert np.all(lr.data.asnumpy() == Tensor(conv_lr, mstype.float32).asnumpy())
        else:
            assert np.all(lr.data.asnumpy() == Tensor(default_lr, mstype.float32).asnumpy())
        assert param.name == order_param.name
        assert lr.name == 'lr_' + param.name


def test_order_params_weight_decay():
    net = LeNet5()
    conv_weight_decay = 0.01
    default_wd = 0.0
    default_lr = 0.1
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay},
                    {'order_params': net.trainable_params()}]
    opt = SGD(group_params, learning_rate=default_lr, weight_decay=default_wd)
    assert opt.is_group is True
    assert opt.is_group_lr is False
    assert opt.is_group_params_ordered is True
    assert opt.learning_rate.name == "learning_rate"
    assert np.all(opt.learning_rate.data.asnumpy() == Tensor(default_lr, mstype.float32).asnumpy())
    for weight_decay, decay_flags, param, order_param in zip(
            opt.weight_decay, opt.decay_flags, opt.parameters, net.trainable_params()):
        if param in conv_params:
            assert weight_decay == conv_weight_decay
            assert decay_flags is True
        else:
            assert weight_decay == default_wd
            assert decay_flags is False
        assert param.name == order_param.name


def test_order_params_all_1():
    net = LeNet5()
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
    group_params = [{'params': conv_params, 'weight_decay': 0.01},
                    {'params': bias_params, 'lr': 0.01},
                    {'order_params': net.trainable_params()}]
    opt = SGD(group_params, learning_rate=0.1, weight_decay=0.0)
    assert opt.is_group is True
    assert opt.is_group_lr is True
    assert opt.is_group_params_ordered is True
    for weight_decay, decay_flags, lr, param, order_param in zip(
            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, net.trainable_params()):
        if param in conv_params:
            assert np.all(lr.data.asnumpy() == Tensor(0.1, mstype.float32).asnumpy())
            assert weight_decay == 0.01
            assert decay_flags is True
        elif param in bias_params:
            assert np.all(lr.data.asnumpy() == Tensor(0.01, mstype.float32).asnumpy())
            assert weight_decay == 0.0
            assert decay_flags is False
        else:
            assert np.all(lr.data.asnumpy() == Tensor(0.1, mstype.float32).asnumpy())
            assert weight_decay == 0.0
            assert decay_flags is False
        assert param.name == order_param.name
        assert lr.name == 'lr_' + param.name


def test_order_params_all_2():
    net = LeNet5()
    conv_weight_decay = 0.01
    fc1_lr = (0.5, 0.4, 0.3)
    default_lr = 0.1
    default_wd = 0.0
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    fc1_params = list(filter(lambda x: 'fc1' in x.name, net.trainable_params()))
    group_params = [{'params': fc1_params, 'lr': fc1_lr},
                    {'params': conv_params, 'weight_decay': conv_weight_decay},
                    {'order_params': net.trainable_params()}]
    opt = SGD(group_params, learning_rate=default_lr, weight_decay=default_wd)
    assert opt.is_group is True
    assert opt.is_group_lr is True
    assert opt.is_group_params_ordered is True
    for weight_decay, decay_flags, lr, param, order_param in zip(
            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, net.trainable_params()):
        if param in conv_params:
            assert np.all(lr.data.asnumpy() == Tensor(np.array([default_lr] * 3), mstype.float32).asnumpy())
            assert weight_decay == conv_weight_decay
            assert decay_flags is True
        elif param in fc1_params:
            assert np.all(lr.data.asnumpy() == Tensor(fc1_lr, mstype.float32).asnumpy())
            assert weight_decay == default_wd
            assert decay_flags is False
        else:
            assert np.all(lr.data.asnumpy() == Tensor(np.array([default_lr] * 3), mstype.float32).asnumpy())
            assert weight_decay == default_wd
            assert decay_flags is False
        assert param.name == order_param.name
        assert lr.name == 'lr_' + param.name


def test_get_order_params_with_not_include():
    net = LeNet5()
    conv_weight_decay = 0.8
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
    group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay},
                    {'order_params': no_conv_params}]
    with pytest.raises(ValueError):
        SGD(group_params)