diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index ea9abdcae2e11290b1f90f6008723815da523aa7..4b841ef31dcb67ab660475cf6e231fd8a4ae83d6 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1183,6 +1183,8 @@ class Parameter(Variable):
 
         self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
 
+        self.do_model_average = kwargs.get('do_model_average', None)
+
     def __str__(self):
         return self.to_string(True)
 
@@ -1203,7 +1205,7 @@ class Parameter(Variable):
         if with_details:
             res_str = Variable.to_string(self, throw_on_error, True)
             additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "gradient_clip_attr")
+                               "gradient_clip_attr", "do_model_average")
             for attr_name in additional_attr:
                 res_str += "%s: %s\n" % (attr_name,
                                          str(getattr(self, attr_name)))
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index d2e7d58524bfb11627b6acb36ef873c41b348f0f..7ca4ed9a7be32a90e2186f07a5454f1a0e236891 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1516,7 +1516,8 @@ def batch_norm(input,
                in_place=False,
                name=None,
                moving_mean_name=None,
-               moving_variance_name=None):
+               moving_variance_name=None,
+               do_model_average_for_mean_and_var=False):
     """
     This function helps create an operator to implement
     the BatchNorm layer using the configurations from the input parameters.
@@ -1547,7 +1548,10 @@ def batch_norm(input,
 
     mean = helper.create_parameter(
         attr=ParamAttr(
-            name=moving_mean_name, initializer=Constant(0.0), trainable=False),
+            name=moving_mean_name,
+            initializer=Constant(0.0),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var),
         shape=param_shape,
         dtype=input.dtype)
     mean.stop_gradient = True
@@ -1556,7 +1560,8 @@ def batch_norm(input,
         attr=ParamAttr(
             name=moving_variance_name,
             initializer=Constant(1.0),
-            trainable=False),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var),
         shape=param_shape,
         dtype=input.dtype)
     variance.stop_gradient = True
@@ -3374,14 +3379,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
     Here are some examples to explain it.
 
     1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
-    is [6, 8], the reshape operator will transform x into a 2-D tensor with 
+    is [6, 8], the reshape operator will transform x into a 2-D tensor with
     shape [6, 8] and leaving x's data unchanged.
 
     2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
     specified is [2, 3, -1, 2], the reshape operator will transform x into a
     4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this
-    case, one dimension of the target shape is set to -1, the value of this 
-    dimension is inferred from the total element number of x and remaining 
+    case, one dimension of the target shape is set to -1, the value of this
+    dimension is inferred from the total element number of x and remaining
     dimensions.
 
     3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
@@ -3615,7 +3620,7 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
 def pad(x, paddings, pad_value=0., name=None):
     """
     Pads a tensor with a constant value given by :attr:`pad_value`, and the
-    padded width is specified by :attr:`paddings`. 
+    padded width is specified by :attr:`paddings`.
 
     Specifically, the number of values padded before the contents of :attr:`x`
     in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number
@@ -3643,7 +3648,7 @@ def pad(x, paddings, pad_value=0., name=None):
         x (Variable): The input tensor variable.
         paddings (list): A list of integers. Its elements specify the padded
                          width before and after for each dimension in turn.
-                         The length of :attr:paddings must be 
+                         The length of :attr:paddings must be
                          :math:`rank(x) \\times 2`.
         pad_value (float): The constant value used to pad.
         name(str|None): A name for this layer(optional). If set None, the layer
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 180575c35dc6e115e11cccf9fff9fb2d3cd7e9a6..36503cac6d5391821b977d90e6b77c4df7e3b564 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import re
 from collections import defaultdict
 from paddle.fluid.framework import Program
 import framework
@@ -818,8 +818,8 @@ class ModelAverage(Optimizer):
     min_average_window, max_average_window and current update times.
 
     Args:
-        params_grads: A list of parameter-grad variable pairs.
         average_window_rate: The rate of average window.
+        params_grads: A list of parameter-grad variable pairs.
         min_average_window: The minimum size of average window.
         max_average_window: The maximum size of average window.
 
@@ -840,8 +840,8 @@
     """
 
     def __init__(self,
-                 params_grads,
                  average_window_rate,
+                 params_grads=None,
                  min_average_window=10000,
                  max_average_window=10000,
                  **kwargs):
@@ -849,24 +849,37 @@
         self.average_window = average_window_rate
         self.min_average_window = min_average_window
         self.max_average_window = max_average_window
-        self.params_grads = params_grads
+
+        self.params_grads = [] if params_grads is None else params_grads
+        params = {}
+        for param, grad in self.params_grads:
+            if param.do_model_average != False:
+                params[param.name] = (param, grad)
+        for param in framework.default_main_program().global_block(
+        ).all_parameters():
+            if param.name not in params and param.do_model_average != False:
+                grad = param.block.create_var(
+                    name=unique_name.generate(".".join([param.name, 'tmp'])),
+                    dtype=param.dtype,
+                    persistable=False,
+                    stop_gradient=True)
+                params[param.name] = (param, grad)
+        self.params_grads = params.values()
+
         for param, grad in self.params_grads:
-            if grad is not None:
-                self._append_average_accumulate_op(param)
+            self._append_average_accumulate_op(param)
 
         self.apply_program = Program()
         block = self.apply_program.global_block()
         with program_guard(main_program=self.apply_program):
             for param_grad in self.params_grads:
-                if param_grad[1] is not None:
-                    self._add_average_apply_op(block, param_grad)
+                self._add_average_apply_op(block, param_grad)
 
         self.restore_program = Program()
         block = self.restore_program.global_block()
         with program_guard(main_program=self.restore_program):
             for param_grad in self.params_grads:
-                if param_grad[1] is not None:
-                    self._add_average_restore_op(block, param_grad)
+                self._add_average_restore_op(block, param_grad)
 
     def _add_average_apply_op(self, block, param_grad):
         param = block.clone_variable(param_grad[0])
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index 255cd2104325afa31449cbd3875499a7c5d7f572..1c6970441bccdc1c1221503256c30c83502bd123 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -28,13 +28,15 @@ class ParamAttr(object):
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 gradient_clip=None):
+                 gradient_clip=None,
+                 do_model_average=None):
         self.name = name
         self.initializer = initializer
         self.learning_rate = learning_rate
         self.regularizer = regularizer
         self.trainable = trainable
         self.gradient_clip = gradient_clip
+        self.model_average = do_model_average
 
     def set_default_initializer(self, initializer):
         if initializer is None:
@@ -80,7 +82,8 @@ class ParamAttr(object):
             },
             'regularizer': self.regularizer,
             'trainable': self.trainable,
-            'gradient_clip_attr': self.gradient_clip
+            'gradient_clip_attr': self.gradient_clip,
+            'model_average': self.model_average
         }
         if with_initializer:
             kwargs['initializer'] = self.initializer
@@ -90,7 +93,7 @@ class ParamAttr(object):
 class WeightNormParamAttr(ParamAttr):
     """
     Used for weight normalization. Any field in ParamAttr can also be set here.
-    Besides, an extra field dim can be set to indicate the dimension except 
+    Besides, an extra field dim can be set to indicate the dimension except
     which to normalize.
     """
     # List to record the parameters reparameterized by weight normalization.
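
For context, the sketch below (not part of the patch) shows how the pieces introduced here would be used together: do_model_average_for_mean_and_var lets batch_norm's moving mean/variance take part in model averaging, ParamAttr(do_model_average=...) lets an individual parameter opt out, and ModelAverage can now be constructed without an explicit params_grads list because it collects the parameters of the default main program itself. The network, layer sizes, and hyper-parameters are illustrative placeholders, not taken from the PR.

import paddle.fluid as fluid

# Hypothetical toy network; shapes and hyper-parameters are placeholders.
image = fluid.layers.data(name='image', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')

hidden = fluid.layers.fc(input=image, size=128, act='relu')
# New flag from this patch: the moving mean/variance (created as
# non-trainable parameters) are included in model averaging.
hidden = fluid.layers.batch_norm(
    input=hidden, do_model_average_for_mean_and_var=True)

# A parameter can be excluded from averaging through its ParamAttr.
prediction = fluid.layers.fc(
    input=hidden,
    size=10,
    act='softmax',
    param_attr=fluid.ParamAttr(do_model_average=False))
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)

optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
optimizer.minimize(avg_cost)

# params_grads is now optional: ModelAverage walks the parameters of the
# default main program and skips those whose do_model_average is False.
# Averaged values are swapped in and out by running its apply_program and
# restore_program.
model_average = fluid.optimizer.ModelAverage(
    0.15, min_average_window=10000, max_average_window=20000)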