From 9708b21f191b3ff606651dfaeb7cf65dfd250881 Mon Sep 17 00:00:00 2001
From: wanghaoshuang
Date: Mon, 2 Apr 2018 10:51:31 +0800
Subject: [PATCH] Refine model average options

1. Add attr 'average' into ParamAttr.
2. Make 'params_grads' optional for ModelAverage.
3. Add options 'average_mean' and 'average_variance' for batch_norm.
---
 python/paddle/fluid/framework.py  |  4 +++-
 python/paddle/fluid/layers/nn.py  | 12 +++++++++---
 python/paddle/fluid/optimizer.py  | 28 ++++++++++++----------------
 python/paddle/fluid/param_attr.py |  9 ++++++---
 4 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 3e78788f47..92c299a4b6 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1137,6 +1137,8 @@ class Parameter(Variable):
 
         self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
 
+        self.average = kwargs.get('average', True)
+
     def __str__(self):
         return self.to_string(True)
 
@@ -1157,7 +1159,7 @@ class Parameter(Variable):
         if with_details:
             res_str = Variable.to_string(self, throw_on_error, True)
             additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "gradient_clip_attr")
+                               "gradient_clip_attr", "average")
             for attr_name in additional_attr:
                 res_str += "%s: %s\n" % (attr_name,
                                          str(getattr(self, attr_name)))
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 0332556f62..3265ff733b 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1486,7 +1486,9 @@ def batch_norm(input,
                in_place=False,
                name=None,
                moving_mean_name=None,
-               moving_variance_name=None):
+               moving_variance_name=None,
+               average_mean=True,
+               average_variance=True):
     """
     This function helps create an operator to implement
     the BatchNorm layer using the configurations from the input parameters.
@@ -1517,7 +1519,10 @@ def batch_norm(input,
 
     mean = helper.create_parameter(
         attr=ParamAttr(
-            name=moving_mean_name, initializer=Constant(0.0), trainable=False),
+            name=moving_mean_name,
+            initializer=Constant(0.0),
+            trainable=False,
+            average=average_mean),
         shape=param_shape,
         dtype=input.dtype)
     mean.stop_gradient = True
@@ -1526,7 +1531,8 @@ def batch_norm(input,
         attr=ParamAttr(
             name=moving_variance_name,
             initializer=Constant(1.0),
-            trainable=False),
+            trainable=False,
+            average=average_variance),
         shape=param_shape,
         dtype=input.dtype)
     variance.stop_gradient = True
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index d21320f705..560257a356 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import re
 from collections import defaultdict
 from paddle.fluid.framework import Program
 import framework
@@ -818,8 +818,8 @@ class ModelAverage(Optimizer):
     min_average_window, max_average_window and current update times.
 
     Args:
-        params_grads: A list of parameter-grad variable pairs.
         average_window_rate: The rate of average window.
+        params_grads: A list of parameter-grad variable pairs.
         min_average_window: The minimum size of average window.
        max_average_window: The maximum size of average window.
 
@@ -840,8 +840,8 @@ class ModelAverage(Optimizer):
     """
 
     def __init__(self,
-                 params_grads,
-                 average_window_rate,
+                 average_window_rate=0.15,
+                 params_grads=None,
                  min_average_window=10000,
                  max_average_window=10000,
                  **kwargs):
@@ -849,25 +849,21 @@ class ModelAverage(Optimizer):
         self.average_window = average_window_rate
         self.min_average_window = min_average_window
         self.max_average_window = max_average_window
-        self.params_grads = params_grads
-        # append 'moving mean' and 'moving variance' to self.params_grads
-        pattern = re.compile(r"batch_norm_\d+\.w_[1,2]")
+        self.params_grads = [] if params_grads is None else params_grads
+        params = {}
+        for param, grad in self.params_grads:
+            params[param.name] = (param, grad)
         for param in framework.default_main_program().global_block(
         ).all_parameters():
-            if pattern.match(param.name) is not None:
-                self.params_grads.append((param, None))
-        # create a tmp gradient variable to backup parameter value
-        # for parameter whose grad is None
-        for i, param_grad in enumerate(self.params_grads):
-            param, grad = param_grad
-            if grad is None:
+            if param.name not in params and param.average:
                 grad = param.block.create_var(
                     name=unique_name.generate(".".join([param.name, 'tmp'])),
                     dtype=param.dtype,
                     persistable=False,
-                    stop_gradient=stop_gradient)
-                self.params_grads[i] = (param, grad)
+                    stop_gradient=True)
+                params[param.name] = (param, grad)
+        self.params_grads = params.values()
         for param, grad in self.params_grads:
             self._append_average_accumulate_op(param)
 
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index 255cd21043..74b968f8ee 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -28,13 +28,15 @@ class ParamAttr(object):
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 gradient_clip=None):
+                 gradient_clip=None,
+                 average=True):
         self.name = name
         self.initializer = initializer
         self.learning_rate = learning_rate
         self.regularizer = regularizer
         self.trainable = trainable
         self.gradient_clip = gradient_clip
+        self.average = average
 
     def set_default_initializer(self, initializer):
         if initializer is None:
@@ -80,7 +82,8 @@ class ParamAttr(object):
             },
             'regularizer': self.regularizer,
             'trainable': self.trainable,
-            'gradient_clip_attr': self.gradient_clip
+            'gradient_clip_attr': self.gradient_clip,
+            'average': self.average
         }
         if with_initializer:
             kwargs['initializer'] = self.initializer
@@ -90,7 +93,7 @@ class WeightNormParamAttr(ParamAttr):
     """
     Used for weight normalization. Any field in ParamAttr can also be set here.
-    Besides, an extra field dim can be set to indicate the dimension except 
+    Besides, an extra field dim can be set to indicate the dimension except
     which to normalize.
     """
     # List to record the parameters reparameterized by weight normalization.
-- 
GitLab
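Usage sketch (not part of the patch): a minimal example of how the three options
above are intended to be combined, assuming the paddle.fluid Python API of this
period. Making params_grads optional, together with the per-parameter 'average'
flag, replaces the old name-pattern match on batch_norm moving statistics with an
explicit opt-in/opt-out. The network below, the explicit parameter name 'fc_0.w',
and the hyper-parameter values are illustrative only.

import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr

# Illustrative network; sizes and the explicit parameter name are made up.
image = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')

# 1. ParamAttr(average=False) keeps this weight out of model averaging.
hidden = fluid.layers.fc(input=image,
                         size=128,
                         act='relu',
                         param_attr=ParamAttr(name='fc_0.w', average=False))

# 3. average_mean / average_variance control whether batch_norm's moving
#    statistics are picked up by ModelAverage (both default to True).
hidden = fluid.layers.batch_norm(input=hidden,
                                 average_mean=True,
                                 average_variance=True)

prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
loss = fluid.layers.mean(
    fluid.layers.cross_entropy(input=prediction, label=label))

sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)

# 2. 'params_grads' is now optional: when omitted, ModelAverage collects every
#    parameter of the default main program whose 'average' attribute is True.
model_average = fluid.optimizer.ModelAverage(average_window_rate=0.15,
                                             min_average_window=10000,
                                             max_average_window=20000)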