Commit 9708b21f authored by wanghaoshuang

Refine the model average options

1. Add an 'average' attr to ParamAttr.
2. Make 'params_grads' optional for ModelAverage.
3. Add 'average_mean' and 'average_variance' options to batch_norm.
Parent d1a7b47e
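Taken together, these changes let a model opt parameters in or out of averaging declaratively instead of relying on the old batch_norm name regex. Below is a minimal usage sketch, not part of the commit: the layer names, sizes, and the fc layer are illustrative assumptions, and the import paths are the usual paddle.fluid ones. Item 3 (the batch_norm flags) is sketched separately after the batch_norm hunks below.

```python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
# 1. A parameter can now opt out of model averaging via ParamAttr(average=False).
y = fluid.layers.fc(input=x,
                    size=10,
                    param_attr=fluid.ParamAttr(name='fc.w', average=False))
# 2. params_grads is now optional: parameters whose 'average' attribute is True
#    are collected from the default main program automatically.
model_average = fluid.optimizer.ModelAverage(average_window_rate=0.15,
                                             min_average_window=10000,
                                             max_average_window=20000)
```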
python/paddle/fluid/framework.py
@@ -1137,6 +1137,8 @@ class Parameter(Variable):
         self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
 
+        self.average = kwargs.get('average', True)
+
     def __str__(self):
         return self.to_string(True)
@@ -1157,7 +1159,7 @@ class Parameter(Variable):
         if with_details:
             res_str = Variable.to_string(self, throw_on_error, True)
             additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "gradient_clip_attr")
+                               "gradient_clip_attr", "average")
             for attr_name in additional_attr:
                 res_str += "%s: %s\n" % (attr_name,
                                          str(getattr(self, attr_name)))
......
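Since 'average' is now part of the Parameter kwargs and of the detailed string dump, a quick way to inspect it is to print parameters with details. A hedged sketch (the layer and shapes are made up; it assumes the usual fluid layers API):

```python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[8], dtype='float32')
y = fluid.layers.fc(input=x, size=4)

for p in fluid.default_main_program().global_block().all_parameters():
    # The detailed dump now lists "average: True" next to trainable,
    # optimize_attr, regularizer and gradient_clip_attr.
    print(p.to_string(throw_on_error=True, with_details=True))
```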
python/paddle/fluid/layers/nn.py
@@ -1486,7 +1486,9 @@ def batch_norm(input,
                in_place=False,
                name=None,
                moving_mean_name=None,
-               moving_variance_name=None):
+               moving_variance_name=None,
+               average_mean=True,
+               average_variance=True):
     """
     This function helps create an operator to implement
     the BatchNorm layer using the configurations from the input parameters.
@@ -1517,7 +1519,10 @@ def batch_norm(input,
     mean = helper.create_parameter(
         attr=ParamAttr(
-            name=moving_mean_name, initializer=Constant(0.0), trainable=False),
+            name=moving_mean_name,
+            initializer=Constant(0.0),
+            trainable=False,
+            average=average_mean),
         shape=param_shape,
         dtype=input.dtype)
     mean.stop_gradient = True
@@ -1526,7 +1531,8 @@ def batch_norm(input,
         attr=ParamAttr(
             name=moving_variance_name,
             initializer=Constant(1.0),
-            trainable=False),
+            trainable=False,
+            average=average_variance),
         shape=param_shape,
         dtype=input.dtype)
     variance.stop_gradient = True
......
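With the two new flags, a batch_norm layer can keep its moving statistics out of the averaged model. A small sketch, not from the commit; the conv layer and shapes are illustrative assumptions:

```python
import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
conv = fluid.layers.conv2d(input=img, num_filters=16, filter_size=3)
# Exclude this layer's moving mean and moving variance from ModelAverage.
bn = fluid.layers.batch_norm(input=conv,
                             average_mean=False,
                             average_variance=False)
```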
python/paddle/fluid/optimizer.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import re
 from collections import defaultdict
 from paddle.fluid.framework import Program
 import framework
@@ -818,8 +818,8 @@ class ModelAverage(Optimizer):
     min_average_window, max_average_window and current update times.
 
     Args:
-        params_grads: A list of parameter-grad variable pairs.
         average_window_rate: The rate of average window.
+        params_grads: A list of parameter-grad variable pairs.
         min_average_window: The minimum size of average window.
         max_average_window: The maximum size of average window.
@@ -840,8 +840,8 @@ class ModelAverage(Optimizer):
     """
 
     def __init__(self,
-                 params_grads,
-                 average_window_rate,
+                 average_window_rate=0.15,
+                 params_grads=None,
                  min_average_window=10000,
                  max_average_window=10000,
                  **kwargs):
@@ -849,25 +849,21 @@ class ModelAverage(Optimizer):
         self.average_window = average_window_rate
         self.min_average_window = min_average_window
         self.max_average_window = max_average_window
-        self.params_grads = params_grads
-        # append 'moving mean' and 'moving variance' to self.params_grads
-        pattern = re.compile(r"batch_norm_\d+\.w_[1,2]")
+        self.params_grads = [] if params_grads is None else params_grads
+        params = {}
+        for param, grad in self.params_grads:
+            params[param.name] = (param, grad)
         for param in framework.default_main_program().global_block(
         ).all_parameters():
-            if pattern.match(param.name) is not None:
-                self.params_grads.append((param, None))
-        # create a tmp gradient variable to backup parameter value
-        # for parameter whose grad is None
-        for i, param_grad in enumerate(self.params_grads):
-            param, grad = param_grad
-            if grad is None:
+            if param.name not in params and param.average:
                 grad = param.block.create_var(
                     name=unique_name.generate(".".join([param.name, 'tmp'])),
                     dtype=param.dtype,
                     persistable=False,
-                    stop_gradient=stop_gradient)
-                self.params_grads[i] = (param, grad)
+                    stop_gradient=True)
+                params[param.name] = (param, grad)
+        self.params_grads = params.values()
 
         for param, grad in self.params_grads:
             self._append_average_accumulate_op(param)
......
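The rewritten constructor body boils down to: explicit (param, grad) pairs win, every remaining parameter with average=True gets a temporary variable as its grad slot, and duplicates are avoided by keying on the parameter name. A framework-free sketch of the same idea; collect_params_grads, make_tmp_var and the param objects are hypothetical stand-ins, not Paddle APIs:

```python
def collect_params_grads(explicit_pairs, all_parameters, make_tmp_var):
    """Mirror of the new ModelAverage collection logic, in plain Python."""
    params = {}
    for param, grad in explicit_pairs:        # pairs the caller passed in
        params[param.name] = (param, grad)
    for param in all_parameters:              # e.g. the main program's parameters
        if param.name not in params and param.average:
            # Parameters without an explicit grad get a temporary variable
            # so the accumulate op has something to write through.
            params[param.name] = (param, make_tmp_var(param))
    return list(params.values())
```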
python/paddle/fluid/param_attr.py
@@ -28,13 +28,15 @@ class ParamAttr(object):
                 learning_rate=1.0,
                 regularizer=None,
                 trainable=True,
-                 gradient_clip=None):
+                 gradient_clip=None,
+                 average=True):
         self.name = name
         self.initializer = initializer
         self.learning_rate = learning_rate
         self.regularizer = regularizer
         self.trainable = trainable
         self.gradient_clip = gradient_clip
+        self.average = average
 
     def set_default_initializer(self, initializer):
         if initializer is None:
@@ -80,7 +82,8 @@ class ParamAttr(object):
             },
             'regularizer': self.regularizer,
             'trainable': self.trainable,
-            'gradient_clip_attr': self.gradient_clip
+            'gradient_clip_attr': self.gradient_clip,
+            'average': self.average
         }
         if with_initializer:
             kwargs['initializer'] = self.initializer
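The flag is forwarded to Parameter through the same kwargs path as the other attributes. Assuming the surrounding helper shown in this hunk is ParamAttr.to_kwargs(), a quick check might look like this:

```python
from paddle.fluid.param_attr import ParamAttr

attr = ParamAttr(name='conv.w', average=False)
kwargs = attr.to_kwargs(with_initializer=False)
print(kwargs['average'])    # False
print(kwargs['trainable'])  # True (unchanged default)
```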
@@ -90,7 +93,7 @@ class ParamAttr(object):
 class WeightNormParamAttr(ParamAttr):
     """
     Used for weight normalization. Any field in ParamAttr can also be set here.
     Besides, an extra field dim can be set to indicate the dimension except
     which to normalize.
     """
     # List to record the parameters reparameterized by weight normalization.
......
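Because WeightNormParamAttr inherits every ParamAttr field, the new flag composes with weight normalization as well. A hypothetical example (dim=0, the fc layer and shapes are illustrative assumptions):

```python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[32], dtype='float32')
out = fluid.layers.fc(input=x,
                      size=16,
                      param_attr=fluid.WeightNormParamAttr(dim=0, average=False))
```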