diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 421963a2f9120dae3a72142681f0a30232c11166..5d90bb532d93b2853a0c8069c8b0083a5d5ef1d7 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from collections import defaultdict
-
+from paddle.fluid.framework import Program
 import framework
 import layers
 from backward import append_backward
@@ -24,7 +24,10 @@ from layer_helper import LayerHelper
 from regularizer import append_regularization_ops
 from clip import append_gradient_clip_ops, error_clip_callback
 
-__all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad']
+__all__ = [
+    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad',
+    'ModelAverage'
+]
 
 
 class Optimizer(object):
@@ -119,7 +122,12 @@ class Optimizer(object):
         """
         pass
 
-    def _add_accumulator(self, name, param, dtype=None, fill_value=0.0):
+    def _add_accumulator(self,
+                         name,
+                         param,
+                         dtype=None,
+                         fill_value=0.0,
+                         shape=None):
         """Utility function to add an accumulator for a parameter
 
         Args:
@@ -133,17 +141,19 @@ class Optimizer(object):
                 param.name in self._accumulators[name]):
             raise Exception("Accumulator {} already exists for parameter {}".
                             format(name, param.name))
-
+        if shape is None:
+            shape = param.shape
         assert isinstance(self.helper, LayerHelper)
         var = self.helper.create_global_variable(
             name=unique_name.generate(name),
             persistable=True,
             dtype=dtype or param.dtype,
             type=param.type,
-            shape=param.shape)
+            shape=shape)
         self.helper.set_variable_initializer(
             var, initializer=Constant(value=float(fill_value)))
         self._accumulators[name][param.name] = var
+        return var
 
     def _get_accumulator(self, name, param):
         """Utility function to fetch an accumulator for a parameter
@@ -592,3 +602,132 @@ Adagrad = AdagradOptimizer
 Adam = AdamOptimizer
 Adamax = AdamaxOptimizer
 DecayedAdagrad = DecayedAdagradOptimizer
+
+
+class ModelAverage(Optimizer):
+    """Accumulate the average of parameters within a sliding window. The averaged
+    result is saved in temporary variables, which can be applied to the parameter
+    variables of the current model by calling the 'apply()' method. The 'restore()'
+    method is then used to restore the parameter values of the current model.
+
+    The size of the average window is determined by average_window_rate,
+    min_average_window, max_average_window and the current number of updates.
+
+    Args:
+        params_grads: A list of parameter-grad variable pairs.
+        average_window_rate: The rate used to compute the size of the average window.
+        min_average_window: The minimum size of the average window.
+        max_average_window: The maximum size of the average window.
+
+    Examples:
+      ...
+      optimizer = fluid.optimizer.Momentum()
+      _, params_grads = optimizer.minimize(cost)
+      model_average = fluid.optimizer.ModelAverage(params_grads, 0.15,
+                                                   min_average_window=10000,
+                                                   max_average_window=20000)
+      for pass_id in range(args.pass_num):
+          for data in train_reader():
+              exe.run(fluid.default_main_program()...)
+          model_average.apply(exe)
+          for data in test_reader():
+              exe.run(inference_program...)
+          model_average.restore(exe)
+    """
+
+    def __init__(self,
+                 params_grads,
+                 average_window_rate,
+                 min_average_window=10000,
+                 max_average_window=10000,
+                 **kwargs):
+        super(ModelAverage, self).__init__(0.0, **kwargs)
+        self.average_window = average_window_rate
+        self.min_average_window = min_average_window
+        self.max_average_window = max_average_window
+        self.params_grads = params_grads
+        for param, _ in self.params_grads:
+            self._append_average_accumulate_op(param)
+
+    def _add_average_apply_op(self, block, param_grad):
+        param = block.clone_variable(param_grad[0])
+        grad = block.clone_variable(param_grad[1])
+        sum_1 = block.clone_variable(self._get_accumulator('sum_1', param))
+        sum_2 = block.clone_variable(self._get_accumulator('sum_2', param))
+        sum_3 = block.clone_variable(self._get_accumulator('sum_3', param))
+        num_accumulates = block.clone_variable(
+            self._get_accumulator('num_accumulates', param))
+        old_num_accumulates = block.clone_variable(
+            self._get_accumulator('old_num_accumulates', param))
+        num_updates = block.clone_variable(
+            self._get_accumulator('num_updates', param))
+        # Back up the current parameter values into grad.
+        layers.assign(input=param, output=grad)
+        # param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates)
+        tmp = layers.sum(x=[num_accumulates, old_num_accumulates])
+        sum = layers.sum(x=[sum_1, sum_2, sum_3])
+        tmp = layers.cast(x=tmp, dtype='float32')
+        sum = layers.cast(x=sum, dtype='float32')
+        layers.elementwise_div(x=sum, y=tmp, out=param)
+
+    def _add_average_restore_op(self, block, param_grad):
+        param = block.clone_variable(param_grad[0])
+        grad = block.clone_variable(param_grad[1])
+        layers.assign(input=grad, output=param)
+
+    def _append_average_accumulate_op(self, param):
+        self.helper = LayerHelper("average_accumulate")
+        sum_1 = self._add_accumulator('sum_1', param)
+        sum_2 = self._add_accumulator('sum_2', param)
+        sum_3 = self._add_accumulator('sum_3', param)
+        num_accumulates = self._add_accumulator(
+            'num_accumulates', param, dtype='int64', shape=[1])
+        old_num_accumulates = self._add_accumulator(
+            'old_num_accumulates', param, dtype='int64', shape=[1])
+        num_updates = self._add_accumulator(
+            'num_updates', param, dtype='int64', shape=[1])
+
+        self.helper.append_op(
+            type='average_accumulates',
+            inputs={
+                "param": param,
+                "in_sum_1": sum_1,
+                "in_sum_2": sum_2,
+                "in_sum_3": sum_3,
+                "in_num_accumulates": num_accumulates,
+                "in_old_num_accumulates": old_num_accumulates,
+                "in_num_updates": num_updates
+            },
+            outputs={
+                "out_sum_1": sum_1,
+                "out_sum_2": sum_2,
+                "out_sum_3": sum_3,
+                "out_num_accumulates": num_accumulates,
+                "out_old_num_accumulates": old_num_accumulates,
+                "out_num_updates": num_updates,
+            },
+            attrs={
+                "average_window": self.average_window,
+                "min_average_window": self.min_average_window,
+                "max_average_window": self.max_average_window,
+            })
+
+    def apply(self, executor):
+        """Apply the averaged values to the parameters of the current model.
+        """
+        apply_program = Program()
+        block = apply_program.global_block()
+        with program_guard(main_program=apply_program):
+            for param_grad in self.params_grads:
+                self._add_average_apply_op(block, param_grad)
+        executor.run(apply_program)
+
+    def restore(self, executor):
+        """Restore the parameter values of the current model.
+        """
+        restore_program = Program()
+        block = restore_program.global_block()
+        with program_guard(main_program=restore_program):
+            for param_grad in self.params_grads:
+                self._add_average_restore_op(block, param_grad)
+        executor.run(restore_program)
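For context, below is a fuller version of the docstring's usage sketch. It is a minimal sketch only: the toy regression network, the random feed data, and the window sizes are placeholders invented for illustration (not part of this patch), and it assumes the 2018-era `paddle.fluid` Python API that this file belongs to.

```python
import numpy
import paddle.fluid as fluid

# Toy network: one fully-connected layer fitted to random data,
# used only to exercise ModelAverage end to end.
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(x=cost)

optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
_, params_grads = optimizer.minimize(avg_cost)

# Accumulate a sliding-window average of the parameters during training.
model_average = fluid.optimizer.ModelAverage(
    params_grads, 0.15, min_average_window=100, max_average_window=200)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

for pass_id in range(5):
    for _ in range(20):
        x_data = numpy.random.random((8, 13)).astype('float32')
        y_data = numpy.random.random((8, 1)).astype('float32')
        exe.run(fluid.default_main_program(),
                feed={'x': x_data, 'y': y_data},
                fetch_list=[avg_cost])
    # Swap the averaged values into the parameters for evaluation ...
    model_average.apply(exe)
    # ... run inference/testing here with the averaged weights ...
    # then put the original (non-averaged) parameter values back.
    model_average.restore(exe)
```

Note that `apply()` and `restore()` each build and run a small standalone `Program`, so they need the same executor (and scope) that holds the trained parameters.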
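The apply/restore pair relies on the accumulators built in `_append_average_accumulate_op`: `apply()` first backs the current parameter value up into the gradient variable and then overwrites the parameter with `(sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates)`, while `restore()` copies the backup back. A small NumPy illustration of that arithmetic is shown below; the array values are made-up stand-ins for the accumulator tensors, not data from the patch.

```python
import numpy as np

# Hypothetical accumulator contents for one 2x2 parameter.
sum_1 = np.array([[0.2, 0.4], [0.6, 0.8]], dtype=np.float32)
sum_2 = np.array([[2.0, 4.0], [6.0, 8.0]], dtype=np.float32)
sum_3 = np.zeros((2, 2), dtype=np.float32)
num_accumulates = np.array([2], dtype=np.int64)
old_num_accumulates = np.array([20], dtype=np.int64)

# The same computation that _add_average_apply_op expresses with fluid layers:
# cast the integer counters to float and divide elementwise.
total = (num_accumulates + old_num_accumulates).astype(np.float32)
averaged_param = (sum_1 + sum_2 + sum_3) / total
print(averaged_param)  # the value apply() would write into the parameter
```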