Commit 87fe52c1 authored by W wanghaoshuang

Add ModelAverage class to optimizer.py

Parent 016d0eb7
@@ -13,7 +13,7 @@
# limitations under the License.
from collections import defaultdict
from paddle.fluid.framework import Program, program_guard
import framework
import layers
from backward import append_backward
@@ -24,7 +24,10 @@
from layer_helper import LayerHelper
from regularizer import append_regularization_ops
from clip import append_gradient_clip_ops, error_clip_callback

__all__ = [
    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad',
    'ModelAverage'
]

class Optimizer(object):
@@ -119,7 +122,12 @@ class Optimizer(object):
        """
        pass

    def _add_accumulator(self,
                         name,
                         param,
                         dtype=None,
                         fill_value=0.0,
                         shape=None):
"""Utility function to add an accumulator for a parameter """Utility function to add an accumulator for a parameter
Args: Args:
...@@ -133,17 +141,19 @@ class Optimizer(object): ...@@ -133,17 +141,19 @@ class Optimizer(object):
                param.name in self._accumulators[name]):
            raise Exception("Accumulator {} already exists for parameter {}".
                            format(name, param.name))
        if shape is None:
            shape = param.shape
        assert isinstance(self.helper, LayerHelper)
        var = self.helper.create_global_variable(
            name=unique_name.generate(name),
            persistable=True,
            dtype=dtype or param.dtype,
            type=param.type,
            shape=shape)
        self.helper.set_variable_initializer(
            var, initializer=Constant(value=float(fill_value)))
        self._accumulators[name][param.name] = var
        return var

    def _get_accumulator(self, name, param):
        """Utility function to fetch an accumulator for a parameter
@@ -592,3 +602,132 @@
Adagrad = AdagradOptimizer
Adam = AdamOptimizer
Adamax = AdamaxOptimizer
DecayedAdagrad = DecayedAdagradOptimizer


class ModelAverage(Optimizer):
    """Accumulate the average of parameters within a sliding window. The averaged
    result is saved in temporary variables, which can be applied to the parameter
    variables of the current model by calling the 'apply()' method, and the
    'restore()' method is used to restore the parameter values of the current
    model. The size of the average window is determined by average_window_rate,
    min_average_window, max_average_window and the current number of update steps.

    Args:
        params_grads: A list of parameter-grad variable pairs.
        average_window_rate: The rate of the average window.
        min_average_window: The minimum size of the average window.
        max_average_window: The maximum size of the average window.

    Examples:
        ...
        optimizer = fluid.optimizer.Momentum()
        _, params_grads = optimizer.minimize(cost)
        model_average = fluid.optimizer.ModelAverage(params_grads, 0.15,
                                                     min_average_window=10000,
                                                     max_average_window=20000)
        for pass_id in range(args.pass_num):
            for data in train_reader():
                exe.run(fluid.default_main_program()...)
            model_average.apply(exe)
            for data in test_reader():
                exe.run(inference_program...)
            model_average.restore(exe)
    """

    def __init__(self,
                 params_grads,
                 average_window_rate,
                 min_average_window=10000,
                 max_average_window=10000,
                 **kwargs):
        super(ModelAverage, self).__init__(0.0, **kwargs)
        self.average_window = average_window_rate
        self.min_average_window = min_average_window
        self.max_average_window = max_average_window
        self.params_grads = params_grads
        for param, _ in self.params_grads:
            self._append_average_accumulate_op(param)

    def _add_average_apply_op(self, block, param_grad):
        param = block.clone_variable(param_grad[0])
        grad = block.clone_variable(param_grad[1])
        sum_1 = block.clone_variable(self._get_accumulator('sum_1', param))
        sum_2 = block.clone_variable(self._get_accumulator('sum_2', param))
        sum_3 = block.clone_variable(self._get_accumulator('sum_3', param))
        num_accumulates = block.clone_variable(
            self._get_accumulator('num_accumulates', param))
        old_num_accumulates = block.clone_variable(
            self._get_accumulator('old_num_accumulates', param))
        num_updates = block.clone_variable(
            self._get_accumulator('num_updates', param))
        # backup param value to grad
        layers.assign(input=param, output=grad)
        # param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates)
        tmp = layers.sum(x=[num_accumulates, old_num_accumulates])
        sum = layers.sum(x=[sum_1, sum_2, sum_3])
        tmp = layers.cast(x=tmp, dtype='float32')
        sum = layers.cast(x=sum, dtype='float32')
        layers.elementwise_div(x=sum, y=tmp, out=param)

    def _add_average_restore_op(self, block, param_grad):
        param = block.clone_variable(param_grad[0])
        grad = block.clone_variable(param_grad[1])
        layers.assign(input=grad, output=param)

    def _append_average_accumulate_op(self, param):
        self.helper = LayerHelper("average_accumulate")
        sum_1 = self._add_accumulator('sum_1', param)
        sum_2 = self._add_accumulator('sum_2', param)
        sum_3 = self._add_accumulator('sum_3', param)
        num_accumulates = self._add_accumulator(
            'num_accumulates', param, dtype='int64', shape=[1])
        old_num_accumulates = self._add_accumulator(
            'old_num_accumulates', param, dtype='int64', shape=[1])
        num_updates = self._add_accumulator(
            'num_updates', param, dtype='int64', shape=[1])

        self.helper.append_op(
            type='average_accumulates',
            inputs={
                "param": param,
                "in_sum_1": sum_1,
                "in_sum_2": sum_2,
                "in_sum_3": sum_3,
                "in_num_accumulates": num_accumulates,
                "in_old_num_accumulates": old_num_accumulates,
                "in_num_updates": num_updates
            },
            outputs={
                "out_sum_1": sum_1,
                "out_sum_2": sum_2,
                "out_sum_3": sum_3,
                "out_num_accumulates": num_accumulates,
                "out_old_num_accumulates": old_num_accumulates,
                "out_num_updates": num_updates,
            },
            attrs={
                "average_window": self.average_window,
                "min_average_window": self.min_average_window,
                "max_average_window": self.max_average_window,
            })

    def apply(self, executor):
        """Apply average values to parameters of current model.
        """
        apply_program = Program()
        block = apply_program.global_block()
        with program_guard(main_program=apply_program):
            for param_grad in self.params_grads:
                self._add_average_apply_op(block, param_grad)
        executor.run(apply_program)

    def restore(self, executor):
        """Restore parameter values of current model.
        """
        restore_program = Program()
        block = restore_program.global_block()
        with program_guard(main_program=restore_program):
            for param_grad in self.params_grads:
                self._add_average_restore_op(block, param_grad)
        executor.run(restore_program)
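
Note on the accumulation scheme: the kernel of the 'average_accumulates' operator is not part of this diff, so the exact update rule is not visible above. Purely as a hedged illustration, one plausible per-step update for a single parameter, inferred from the op's inputs, outputs and attributes (an assumption, not taken from this commit), looks like this in plain Python:

# Hypothetical sketch, NOT the op's actual kernel: sum_1/sum_2 hold partial
# sums of recent parameter values, sum_3 holds the sum of the last closed
# window, and the window length is bounded by the three window attributes.
def average_accumulates_step(param_value, state, average_window_rate,
                             min_average_window, max_average_window):
    state['num_updates'] += 1
    state['num_accumulates'] += 1
    state['sum_1'] += param_value  # accumulate the current parameter value

    # Close the window once enough values have been accumulated; remember
    # the closed window's sum (sum_3) and its length (old_num_accumulates).
    window = min(max_average_window,
                 max(min_average_window,
                     int(average_window_rate * state['num_updates'])))
    if state['num_accumulates'] >= window:
        state['sum_3'] = state['sum_1'] + state['sum_2']
        state['sum_1'] = 0.0
        state['sum_2'] = 0.0  # the real op may fold sum_1 into sum_2
                              # periodically to limit rounding error
        state['old_num_accumulates'] = state['num_accumulates']
        state['num_accumulates'] = 0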
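
The arithmetic behind apply() and restore() follows directly from _add_average_apply_op and _add_average_restore_op above; a minimal NumPy sketch with illustrative values:

import numpy as np

# Accumulator state for one parameter (illustrative values only).
param = np.array([0.5, -1.2])
sum_1 = np.array([4.0, -9.0])
sum_2 = np.array([6.0, -15.0])
sum_3 = np.zeros(2)
num_accumulates, old_num_accumulates = 10, 10

# apply(): back up the current value into the gradient variable, then
# overwrite the parameter with the windowed average.
backup = param.copy()  # layers.assign(input=param, output=grad)
param = (sum_1 + sum_2 + sum_3) / float(num_accumulates + old_num_accumulates)

# restore(): copy the backup back into the parameter.
param = backup.copy()  # layers.assign(input=grad, output=param)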