Unverified commit 4f4f0993, authored by Yibing Liu, committed by GitHub

Bias correction for exponential moving average (#17677)

* Bias correction for exponential moving average

test=develop, test=document_preview

* Fix docs

test=develop, test=document_preview
Parent: 962eed6f
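
For context, here is a minimal NumPy sketch, separate from the diff below, of the bias-corrected exponential moving average that this commit implements: the EMA starts at zero, each step blends in the current parameter value with weight (1 - decay), and the value applied at step t is divided by (1 - decay^t) to remove the bias introduced by the zero initialization. The helper name `ema_with_bias_correction` is purely illustrative and is not part of the Paddle API.

    import numpy as np

    def ema_with_bias_correction(values, decay=0.999):
        """Illustrative bias-corrected EMA over a sequence of parameter snapshots."""
        ema = np.zeros_like(values[0], dtype=np.float64)   # EMA_0 = 0
        for theta in values:
            ema = decay * ema + (1.0 - decay) * theta      # EMA_t
        t = len(values)
        corrected = ema / (1.0 - decay ** t)               # EMA_t / (1 - decay^t)
        return ema, corrected

    # With a constant parameter of 2.0 and only 10 steps, the raw EMA is still
    # heavily pulled toward its zero initialization, while the corrected EMA
    # already recovers the true value.
    snapshots = [np.full(3, 2.0) for _ in range(10)]
    raw, corrected = ema_with_bias_correction(snapshots, decay=0.999)
    print(raw)        # ~[0.02, 0.02, 0.02]
    print(corrected)  # [2.0, 2.0, 2.0]
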
@@ -523,7 +523,7 @@ paddle.fluid.optimizer.LambOptimizer.apply_optimize (ArgSpec(args=['self', 'loss
 paddle.fluid.optimizer.LambOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
 paddle.fluid.optimizer.LambOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.LambOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
-paddle.fluid.optimizer.ExponentialMovingAverage.__init__ (ArgSpec(args=['self', 'decay', 'zero_init', 'name'], varargs=None, keywords=None, defaults=(0.999, False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.ExponentialMovingAverage.__init__ (ArgSpec(args=['self', 'decay', 'thres_steps', 'name'], varargs=None, keywords=None, defaults=(0.999, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.ExponentialMovingAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '30f494752ac8921dc5835a63637f453a'))
 paddle.fluid.optimizer.ExponentialMovingAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '8c8a1791608b02a1ede53d6dd3a4fcec'))
 paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '08a5dd9f6f376ff3d55e0b1d92115cbd'))
...
@@ -14,6 +14,7 @@
 from __future__ import print_function
+import numpy as np
 from collections import defaultdict
 from functools import reduce
@@ -2175,19 +2176,41 @@ class ExponentialMovingAverage(object):
     ..  math::

-        \\text{EMA}_t = \\text{decay} * \\text{EMA}_{t-1} + (1 - \\text{decay}) * \\theta_t
+        \\text{EMA}_0 & = 0
+
+        \\text{EMA}_t & = \\text{decay} * \\text{EMA}_{t-1} + (1 - \\text{decay}) * \\theta_t

-    The average results will be saved in temporary variables which can be
-    applied to parameters of current model by calling `apply()` method. And
-    the `restore()` method is used to restore the parameters.
+    The average results will be saved in temporary variables which are created
+    and maintained by the object, and can be applied to the parameters of the
+    current model by calling the **apply()** method. The **restore()** method is
+    used to restore the parameters.
+
+    **Bias correction**. All EMAs are initialized to :math:`0` and hence will be
+    zero biased, which can be corrected by dividing by the factor
+    :math:`(1 - \\text{decay}^t)`, i.e., the actual EMAs applied to the parameters
+    when calling the **apply()** method are
+
+    ..  math::
+
+        \\widehat{\\text{EMA}}_t = \\frac{\\text{EMA}_t}{1 - \\text{decay}^t}
+
+    **Decay rate scheduling**. A decay rate very close to 1 makes the averages move
+    very slowly, so a better strategy is to use a relatively small decay rate at the
+    very beginning. The argument **thres_steps** allows users to pass a Variable to
+    schedule the decay rate; in this case the actual decay rate becomes
+
+    ..  math::
+
+        \\min(\\text{decay}, \\frac{1 + \\text{thres_steps}}{10 + \\text{thres_steps}})
+
+    Usually **thres_steps** can be the global training steps.

     Args:
-        decay (float|Variable): The exponential decay rate. Can be scheduled like
-            learning rate.
-        zero_init (bool): Whether using zero to initialize EMA Variable. If set to
-            `True`, :math:`\\text{EMA}_0 = 0.0` else :math:`\\text{EMA}_0 = \\theta_0`.
+        decay (float): The exponential decay rate, usually close to 1, such as
+            0.999, 0.9999, ... .
+        thres_steps (Variable|None): If not `None`, schedule the decay rate.
         name (str|None): An optional name prefix.
@@ -2204,25 +2227,35 @@ class ExponentialMovingAverage(object):
             optimizer = fluid.optimizer.Adam(learning_rate=0.001)
             optimizer.minimize(cost)

-            ema = fluid.optimizer.ExponentialMovingAverage(0.99)
+            global_steps = fluid.layers.learning_rate_scheduler._decay_step_counter()
+            ema = fluid.optimizer.ExponentialMovingAverage(0.999, thres_steps=global_steps)

             # pseudo code
             for pass_id in range(args.pass_num):
                 for data in train_reader():
                     exe.run(fluid.default_main_program()...)

+            # usage 1
             with ema.apply(exe):
                 for data in test_reader():
                     exe.run(inference_program...)
+
+            # usage 2
+            with ema.apply(exe, need_restore=False):
+                for data in test_reader():
+                    exe.run(inference_program...)
+            ...
+            ema.restore(exe)
     """

-    def __init__(self, decay=0.999, zero_init=False, name=None):
+    def __init__(self, decay=0.999, thres_steps=None, name=None):
         self._decay = decay
-        self._zero_init = zero_init
+        self._thres_steps = thres_steps
         self._name = name if name is not None else ''
+        self._decay_var = self._get_ema_decay()

         self.params_tmps = []
-        for param in framework.default_main_program().global_block(
-        ).all_parameters():
+        for param in default_main_program().global_block().all_parameters():
             if param.do_model_average != False:
                 tmp = param.block.create_var(
                     name=unique_name.generate(".".join(
@@ -2232,22 +2265,23 @@ class ExponentialMovingAverage(object):
                     stop_gradient=True)
                 self.params_tmps.append((param, tmp))

-        startup_block = default_startup_program().global_block()
         ema_vars = {}
         for param, tmp in self.params_tmps:
             with param.block.program._optimized_guard(
                 [param, tmp]), name_scope('moving_average'):
-                ema_vars[param.name] = self._append_ema_ops(startup_block,
-                                                            param)
+                ema_vars[param.name] = self._append_ema_ops(param)

         self.apply_program = Program()
         block = self.apply_program.global_block()
         with program_guard(main_program=self.apply_program):
+            decay_pow = self._get_decay_pow(block)
             for param, tmp in self.params_tmps:
                 param = block._clone_variable(param)
                 tmp = block._clone_variable(tmp)
                 ema = block._clone_variable(ema_vars[param.name])
                 layers.assign(input=param, output=tmp)
+                # bias correction
+                ema = ema / (1.0 - decay_pow)
                 layers.assign(input=ema, output=param)

         self.restore_program = Program()
@@ -2258,25 +2292,43 @@ class ExponentialMovingAverage(object):
                 param = block._clone_variable(param)
                 layers.assign(input=tmp, output=param)

-    def _append_ema_ops(self, startup_block, param):
+    def _get_ema_decay(self):
+        with default_main_program()._lr_schedule_guard():
+            decay_var = layers.tensor.create_global_var(
+                shape=[1],
+                value=self._decay,
+                dtype='float32',
+                persistable=True,
+                name="scheduled_ema_decay_rate")
+
+            if self._thres_steps is not None:
+                decay_t = (self._thres_steps + 1.0) / (self._thres_steps + 10.0)
+                with layers.control_flow.Switch() as switch:
+                    with switch.case(decay_t < self._decay):
+                        layers.tensor.assign(decay_t, decay_var)
+                    with switch.default():
+                        layers.tensor.assign(
+                            np.array(
+                                [self._decay], dtype=np.float32),
+                            decay_var)
+        return decay_var
+
+    def _get_decay_pow(self, block):
+        global_steps = layers.learning_rate_scheduler._decay_step_counter()
+        decay_var = block._clone_variable(self._decay_var)
+        decay_pow_acc = layers.elementwise_pow(decay_var, global_steps + 1)
+        return decay_pow_acc
+
+    def _append_ema_ops(self, param):
         param_ema = layers.create_global_var(
             name=unique_name.generate(self._name + param.name + '_ema'),
             shape=param.shape,
             value=0.0,
             dtype=param.dtype,
             persistable=True)
-        # t = 0
-        if self._zero_init is not True:
-            startup_p_ema = startup_block._clone_variable(param_ema)
-            startup_p = startup_block.var(param.name)
-            startup_block.append_op(
-                type="assign",
-                inputs={"X": startup_p},
-                outputs={"Out": startup_p_ema})
-        # t > 0
-        ema_t = param_ema * self._decay - param * (self._decay - 1)
-        layers.assign(input=ema_t, output=param_ema)
+        ema_t = param_ema * self._decay_var + param * (1 - self._decay_var)
+        layers.assign(input=ema_t, output=param_ema)
         return param_ema

     @signature_safe_contextmanager
...
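
The decay-rate schedule introduced by `_get_ema_decay` above can be checked numerically with plain Python. The sketch below is illustrative only (the helper `scheduled_decay` is not part of the Paddle API); it evaluates min(decay, (1 + thres_steps) / (10 + thres_steps)), the effective decay rate used when `thres_steps` is passed, and shows how the rate ramps up from a small value early in training and is eventually capped by the configured `decay`.

    def scheduled_decay(decay, thres_steps):
        """Effective decay rate: min(decay, (1 + thres_steps) / (10 + thres_steps))."""
        return min(decay, (1.0 + thres_steps) / (10.0 + thres_steps))

    # Early steps use a much smaller decay rate, so the EMA tracks the
    # parameters quickly; once the schedule exceeds `decay`, the configured
    # rate (here 0.999) takes over.
    for step in (0, 10, 100, 1000, 10000):
        print(step, scheduled_decay(0.999, step))
    # 0     -> 0.1
    # 10    -> 0.55
    # 100   -> ~0.918
    # 1000  -> ~0.991
    # 10000 -> 0.999 (capped by decay)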