Unverified commit c4cd99f3, authored by WangXi, committed via GitHub

fix adamw apply gradient (#30130) (#30207)

Parent commit: 6d1fb79d
......@@ -29,10 +29,12 @@ class TestAdamWOp(unittest.TestCase):
parameters=linear.parameters(),
apply_decay_param_fun=lambda name: True,
weight_decay=0.01)
out = linear(a)
out.backward()
adam.step()
adam.clear_gradients()
for _ in range(2):
out = linear(a)
out.backward()
adam.step()
adam.clear_gradients()
def test_adamw_op_coverage(self):
paddle.disable_static()
......
......@@ -16,6 +16,7 @@ from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable
from ..fluid.dygraph import base as imperative_base
import paddle
......@@ -247,6 +248,7 @@ class Adam(Optimizer):
return adam_op
@imperative_base.no_grad
@framework.dygraph_only
def step(self):
"""
......
......@@ -129,6 +129,7 @@ class AdamW(Adam):
self._params_name = set()
self._apply_decay_param_fun = apply_decay_param_fun
self._coeff = coeff
self._lr_to_coeff = dict()
super(AdamW, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
......@@ -139,97 +140,48 @@ class AdamW(Adam):
name=name,
lazy_mode=lazy_mode)
# NOTE(review): scraped diff view — the next ~15 lines interleave the
# REMOVED `_scale_parameters` definition with the ADDED
# `_append_decoupled_weight_decay` definition (whose body appears later
# in the diff). The loop below down to `return scaled_params` belongs to
# the removed `_scale_parameters`.
def _scale_parameters(self, params_and_grads):
def _append_decoupled_weight_decay(self, block, param_and_grad):
"""
Adds weight decay ops.
scaled_parameter = parameter * coeff
Add decoupled weight decay op.
parameter = parameter - parameter * coeff * lr
Args:
params_and_grads: A list of (parameters, gradients) pairs,
block: block in which variable is to be created
param_and_grad: (parameters, gradients) pairs,
the parameters need to decay.
Raises:
Exception: The type of coeff and parameter is not consistent.
"""
# Build (param, grad, param * coeff * lr) triples for every parameter
# that has a gradient and is selected by `apply_decay_param_fun`.
scaled_params = []
for param, grad in params_and_grads:
# If no gradient then we don't need to do anything
if grad is None:
continue
# Caller-supplied predicate decides which parameters get weight decay.
if self._apply_decay_param_fun is not None \
and not self._apply_decay_param_fun(param.name):
continue
if isinstance(self._coeff, float):
# NOTE(review): `is not ... FP32` looks inverted — a float coeff
# should be compatible with an FP32 parameter — and a plain float
# has no `.dtype`, so the message below would raise AttributeError
# if the assert ever fired. Likely part of why this code was
# removed by this commit; TODO confirm against upstream history.
assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \
"the type of coeff(float) and parameter(%s) is not consistent."%(self._coeff.dtype)
else:
assert self._coeff.dtype == param.dtype, \
"the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype)
# The learning rate may be a plain float or a schedule callable.
if isinstance(self._learning_rate, float):
learning_rate = self._learning_rate
else:
learning_rate = self._learning_rate()
with param.block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
scaled_params.append(
(param, grad, param * self._coeff * learning_rate))
if param.name not in self._params_name:
self._params_name.add(param.name)
# NOTE(review): this rebinds only the local `param`; the triple
# already stored above is unaffected — the line appears to be
# dead code.
param = param * self._coeff
return scaled_params
@imperative_base.no_grad
def minimize(self,
             loss,
             startup_program=None,
             parameters=None,
             no_grad_set=None):
    """Add operations to minimize ``loss``, applying decoupled weight
    decay (param <- param - param * coeff * lr) before the Adam update.

    Args:
        loss: the loss variable to minimize.
        startup_program: program used for parameter initialization, or
            ``None`` for the default.
        parameters: the parameters to update; falls back to the
            optimizer's own parameter list when not given.
        no_grad_set: parameters excluded from gradient computation.

    Returns:
        A ``(optimize_ops, params_grads)`` tuple from the inner
        optimizer.
    """
    param_list = parameters if parameters else self._parameter_list

    params_grads = self.backward(
        loss=loss,
        startup_program=startup_program,
        parameters=param_list,
        no_grad_set=no_grad_set)

    # Decoupled weight decay: subtract the pre-computed scaled value
    # from each selected parameter before the Adam ops run.
    for param, grad, decayed_part in self._scale_parameters(params_grads):
        with param.block.program._optimized_guard(
                [param, grad]), framework.name_scope('weight decay'):
            shrunk = paddle.fluid.layers.elementwise_sub(
                x=param, y=decayed_part)
            paddle.fluid.layers.assign(input=shrunk, output=param)

    optimize_ops = self._apply_optimize(
        loss=loss,
        params_grads=params_grads,
        startup_program=startup_program)
    return optimize_ops, params_grads
@framework.dygraph_only
@imperative_base.no_grad
def step(self):
    """Execute one dygraph optimization step: collect the (param, grad)
    pairs of trainable parameters, apply decoupled weight decay, then
    run the regular Adam update."""
    self._dtype = None

    # Gather every trainable parameter that actually received a gradient.
    params_grads = []
    for param in self._parameter_list:
        if not param.trainable:
            continue
        if param._grad_ivar() is not None:
            params_grads.append((param, param._grad_ivar()))

    # Decoupled weight decay: param <- param - param * coeff * lr.
    for param, grad, decayed_part in self._scale_parameters(params_grads):
        with param.block.program._optimized_guard(
                [param, grad]), framework.name_scope('weight decay'):
            shrunk = paddle.fluid.layers.elementwise_sub(
                x=param, y=decayed_part)
            paddle.fluid.layers.assign(input=shrunk, output=param)

    self._apply_optimize(
        loss=None, startup_program=None, params_grads=params_grads)
# NOTE(review): scraped diff view — these lines are the body of the
# ADDED `_append_decoupled_weight_decay(self, block, param_and_grad)`
# method; its `def` line appears earlier in the diff, interleaved with
# the removed code. Net effect: param <- param * (1 - lr * coeff),
# i.e. decoupled weight decay applied in-place before the Adam op.
param, grad = param_and_grad
# Skip parameters that the user's predicate excludes from decay.
if self._apply_decay_param_fun is not None \
and not self._apply_decay_param_fun(param.name):
return
if isinstance(self._learning_rate, float):
learning_rate = self._learning_rate
else:
# NOTE. We add this function to the _append_optimize_op(),
# for we must make sure _create_param_lr() be called after
# optimizer._create_global_learning_rate().
learning_rate = self._create_param_lr(param_and_grad)
with block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
self._params_name.add(param.name)
# If it has been calculated, the result will be reused
decay_coeff = self._lr_to_coeff.get(learning_rate, None)
if decay_coeff is None:
# NOTE(review): the cache is keyed by the learning-rate object (or
# float). If a learning-rate variable's value changes between calls
# the cached `1 - lr * coeff` result could go stale — TODO confirm
# whether `_lr_to_coeff` is cleared when the lr is updated.
decay_coeff = 1.0 - learning_rate * self._coeff
self._lr_to_coeff[learning_rate] = decay_coeff
scaled_param = param * decay_coeff
paddle.fluid.layers.assign(input=scaled_param, output=param)
def _append_optimize_op(self, block, param_and_grad):
    """Emit the update ops for one parameter: first the decoupled
    weight-decay scaling, then the parent class's Adam op."""
    # The decay op is appended ahead of the Adam op, so the Adam update
    # starts from the already-decayed parameter value.
    self._append_decoupled_weight_decay(block, param_and_grad)
    adam_op = super(AdamW, self)._append_optimize_op(block, param_and_grad)
    return adam_op
def __str__(self):
    """Return a readable summary listing the names of the parameters
    that receive decoupled weight decay.

    Returns:
        str: ``"Weight Decay, params: <comma-separated names>"``.
    """
    # self._params_name is a set; sort it so the rendered string is
    # deterministic across runs instead of depending on set iteration
    # order.
    return " ".join(
        ["Weight Decay, params:", ",".join(sorted(self._params_name))])
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register