diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py
index f5399a3aaab5b90192d2121bc7d66cb755cc185b..e7033d845116afaa60c07ad6c4aabb866fca98b7 100644
--- a/python/paddle/fluid/tests/unittests/test_adamw_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py
@@ -29,10 +29,12 @@ class TestAdamWOp(unittest.TestCase):
             parameters=linear.parameters(),
             apply_decay_param_fun=lambda name: True,
             weight_decay=0.01)
-        out = linear(a)
-        out.backward()
-        adam.step()
-        adam.clear_gradients()
+
+        for _ in range(2):
+            out = linear(a)
+            out.backward()
+            adam.step()
+            adam.clear_gradients()
 
     def test_adamw_op_coverage(self):
         paddle.disable_static()
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index 910c9b185dbaab017ea900676c372af2e344d561..d364248c5491cfb39d224d010739d79bce0d5278 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -16,6 +16,7 @@ from .optimizer import Optimizer
 from ..fluid import core
 from ..fluid import framework
 from ..fluid.framework import Variable
+from ..fluid.dygraph import base as imperative_base
 
 import paddle
 
@@ -247,6 +248,7 @@ class Adam(Optimizer):
 
         return adam_op
 
+    @imperative_base.no_grad
     @framework.dygraph_only
     def step(self):
         """
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index 2c963e816abcd1fcab795772979888c3f5f6fe2d..050ac2f03183d219d4a890e8a8e6be90b11f3f76 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -129,6 +129,7 @@ class AdamW(Adam):
         self._params_name = set()
         self._apply_decay_param_fun = apply_decay_param_fun
         self._coeff = coeff
+        self._lr_to_coeff = dict()
         super(AdamW, self).__init__(
             learning_rate=learning_rate,
             parameters=parameters,
@@ -139,97 +140,48 @@ class AdamW(Adam):
             name=name,
             lazy_mode=lazy_mode)
 
-    def _scale_parameters(self, params_and_grads):
+    def _append_decoupled_weight_decay(self, block, param_and_grad):
         """
-        Adds weight decay ops.
-        scaled_parameter = parameter * coeff
+        Add decoupled weight decay op.
+            parameter = parameter - parameter * coeff * lr
 
         Args:
-            params_and_grads: A list of (parameters, gradients) pairs,
+            block: block in which variable is to be created
+            param_and_grad: (parameters, gradients) pairs,
                 the parameters need to decay.
         Raises:
             Exception: The type of coeff and parameter is not consistent.
         """
-
-        scaled_params = []
-        for param, grad in params_and_grads:
-            # If no gradient then we don't need to do anything
-            if grad is None:
-                continue
-            if self._apply_decay_param_fun is not None \
-                    and not self._apply_decay_param_fun(param.name):
-                continue
-
-            if isinstance(self._coeff, float):
-                assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \
-                    "the type of coeff(float) and parameter(%s) is not consistent."%(self._coeff.dtype)
-            else:
-                assert self._coeff.dtype == param.dtype, \
-                    "the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype)
-            if isinstance(self._learning_rate, float):
-                learning_rate = self._learning_rate
-            else:
-                learning_rate = self._learning_rate()
-            with param.block.program._optimized_guard(
-                [param, grad]), framework.name_scope('weight decay'):
-                scaled_params.append(
-                    (param, grad, param * self._coeff * learning_rate))
-                if param.name not in self._params_name:
-                    self._params_name.add(param.name)
-                param = param * self._coeff
-        return scaled_params
-
-    @imperative_base.no_grad
-    def minimize(self,
-                 loss,
-                 startup_program=None,
-                 parameters=None,
-                 no_grad_set=None):
-        parameters = parameters if parameters \
-            else self._parameter_list
-
-        params_grads = self.backward(
-            loss=loss,
-            startup_program=startup_program,
-            parameters=parameters,
-            no_grad_set=no_grad_set)
-        scaled_params = self._scale_parameters(params_grads)
-        for p_grad_sgrad in scaled_params:
-            param, grad, scaled_param = p_grad_sgrad
-            with param.block.program._optimized_guard(
-                [param, grad]), framework.name_scope('weight decay'):
-                updated_param = paddle.fluid.layers.elementwise_sub(
-                    x=param, y=scaled_param)
-                paddle.fluid.layers.assign(input=updated_param, output=param)
-
-        optimize_ops = self._apply_optimize(
-            loss=loss,
-            params_grads=params_grads,
-            startup_program=startup_program)
-        return optimize_ops, params_grads
-
-    @framework.dygraph_only
-    @imperative_base.no_grad
-    def step(self):
-        self._dtype = None
-        params_grads = []
-        for param in self._parameter_list:
-            if not param.trainable:
-                continue
-            if param._grad_ivar() is not None:
-                grad_var = param._grad_ivar()
-                params_grads.append((param, grad_var))
-
-        scaled_params = self._scale_parameters(params_grads)
-        for p_grad_sgrad in scaled_params:
-            param, grad, scaled_param = p_grad_sgrad
-            with param.block.program._optimized_guard(
-                [param, grad]), framework.name_scope('weight decay'):
-                updated_param = paddle.fluid.layers.elementwise_sub(
-                    x=param, y=scaled_param)
-                paddle.fluid.layers.assign(input=updated_param, output=param)
-        self._apply_optimize(
-            loss=None, startup_program=None, params_grads=params_grads)
+        param, grad = param_and_grad
+
+        if self._apply_decay_param_fun is not None \
+                and not self._apply_decay_param_fun(param.name):
+            return
+
+        if isinstance(self._learning_rate, float):
+            learning_rate = self._learning_rate
+        else:
+            # NOTE: this runs inside _append_optimize_op() because
+            # _create_param_lr() must be called after
+            # optimizer._create_global_learning_rate().
+            learning_rate = self._create_param_lr(param_and_grad)
+
+        with block.program._optimized_guard(
+                [param, grad]), framework.name_scope('weight decay'):
+            self._params_name.add(param.name)
+
+            # Reuse the decay coefficient if it was already computed for this lr
+            decay_coeff = self._lr_to_coeff.get(learning_rate, None)
+            if decay_coeff is None:
+                decay_coeff = 1.0 - learning_rate * self._coeff
+                self._lr_to_coeff[learning_rate] = decay_coeff
+
+            scaled_param = param * decay_coeff
+            paddle.fluid.layers.assign(input=scaled_param, output=param)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        self._append_decoupled_weight_decay(block, param_and_grad)
+        return super(AdamW, self)._append_optimize_op(block, param_and_grad)
 
     def __str__(self):
         return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
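
Note: the patch drops the explicit elementwise_sub/assign passes in minimize()/step() and instead folds decoupled weight decay into _append_optimize_op: each parameter is first scaled by (1 - lr * coeff), and the plain Adam update is then applied to the scaled value. Below is a minimal NumPy sketch of that update rule for illustration only; it is not Paddle's kernel, and the helper name adamw_step is hypothetical.

# Illustration of the decoupled AdamW update the patch implements
# (decay applied as a parameter rescale, then a standard Adam step).
import numpy as np

def adamw_step(param, grad, m, v, t, lr=0.001, coeff=0.01,
               beta1=0.9, beta2=0.999, eps=1e-8):
    # Decoupled weight decay: parameter = parameter - parameter * coeff * lr
    param = param * (1.0 - lr * coeff)
    # Standard Adam moment updates with bias correction
    m = beta1 * m + (1.0 - beta1) * grad
    v = beta2 * v + (1.0 - beta2) * grad * grad
    m_hat = m / (1.0 - beta1 ** t)
    v_hat = v / (1.0 - beta2 ** t)
    param = param - lr * m_hat / (np.sqrt(v_hat) + eps)
    return param, m, v

# Two consecutive steps, mirroring the loop added to the unit test.
p = np.ones(4, dtype=np.float32)
m = np.zeros_like(p)
v = np.zeros_like(p)
for t in (1, 2):
    g = 0.1 * np.ones_like(p)  # stand-in gradient
    p, m, v = adamw_step(p, g, m, v, t)

Because the decay coefficient depends only on lr and coeff, the new _lr_to_coeff cache lets the same (1 - lr * coeff) value be reused across parameters that share a learning rate.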