diff --git a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
index 15373ee7bba5955e6ccd33906b7ef822669f25c1..b190a5d02efc4ce34a7062f1bf3e2ad1939c9399 100644
--- a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
+++ b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
@@ -97,7 +97,7 @@ def train(use_pure_fp16=True, use_nesterov=False, use_adam=False):
         test_program = train_program.clone(for_test=True)
 
         if use_adam:
-            optimizer = paddle.optimizer.Adam(
+            optimizer = paddle.optimizer.AdamW(
                 learning_rate=0.001,
                 epsilon=1e-8,
                 weight_decay=0.0,
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index cd3955d5f06d7846151527663bb96a90d20f0ddd..78c9fcb83fc249254903362b78301d5b5be288eb 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -14,6 +14,7 @@
 
 from .optimizer import Optimizer
 from .adam import Adam
+from ..fluid import core
 from ..fluid import framework
 from ..fluid.dygraph import base as imperative_base
 import paddle
@@ -182,8 +183,16 @@ class AdamW(Adam):
                     decay_coeff = 1.0 - learning_rate * self._coeff
                     self._lr_to_coeff[learning_rate] = decay_coeff
 
-                scaled_param = param * decay_coeff
-                paddle.fluid.layers.assign(input=scaled_param, output=param)
+                find_master = (self._multi_precision and
+                               param.dtype == core.VarDesc.VarType.FP16)
+                if find_master:
+                    master_weight = self._master_weights[param.name]
+                    scaled_param = master_weight * decay_coeff
+                    paddle.fluid.layers.assign(
+                        input=scaled_param, output=master_weight)
+                else:
+                    scaled_param = param * decay_coeff
+                    paddle.fluid.layers.assign(input=scaled_param, output=param)
 
     def _append_optimize_op(self, block, param_and_grad):
         self._append_decoupled_weight_decay(block, param_and_grad)