From a29006d128f1a6466a2a78263eaa946c27133bad Mon Sep 17 00:00:00 2001 From: huangxu96 <46740794+huangxu96@users.noreply.github.com> Date: Mon, 21 Dec 2020 18:47:57 +0800 Subject: [PATCH] Optimizer trans momentum (#29597) * merge amp related function in Momentum from paddle.fluid.contrib.optimizer into paddle.optimizer. * Add unittest for 2.0 Momentum API. * fix some bugs in weight_decay. --- .../tests/test_multi_precision_fp16_train.py | 4 +- python/paddle/optimizer/momentum.py | 105 ++++++++++++++++-- 2 files changed, 100 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py index 812b817b924..3526a3d761c 100644 --- a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py +++ b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py @@ -122,11 +122,11 @@ def train(use_pure_fp16=True, use_nesterov=False): # Test program test_program = train_program.clone(for_test=True) - optimizer = fluid.contrib.optimizer.Momentum( + optimizer = paddle.optimizer.Momentum( learning_rate=0.001, momentum=0.9, use_nesterov=use_nesterov, - regularization=fluid.regularizer.L2Decay(1e-4), + weight_decay=fluid.regularizer.L2Decay(1e-4), multi_precision=use_pure_fp16, rescale_grad=1.0 / BATCH_SIZE) diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py index 5c6ce5fd590..b9d05eb8a72 100644 --- a/python/paddle/optimizer/momentum.py +++ b/python/paddle/optimizer/momentum.py @@ -17,8 +17,10 @@ from ..fluid import core from ..fluid import framework from ..fluid.framework import Variable, name_scope from ..fluid.layer_helper import LayerHelper +from ..fluid import unique_name +from ..fluid import layers import paddle.fluid as fluid - +from paddle.fluid.regularizer import L2DecayRegularizer __all__ = ["Momentum"] @@ -62,6 +64,9 @@ class Momentum(Optimizer): some derived class of ``GradientClipBase`` . There are three cliping strategies ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false. + rescale_grad (float, optional): Multiply the gradient with `rescale_grad` before updating. \ + Often choose to be ``1.0/batch_size``. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . @@ -92,20 +97,33 @@ class Momentum(Optimizer): use_nesterov=False, weight_decay=None, grad_clip=None, + multi_precision=False, + rescale_grad=1.0, name=None): if learning_rate is None: raise ValueError("learning_rate is not set") if momentum is None: raise ValueError("momentum is not set") + predicate = lambda regular: isinstance(regular, L2DecayRegularizer) + py_regular = None if predicate(weight_decay) else weight_decay super(Momentum, self).__init__( learning_rate=learning_rate, parameters=parameters, - weight_decay=weight_decay, + weight_decay=py_regular, grad_clip=grad_clip, name=name) self.type = "momentum" self._momentum = momentum self._use_nesterov = bool(use_nesterov) + self._regularization_method = "" + self._regularization_coeff = 0 + if (isinstance(weight_decay, L2DecayRegularizer)): + self._regularization_method = "l2_decay" + self._regularization_coeff = weight_decay._regularization_coeff + self._multi_precision = multi_precision + self._rescale_grad = rescale_grad + self._master_weights = {} + if framework.in_dygraph_mode(): self.helper = LayerHelper(self.__class__.__name__) for p in parameters: @@ -115,8 +133,62 @@ class Momentum(Optimizer): ).all_parameters() self.helper = LayerHelper(self.__class__.__name__) for p in all_parameters: + if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + master_p = self._create_master_weight(p) + self._add_accumulator(self._velocity_acc_str, master_p) + continue + if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision: + warnings.warn( + "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Consider using multi_precision=True option of the Momentum optimizer." + ) self._add_accumulator(self._velocity_acc_str, p) + def _create_master_weight(self, param): + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + "_fp32_master" + var_name = unique_name.generate(var_name) + var = layers.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32 + }) + self._master_weights[param.name] = var + return var + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + + Args: + name: name of the accumulator + param: parameter variable for which accumulator is to be fetched + + Returns: + accumulator variable for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + target_param = self._master_weights[ + param.name] if find_master else param + target_name = target_param.name + if (name not in self._accumulators or + target_name not in self._accumulators[name]): + raise Exception("Accumulator {} does not exist for parameter {}". + format(name, target_name)) + return self._accumulators[name][target_name] + def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) # create accumulator in init func, so no implementation here @@ -126,16 +198,30 @@ class Momentum(Optimizer): velocity_acc = self._get_accumulator(self._velocity_acc_str, param_and_grad[0]) + find_master = self._multi_precision and param_and_grad[ + 0].dtype == core.VarDesc.VarType.FP16 + master_weight = (self._master_weights[param_and_grad[0].name] + if find_master else None) lr = self._create_param_lr(param_and_grad) if framework.in_dygraph_mode(): - _, _ = core.ops.momentum(param_and_grad[0], param_and_grad[1], - velocity_acc, lr, param_and_grad[0], - velocity_acc, 'mu', self._momentum, - 'use_nesterov', self._use_nesterov) + _, _ = core.ops.momentum( + param_and_grad[0], param_and_grad[1], velocity_acc, lr, + param_and_grad[0], velocity_acc, 'mu', self._momentum, + 'use_nesterov', self._use_nesterov, 'regularization_method', + self._regularization_method, 'regularization_coeff', + self._regularization_coeff) return None - attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov} + attrs = { + "mu": self._momentum, + "use_nesterov": self._use_nesterov, + "regularization_method": self._regularization_method, + "regularization_coeff": self._regularization_coeff, + "multi_precision": find_master, + "rescale_grad": self._rescale_grad + } + inputs = { "Param": [param_and_grad[0]], "Grad": [param_and_grad[1]], @@ -147,6 +233,11 @@ class Momentum(Optimizer): "ParamOut": [param_and_grad[0]], "VelocityOut": [velocity_acc] } + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + # create the momentum optimize op momentum_op = block.append_op( type=self.type, -- GitLab