Unverified commit a29006d1, authored by huangxu96, committed by GitHub

Optimizer trans momentum (#29597)

* Merge AMP-related functionality of Momentum from paddle.fluid.contrib.optimizer into paddle.optimizer.

* Add a unit test for the 2.0 Momentum API.

* Fix some bugs in weight_decay.
Parent 0cc42e34
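For context, the sketch below shows how the merged 2.0-style API is meant to be called after this change. It is illustrative only and not part of the patch: the Linear model, the data shapes, and the training loop are placeholder assumptions; the argument names mirror those added by the diff below.

    # Illustrative only (not part of the diff): dygraph usage of the merged API.
    import numpy as np
    import paddle
    import paddle.fluid as fluid

    paddle.disable_static()
    model = paddle.nn.Linear(10, 1)          # placeholder network
    opt = paddle.optimizer.Momentum(
        learning_rate=0.001,
        momentum=0.9,
        parameters=model.parameters(),
        use_nesterov=False,
        weight_decay=fluid.regularizer.L2Decay(1e-4),  # L2 decay is folded into the momentum op
        multi_precision=False,   # True keeps FP32 master weights for FP16 parameters
        rescale_grad=1.0)        # often 1.0 / batch_size

    x = paddle.to_tensor(np.random.rand(4, 10).astype('float32'))
    loss = model(x).mean()
    loss.backward()
    opt.step()
    opt.clear_grad()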
@@ -122,11 +122,11 @@ def train(use_pure_fp16=True, use_nesterov=False):
     # Test program
     test_program = train_program.clone(for_test=True)
-    optimizer = fluid.contrib.optimizer.Momentum(
+    optimizer = paddle.optimizer.Momentum(
         learning_rate=0.001,
         momentum=0.9,
         use_nesterov=use_nesterov,
-        regularization=fluid.regularizer.L2Decay(1e-4),
+        weight_decay=fluid.regularizer.L2Decay(1e-4),
         multi_precision=use_pure_fp16,
         rescale_grad=1.0 / BATCH_SIZE)
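Conceptually, the patch routes an L2Decay weight decay and the rescale_grad factor into the fused momentum op instead of applying them as separate passes. A rough NumPy sketch of the resulting update rule is given below; this is standard momentum with L2 decay under the stated assumptions, and the actual kernel may differ in details.

    import numpy as np

    def momentum_step(param, grad, velocity, lr, mu=0.9,
                      regularization_coeff=1e-4, rescale_grad=1.0,
                      use_nesterov=False):
        # Sketch of the fused update when regularization_method == 'l2_decay'.
        g = grad * rescale_grad + regularization_coeff * param   # rescale, then decay
        velocity = mu * velocity + g                              # momentum accumulator
        if use_nesterov:
            param = param - lr * (g + mu * velocity)
        else:
            param = param - lr * velocity
        return param, velocity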
@@ -17,8 +17,10 @@ from ..fluid import core
 from ..fluid import framework
 from ..fluid.framework import Variable, name_scope
 from ..fluid.layer_helper import LayerHelper
+from ..fluid import unique_name
+from ..fluid import layers
 import paddle.fluid as fluid
+from paddle.fluid.regularizer import L2DecayRegularizer
 __all__ = ["Momentum"]
@@ -62,6 +64,9 @@ class Momentum(Optimizer):
             some derived class of ``GradientClipBase`` . There are three cliping strategies
             ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
             :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false.
+        rescale_grad (float, optional): Multiply the gradient with `rescale_grad` before updating. \
+            Often choose to be ``1.0/batch_size``.
         name (str, optional): The default value is None. Normally there is no need for user
             to set this property. For more information, please refer to
             :ref:`api_guide_Name` .
@@ -92,20 +97,33 @@ class Momentum(Optimizer):
                  use_nesterov=False,
                  weight_decay=None,
                  grad_clip=None,
+                 multi_precision=False,
+                 rescale_grad=1.0,
                  name=None):
         if learning_rate is None:
             raise ValueError("learning_rate is not set")
         if momentum is None:
             raise ValueError("momentum is not set")
+        predicate = lambda regular: isinstance(regular, L2DecayRegularizer)
+        py_regular = None if predicate(weight_decay) else weight_decay
         super(Momentum, self).__init__(
             learning_rate=learning_rate,
             parameters=parameters,
-            weight_decay=weight_decay,
+            weight_decay=py_regular,
             grad_clip=grad_clip,
             name=name)
         self.type = "momentum"
         self._momentum = momentum
         self._use_nesterov = bool(use_nesterov)
+        self._regularization_method = ""
+        self._regularization_coeff = 0
+        if (isinstance(weight_decay, L2DecayRegularizer)):
+            self._regularization_method = "l2_decay"
+            self._regularization_coeff = weight_decay._regularization_coeff
+        self._multi_precision = multi_precision
+        self._rescale_grad = rescale_grad
+        self._master_weights = {}
         if framework.in_dygraph_mode():
             self.helper = LayerHelper(self.__class__.__name__)
             for p in parameters:
@@ -115,8 +133,62 @@ class Momentum(Optimizer):
             ).all_parameters()
             self.helper = LayerHelper(self.__class__.__name__)
             for p in all_parameters:
+                if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
+                    master_p = self._create_master_weight(p)
+                    self._add_accumulator(self._velocity_acc_str, master_p)
+                    continue
+                if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision:
+                    warnings.warn(
+                        "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
+                        "Consider using multi_precision=True option of the Momentum optimizer."
+                    )
                 self._add_accumulator(self._velocity_acc_str, p)
 
+    def _create_master_weight(self, param):
+        assert isinstance(self.helper, LayerHelper)
+
+        var_name = param.name + "_fp32_master"
+        var_name = unique_name.generate(var_name)
+        var = layers.create_global_var(
+            name=var_name,
+            shape=param.shape,
+            value=0,
+            dtype='float32',
+            persistable=True)
+        block = self.helper.startup_program.global_block()
+        block.append_op(
+            type="cast",
+            inputs={"X": [param]},
+            outputs={"Out": [var]},
+            attrs={
+                "in_dtype": param.dtype,
+                "out_dtype": core.VarDesc.VarType.FP32
+            })
+        self._master_weights[param.name] = var
+        return var
+
+    def _get_accumulator(self, name, param):
+        """Utility function to fetch an accumulator for a parameter
+
+        Args:
+            name: name of the accumulator
+            param: parameter variable for which accumulator is to be fetched
+
+        Returns:
+            accumulator variable for the parameter
+        """
+        if self._name is not None:
+            name = self._name + "_" + name
+        find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
+        target_param = self._master_weights[
+            param.name] if find_master else param
+        target_name = target_param.name
+        if (name not in self._accumulators or
+                target_name not in self._accumulators[name]):
+            raise Exception("Accumulator {} does not exist for parameter {}".
+                            format(name, target_name))
+        return self._accumulators[name][target_name]
+
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
         # create accumulator in init func, so no implementation here
@@ -126,16 +198,30 @@ class Momentum(Optimizer):
         velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                              param_and_grad[0])
+        find_master = self._multi_precision and param_and_grad[
+            0].dtype == core.VarDesc.VarType.FP16
+        master_weight = (self._master_weights[param_and_grad[0].name]
+                         if find_master else None)
         lr = self._create_param_lr(param_and_grad)
         if framework.in_dygraph_mode():
-            _, _ = core.ops.momentum(param_and_grad[0], param_and_grad[1],
-                                     velocity_acc, lr, param_and_grad[0],
-                                     velocity_acc, 'mu', self._momentum,
-                                     'use_nesterov', self._use_nesterov)
+            _, _ = core.ops.momentum(
+                param_and_grad[0], param_and_grad[1], velocity_acc, lr,
+                param_and_grad[0], velocity_acc, 'mu', self._momentum,
+                'use_nesterov', self._use_nesterov, 'regularization_method',
+                self._regularization_method, 'regularization_coeff',
+                self._regularization_coeff)
             return None
-        attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov}
+        attrs = {
+            "mu": self._momentum,
+            "use_nesterov": self._use_nesterov,
+            "regularization_method": self._regularization_method,
+            "regularization_coeff": self._regularization_coeff,
+            "multi_precision": find_master,
+            "rescale_grad": self._rescale_grad
+        }
         inputs = {
             "Param": [param_and_grad[0]],
             "Grad": [param_and_grad[1]],
@@ -147,6 +233,11 @@ class Momentum(Optimizer):
             "ParamOut": [param_and_grad[0]],
             "VelocityOut": [velocity_acc]
         }
+
+        if find_master:
+            inputs["MasterParam"] = master_weight
+            outputs["MasterParamOut"] = master_weight
+
         # create the momentum optimize op
         momentum_op = block.append_op(
             type=self.type,
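The _create_master_weight path above implements the usual mixed-precision pattern: each FP16 parameter gets an FP32 master copy, the velocity accumulator and the parameter update are kept in FP32, and the result is cast back to FP16 for the next forward pass. Below is a framework-agnostic NumPy sketch of that idea; it is an assumption-laden illustration, not Paddle's actual kernels.

    import numpy as np

    def momentum_update_multi_precision(grad_fp16, master_fp32, velocity_fp32,
                                        lr=0.001, mu=0.9):
        # Keep the accumulator and the authoritative weights in FP32,
        # even though the model computes in FP16.
        g = grad_fp16.astype(np.float32)
        velocity_fp32 = mu * velocity_fp32 + g
        master_fp32 = master_fp32 - lr * velocity_fp32
        # Cast the updated master weight back to FP16 for the forward pass.
        return master_fp32.astype(np.float16), master_fp32, velocity_fp32

    # Usage: the master copy is initialized by casting the FP16 parameter up once,
    # mirroring the "cast" op appended in _create_master_weight.
    p = np.random.rand(4).astype(np.float16)
    master = p.astype(np.float32)
    vel = np.zeros_like(master)
    grad = np.random.rand(4).astype(np.float16)
    p, master, vel = momentum_update_multi_precision(grad, master, vel)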