Unverified · Commit 2bca3295 authored by Zhou Wei, committed by GitHub

move the initialize position of grad_clip to optimizer(__init__),and speed up clip (#23839)

Parent 21fc789e
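In short: `grad_clip` is now passed to the optimizer constructor instead of to `optimizer.minimize()`, and the clip-op role-var correction is sped up with a name lookup table. A minimal before/after sketch of the API change (the toy network, tensor names, and shapes below are illustrative, not part of this commit):

    import paddle.fluid as fluid

    # a tiny static-graph network just to obtain a loss Variable
    x = fluid.data(name="x", shape=[-1, 13], dtype="float32")
    y = fluid.data(name="y", shape=[-1, 1], dtype="float32")
    predict = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(input=predict, label=y))

    clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)

    # old style (removed by this commit): pass grad_clip to minimize
    #   sgd = fluid.optimizer.SGD(learning_rate=0.1)
    #   sgd.minimize(loss, grad_clip=clip)

    # new style: set grad_clip once when constructing the optimizer
    sgd = fluid.optimizer.SGD(learning_rate=0.1, grad_clip=clip)
    sgd.minimize(loss)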
......@@ -137,9 +137,6 @@ class GradientClipBase(object):
raise NotImplementedError
def __call__(self, params_grads):
assert len(
params_grads
) > 0, "The number of trainable parameters should be greater than 0."
if framework.in_dygraph_mode():
return self._dygraph_clip(params_grads)
else:
......@@ -147,7 +144,7 @@ class GradientClipBase(object):
if getattr(p, 'gradient_clip_attr', None) is not None:
warnings.warn(
"'set_gradient_clip' will be ineffective, because you have "
"pass 'grad_clip' into 'minimize'. So, 'set_gradient_clip' "
"set 'grad_clip' in 'optimizer'. So, 'set_gradient_clip' "
"is redundant and you can remove it.")
break
return self._static_clip(params_grads)
......@@ -170,7 +167,7 @@ class GradientClipByValue(GradientClipBase):
The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
is not None, then only part of gradients can be selected for gradient clipping.
Gradient clip will takes effect after being set in ``optimizer.minimize(grad_clip)`` , see the document ``optimizer``
Gradient clip will take effect after being set in ``optimizer`` , see the documentation of ``optimizer``
(for example: :ref:`api_fluid_optimizer_SGDOptimizer`).
Args:
......@@ -208,8 +205,8 @@ class GradientClipByValue(GradientClipBase):
# return Parameter.name=="fc_0.w_0"
# clip = fluid.clip.GradientClipByValue(min=-1, max=1, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1)
sgd_optimizer.minimize(loss, grad_clip=clip)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1, grad_clip=clip)
sgd_optimizer.minimize(loss)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
......@@ -242,8 +239,8 @@ class GradientClipByValue(GradientClipBase):
# clip = fluid.clip.GradientClipByValue(min=-1, max=1, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=0.1, parameter_list=linear.parameters())
sgd_optimizer.minimize(loss, grad_clip=clip)
learning_rate=0.1, parameter_list=linear.parameters(), grad_clip=clip)
sgd_optimizer.minimize(loss)
"""
def __init__(self, max, min=None, need_clip=None):
......@@ -272,6 +269,7 @@ class GradientClipByValue(GradientClipBase):
def _static_clip(self, params_grads):
params_and_grads = []
param_new_grad_name_dict = dict()
with framework.name_scope('gradient_clip'):
for p, g in params_grads:
if g is None:
......@@ -284,7 +282,8 @@ class GradientClipByValue(GradientClipBase):
with p.block.program._optimized_guard([p, g]):
new_grad = layers.clip(x=g, min=self.min, max=self.max)
params_and_grads.append((p, new_grad))
_correct_clip_op_role_var(params_and_grads)
param_new_grad_name_dict[p.name] = new_grad.name
_correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
return params_and_grads
def _process_context(self, context, param, grad):
......@@ -306,7 +305,7 @@ class GradientClipByNorm(GradientClipBase):
The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
is not None, then only part of gradients can be selected for gradient clipping.
Gradient clip will takes effect after being set in ``optimizer.minimize(grad_clip)`` , see the document ``optimizer``
Gradient clip will take effect after being set in ``optimizer`` , see the documentation of ``optimizer``
(for example: :ref:`api_fluid_optimizer_SGDOptimizer`).
The clipping formula is:
......@@ -359,8 +358,8 @@ class GradientClipByNorm(GradientClipBase):
# return Parameter.name=="fc_0.w_0"
# clip = fluid.clip.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1)
sgd_optimizer.minimize(loss, grad_clip=clip)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1, grad_clip=clip)
sgd_optimizer.minimize(loss)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
......@@ -394,8 +393,8 @@ class GradientClipByNorm(GradientClipBase):
# clip = fluid.clip.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=0.1, parameter_list=linear.parameters())
sgd_optimizer.minimize(loss, grad_clip=clip)
learning_rate=0.1, parameter_list=linear.parameters(), grad_clip=clip)
sgd_optimizer.minimize(loss)
"""
......@@ -422,6 +421,7 @@ class GradientClipByNorm(GradientClipBase):
def _static_clip(self, params_grads):
params_and_grads = []
with framework.name_scope('gradient_clip'):
param_new_grad_name_dict = dict()
for p, g in params_grads:
if g is None:
continue
......@@ -432,8 +432,9 @@ class GradientClipByNorm(GradientClipBase):
with p.block.program._optimized_guard([p, g]):
new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
param_new_grad_name_dict[p.name] = new_grad.name
params_and_grads.append((p, new_grad))
_correct_clip_op_role_var(params_and_grads)
_correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
return params_and_grads
def _process_context(self, context, param, grad):
......@@ -456,7 +457,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
is not None, then only part of gradients can be selected for gradient clipping.
Gradient clip will takes effect after being set in ``optimizer.minimize(grad_clip)`` , see the document ``optimizer``
Gradient clip will take effect after being set in ``optimizer`` , see the documentation of ``optimizer``
(for example: :ref:`api_fluid_optimizer_SGDOptimizer`).
The clipping formula is:
......@@ -505,8 +506,8 @@ class GradientClipByGlobalNorm(GradientClipBase):
# return Parameter.name=="fc_0.w_0"
# clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1)
sgd_optimizer.minimize(loss, grad_clip=clip)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1, grad_clip=clip)
sgd_optimizer.minimize(loss)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
......@@ -539,8 +540,8 @@ class GradientClipByGlobalNorm(GradientClipBase):
# clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=0.1, parameter_list=linear.parameters())
sgd_optimizer.minimize(loss, grad_clip=clip)
learning_rate=0.1, parameter_list=linear.parameters(), grad_clip=clip)
sgd_optimizer.minimize(loss)
"""
......@@ -628,6 +629,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
y=layers.elementwise_max(
x=max_global_norm, y=global_norm_var))
param_new_grad_name_dict = dict()
for p, g in params_grads:
if g is None:
continue
......@@ -638,9 +640,10 @@ class GradientClipByGlobalNorm(GradientClipBase):
with p.block.program._optimized_guard([p, g]):
new_grad = layers.elementwise_mul(x=g, y=scale_var)
param_new_grad_name_dict[p.name] = new_grad.name
params_and_grads.append((p, new_grad))
_correct_clip_op_role_var(params_and_grads)
_correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
return params_and_grads
def _process_context(self, context, param, grad):
......@@ -692,9 +695,10 @@ def set_gradient_clip(clip, param_list=None, program=None):
This API must be used after building network, and before ``minimize`` ,
and it may be removed in future releases, so it is not recommended.
It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient.
There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
:ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
which is a better way to clip gradients. There are three clipping strategies:
:ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` .
To specify parameters that require gradient clip.
......@@ -757,7 +761,7 @@ def set_gradient_clip(clip, param_list=None, program=None):
sgd = fluid.optimizer.SGD(learning_rate=1e-3)
sgd.minimize(loss)
# network 4: use 'set_gradient_clip' and 'minimize(grad_clip=clip)' together
# network 4: use 'set_gradient_clip' and 'optimizer(grad_clip=clip)' together
with fluid.program_guard(fluid.Program(), fluid.Program()):
loss = network()
clip1 = fluid.clip.GradientClipByValue(min=-1.0, max=1.0)
......@@ -765,8 +769,8 @@ def set_gradient_clip(clip, param_list=None, program=None):
# Set the gradient clipping strategy: clip1
fluid.clip.set_gradient_clip(clip1)
# Set the gradient clipping strategy: clip2
sgd = fluid.optimizer.SGD(learning_rate=1e-3)
sgd.minimize(loss, grad_clip=clip2)
sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2)
sgd.minimize(loss)
# 'set_gradient_clip' will not take effect when setting has a conflict,
# and the gradient clipping strategy will be 'clip2'
......@@ -774,10 +778,10 @@ def set_gradient_clip(clip, param_list=None, program=None):
"""
warnings.warn("Caution! 'set_gradient_clip' is not recommended "
"and may be deprecated in future! "
"We recommend a new strategy: clip gradient by "
"'optimizer.minimize(loss, grad_clip=clip)'. "
"We recommend a new strategy: set 'grad_clip' "
"when initializing the 'optimizer'. "
"This method can reduce the mistakes, please "
"see documention of 'optimzier.minimize'.")
"refer to documention of 'optimizer'.")
if not isinstance(clip, GradientClipBase):
raise TypeError(
......@@ -824,33 +828,40 @@ def append_gradient_clip_ops(param_grads):
clip_attr._process_context(context=context, param=p, grad=g)
res = []
param_new_grad_name_dict = dict()
for p, g in param_grads:
if g is None:
continue
with p.block.program._optimized_guard(
[p, g]), framework.name_scope('gradient_clip_@CLIP'):
param, new_grad = clip_attr._create_operators(param=p, grad=g)
param_new_grad_name_dict[param.name] = new_grad.name
res.append([param, new_grad])
_correct_clip_op_role_var(res)
_correct_clip_op_role_var(res, param_new_grad_name_dict)
return res
# change wrong mapping relation between param & grad in clip op
def _correct_clip_op_role_var(params_grads):
def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
block_id_list = []
if len(param_new_grad_name_dict) == 0:
return
for param, grad in params_grads:
if grad is None:
continue
block_id = param.block.idx
if block_id in block_id_list:
continue
block_id_list.append(block_id)
for op in param.block.program.global_block().ops:
if 'op_namescope' in op.all_attrs() and "gradient_clip" in op.attr(
"op_namescope"):
if op.attr('op_role_var'):
"op_namescope") and op.attr('op_role_var'):
param_name = op.attr('op_role_var')[0]
index = 0
for i in range(len(params_grads)):
if params_grads[i][0].name == param_name:
index = i
correct_p_g = [param_name, params_grads[index][1].name]
if param_name in param_new_grad_name_dict:
correct_p_g = [
param_name, param_new_grad_name_dict[param_name]
]
op._set_attr('op_role_var', correct_p_g)
......
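The speedup comes from `_correct_clip_op_role_var`: previously it rescanned `params_grads` for every matching clip op to recover the clipped gradient name, while each `_static_clip` now records `param.name -> new_grad.name` in `param_new_grad_name_dict` during clipping, so the fix-up is a single dict lookup per op. A standalone sketch of the lookup change (names are illustrative, not Paddle code):

    # mapping built while the clip ops are appended: param name -> clipped grad name
    params_grads = [("fc_0.w_0", "fc_0.w_0@GRAD"), ("fc_0.b_0", "fc_0.b_0@GRAD")]
    param_new_grad_name_dict = {p: g + "@CLIP" for p, g in params_grads}

    def lookup_old(param_name):
        # old behaviour: linear scan of params_grads for every matching clip op
        for p, g in params_grads:
            if p == param_name:
                return g + "@CLIP"

    def lookup_new(param_name):
        # new behaviour: one dict lookup per op
        return param_new_grad_name_dict.get(param_name)

    assert lookup_old("fc_0.b_0") == lookup_new("fc_0.b_0")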
......@@ -24,7 +24,7 @@ from . import framework
from . import layers
from . import unique_name
from .backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
from .clip import GradientClipBase, error_clip_callback, append_gradient_clip_ops
from .clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops
from .framework import program_guard
from .initializer import Constant
from .layer_helper import LayerHelper
......@@ -64,6 +64,7 @@ class Optimizer(object):
learning_rate,
parameter_list=None,
regularization=None,
grad_clip=None,
name=None):
self._parameter_list = parameter_list
if framework.in_dygraph_mode():
......@@ -88,7 +89,13 @@ class Optimizer(object):
type(learning_rate))
self._name = name
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
raise TypeError(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
self.regularization = regularization
self._grad_clip = grad_clip
self._learning_rate = learning_rate
# the learning rate type should be inferenced from loss
self._dtype = None
......@@ -107,8 +114,6 @@ class Optimizer(object):
self._opti_name_list = []
self._accumulators_holder = {}
self._param_device_map = dict()
# if pass grad_clip into minimize, it will not be None
self._grad_clip = None
@framework.dygraph_only
def state_dict(self):
......@@ -787,8 +792,7 @@ class Optimizer(object):
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None,
grad_clip=None):
no_grad_set=None):
"""
Add operations to minimize ``loss`` by updating ``parameter_list``.
......@@ -802,11 +806,6 @@ class Optimizer(object):
will be updated.
no_grad_set (set, optional): Set of ``Variable`` or ``Variable.name`` that don't need
to be updated. The default value is None.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no
gradient clipping.
Returns:
tuple: tuple (optimize_ops, params_grads), A list of operators appended
......@@ -820,12 +819,7 @@ class Optimizer(object):
Please refer to the example of current Optimizer.
"""
assert isinstance(loss, Variable), "The loss should be an Variable."
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
raise TypeError(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
self._grad_clip = grad_clip
parameter_list = parameter_list if parameter_list \
else self._parameter_list
params_grads = self.backward(
......@@ -859,6 +853,10 @@ class SGDOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information. \
For details, please refer to :ref:`api_guide_Name`. Default is None.
......@@ -896,12 +894,14 @@ class SGDOptimizer(Optimizer):
learning_rate,
parameter_list=None,
regularization=None,
grad_clip=None,
name=None):
assert learning_rate is not None
super(SGDOptimizer, self).__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
name=name)
self.type = "sgd"
......@@ -962,6 +962,10 @@ class MomentumOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information. \
For details, please refer to :ref:`api_guide_Name`. Default is None.
......@@ -1002,6 +1006,7 @@ class MomentumOptimizer(Optimizer):
parameter_list=None,
use_nesterov=False,
regularization=None,
grad_clip=None,
name=None):
assert learning_rate is not None
assert momentum is not None
......@@ -1009,6 +1014,7 @@ class MomentumOptimizer(Optimizer):
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
name=name)
self.type = "momentum"
self._momentum = momentum
......@@ -1097,13 +1103,14 @@ class DGCMomentumOptimizer(Optimizer):
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
use_nesterov (bool): Enables Nesterov momentum. True means use Nesterov. Default is False.
local_grad_clip_norm (float, optional): Local gradient clip norm value. Optional, default is None, represent no need clip.
num_trainers (int, optional): The number of training nodes. Optional, default is None.
regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
:ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipByNorm, optional): Gradient clipping strategy. ``DGCMomentumOptimizer`` only supports
:ref:`api_fluid_clip_GradientClipByNorm` ; any other type will raise a TypeError. Default None,
meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information. \
For details, please refer to :ref:`api_guide_Name`. Default is None.
......@@ -1130,9 +1137,9 @@ class DGCMomentumOptimizer(Optimizer):
sparsity=[0.999],
parameter_list=None,
use_nesterov=False,
local_grad_clip_norm=None,
num_trainers=None,
regularization=None,
grad_clip=None,
name=None):
if framework.in_dygraph_mode():
raise Exception("In dygraph, don't support DGCMomentumOptimizer.")
......@@ -1146,6 +1153,7 @@ class DGCMomentumOptimizer(Optimizer):
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
name=name)
self.type = "dgc_momentum"
self._momentum = momentum
......@@ -1159,20 +1167,23 @@ class DGCMomentumOptimizer(Optimizer):
self._rampup_begin_step_var = None
self._global_step_var = None
self._local_grad_clip_norm = None
self._clip_norm = None
if local_grad_clip_norm is not None:
assert isinstance(num_trainers, int)
assert isinstance(local_grad_clip_norm, float)
assert num_trainers > 0
self._dgc_clip_norm = None
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipByNorm):
raise TypeError(
"The type of grad_clip should be 'GradientClipByNorm', because DGCMomentumOptimizer only support GradientClipByNorm"
)
assert isinstance(
num_trainers, int
), "The type of num_trainers should be 'int', but received %s" % type(
num_trainers)
assert num_trainers > 0, "The value of num_trainers should be greater than 0!"
self._local_grad_clip_norm = local_grad_clip_norm
self._num_trainers = num_trainers
self._clip_norm = local_grad_clip_norm * (num_trainers**-0.5)
self._dgc_clip_norm = grad_clip.clip_norm * (num_trainers**-0.5)
self.regular_type, self.regular_coeff = self._get_regularization_param(
self.regularization)
self._grad_clip = None
def _get_regularization_param(self, regularization):
regular_type = 0
......@@ -1342,8 +1353,8 @@ class DGCMomentumOptimizer(Optimizer):
op._remove_attr(op_maker.kOpRoleVarAttrName())
clip_var = grad_var
if self._local_grad_clip_norm is not None:
clip_var = self._append_clip_norm(grad_var, self._clip_norm)
if self._dgc_clip_norm is not None:
clip_var = self._append_clip_norm(grad_var, self._dgc_clip_norm)
self._dgc_op(param_var, clip_var, grad_var, u_var, v_var, k_var,
encoded_var, gather_var)
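For ``DGCMomentumOptimizer``, the ad-hoc ``local_grad_clip_norm``/``num_trainers`` pair is folded into the common ``grad_clip`` argument, which must be a ``GradientClipByNorm``; the per-trainer norm is then ``grad_clip.clip_norm * num_trainers ** -0.5`` (e.g. ``clip_norm=1.0`` with two trainers gives an effective local norm of about 0.707). A hedged construction sketch mirroring the updated unit test below (static graph only, since DGC is not supported in dygraph mode):

    import paddle.fluid as fluid

    opt = fluid.optimizer.DGCMomentumOptimizer(
        learning_rate=0.01,
        momentum=0.2,
        rampup_begin_step=0,
        num_trainers=2,
        grad_clip=fluid.clip.GradientClipByNorm(clip_norm=1.0))
    # passing any other clip type, e.g. GradientClipByGlobalNorm, raises TypeError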
......@@ -1494,6 +1505,10 @@ class LarsMomentumOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information. \
For details, please refer to :ref:`api_guide_Name`. Default is None.
......@@ -1526,6 +1541,7 @@ class LarsMomentumOptimizer(Optimizer):
lars_weight_decay=0.0005,
parameter_list=None,
regularization=None,
grad_clip=None,
name=None):
assert learning_rate is not None
assert momentum is not None
......@@ -1533,6 +1549,7 @@ class LarsMomentumOptimizer(Optimizer):
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
name=name)
self.type = "lars_momentum"
self._momentum = momentum
......@@ -1607,6 +1624,10 @@ class AdagradOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
......@@ -1639,6 +1660,7 @@ class AdagradOptimizer(Optimizer):
epsilon=1.0e-6,
parameter_list=None,
regularization=None,
grad_clip=None,
name=None,
initial_accumulator_value=0.0):
assert learning_rate is not None
......@@ -1647,6 +1669,7 @@ class AdagradOptimizer(Optimizer):
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
name=name)
self.type = "adagrad"
self._epsilon = epsilon
......@@ -1726,6 +1749,10 @@ class AdamOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
......@@ -1835,6 +1862,7 @@ class AdamOptimizer(Optimizer):
epsilon=1e-8,
parameter_list=None,
regularization=None,
grad_clip=None,
name=None,
lazy_mode=False):
assert learning_rate is not None
......@@ -1845,6 +1873,7 @@ class AdamOptimizer(Optimizer):
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
name=name)
self.type = "adam"
self._beta1 = beta1
......@@ -1986,6 +2015,10 @@ class AdamaxOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
......@@ -2031,6 +2064,7 @@ class AdamaxOptimizer(Optimizer):
epsilon=1e-8,
parameter_list=None,
regularization=None,
grad_clip=None,
name=None):
assert learning_rate is not None
assert beta1 is not None
......@@ -2040,6 +2074,7 @@ class AdamaxOptimizer(Optimizer):
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
name=name)
self.type = "adamax"
self._beta1 = beta1
......@@ -2238,6 +2273,10 @@ class DecayedAdagradOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
......@@ -2264,6 +2303,7 @@ class DecayedAdagradOptimizer(Optimizer):
epsilon=1.0e-6,
parameter_list=None,
regularization=None,
grad_clip=None,
name=None):
assert learning_rate is not None
assert decay is not None
......@@ -2273,6 +2313,7 @@ class DecayedAdagradOptimizer(Optimizer):
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
name=name)
self.type = "decayed_adagrad"
self._decay = decay
......@@ -2337,6 +2378,10 @@ class AdadeltaOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): The default value is None. Normally there is no need for user
to set this property. For more information, please refer to
:ref:`api_guide_Name` .
......@@ -2367,6 +2412,7 @@ class AdadeltaOptimizer(Optimizer):
rho=0.95,
parameter_list=None,
regularization=None,
grad_clip=None,
name=None):
if learning_rate is None:
raise ValueError("learning_rate is not set.")
......@@ -2378,6 +2424,7 @@ class AdadeltaOptimizer(Optimizer):
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
name=name)
self.type = "adadelta"
self._epsilon = epsilon
......@@ -2488,6 +2535,10 @@ class RMSPropOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information. \
For details, please refer to :ref:`api_guide_Name`. Default is None.
......@@ -2536,11 +2587,13 @@ class RMSPropOptimizer(Optimizer):
centered=False,
parameter_list=None,
regularization=None,
grad_clip=None,
name=None):
super(RMSPropOptimizer, self).__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
name=name)
if learning_rate is None:
raise ValueError("learning_rate is not set.")
......@@ -2656,6 +2709,10 @@ class FtrlOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information. \
For details, please refer to :ref:`api_guide_Name`. Default is None.
......@@ -2704,11 +2761,13 @@ class FtrlOptimizer(Optimizer):
lr_power=-0.5,
parameter_list=None,
regularization=None,
grad_clip=None,
name=None):
super(FtrlOptimizer, self).__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
name=name)
if learning_rate is None:
raise ValueError("learning_rate is not set.")
......@@ -2798,6 +2857,10 @@ class LambOptimizer(AdamOptimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
exclude_from_weight_decay_fn (function|None): Exclude a parameter from weight
decay when **exclude_from_weight_decay_fn(parameter)** returns true.
Default None.
......@@ -2834,6 +2897,7 @@ class LambOptimizer(AdamOptimizer):
epsilon=1e-6,
parameter_list=None,
regularization=None,
grad_clip=None,
exclude_from_weight_decay_fn=None,
name=None):
assert learning_rate is not None
......@@ -2845,6 +2909,7 @@ class LambOptimizer(AdamOptimizer):
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
beta1=beta1,
beta2=beta2,
epsilon=epsilon,
......@@ -4043,20 +4108,13 @@ class RecomputeOptimizer(Optimizer):
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None,
grad_clip=None):
no_grad_set=None):
assert isinstance(loss, Variable), "The loss should be an Variable."
assert (self._checkpoints is not None
), "You should call _set_checkpoints first"
if framework.in_dygraph_mode():
raise NotImplementedError(
"DyGraph current does not support recompute")
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
raise TypeError(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
self._optimizer._grad_clip = grad_clip
params_grads = self.backward(
loss,
startup_program=startup_program,
......
......@@ -34,7 +34,7 @@ class ParamAttr(object):
Note:
``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient.
It is recommended to set ``grad_clip`` in ``optimizer`` to clip gradients.
There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
:ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
......
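Concretely for ``ParamAttr``: declare the parameter attribute without ``gradient_clip`` and attach the clip to the optimizer instead. A minimal sketch (layer, names, and shapes are illustrative):

    import paddle.fluid as fluid

    # ParamAttr no longer carries gradient_clip; it only configures the parameter itself
    w_attr = fluid.ParamAttr(name="fc_w", learning_rate=1.0)
    x = fluid.data(name="img", shape=[-1, 784], dtype="float32")
    hidden = fluid.layers.fc(input=x, size=32, param_attr=w_attr)

    # clipping is attached to the optimizer instead
    clip = fluid.clip.GradientClipByValue(min=-1.0, max=1.0)
    sgd = fluid.optimizer.SGD(learning_rate=0.1, grad_clip=clip)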
......@@ -19,6 +19,7 @@ import unittest
import paddle.fluid.framework as framework
import paddle.fluid.optimizer as optimizer
import paddle.fluid.regularizer as regularizer
import paddle.fluid.clip as clip
import paddle.compat as cpt
from paddle.fluid.backward import append_backward
from paddle.fluid.transpiler.details import program_to_code
......@@ -70,9 +71,9 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
learning_rate=learning_rate,
momentum=0.2,
rampup_begin_step=0,
local_grad_clip_norm=1.0,
num_trainers=2,
regularization=regularization)
regularization=regularization,
grad_clip=clip.GradientClipByNorm(1.0))
if use_recompute:
dgc_momentum_optimizer = optimizer.RecomputeOptimizer(
......@@ -124,6 +125,16 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
#with open("test_dgc_optimizer_" + name + str(use_recompute) + ".log", "w") as f:
# program_to_code(program, fout=f)
def test_typeError(self):
# the type of DGCMomentumOptimizer(grad_clip=) must be 'GradientClipByNorm'
with self.assertRaises(TypeError):
dgc_momentum_optimizer = self.MockDGCMomentum(
learning_rate=0.01,
momentum=0.2,
rampup_begin_step=0,
num_trainers=2,
grad_clip=clip.GradientClipByGlobalNorm(1.0))
def test_momentum_without_dgc(self):
self.check_dgc_momentum_optimizer(
regularization=regularizer.L1Decay(1e-4))
......
......@@ -76,8 +76,8 @@ class TestGradientClip(unittest.TestCase):
startup_program = fluid.Program()
with fluid.program_guard(
main_program=prog, startup_program=startup_program):
image = fluid.data(name='x', shape=[-1, 784], dtype='float32')
label = fluid.data(name='y', shape=[-1, 1], dtype='int64')
image = fluid.data(name="a", shape=[-1, 784], dtype='float32')
label = fluid.data(name="b", shape=[-1, 1], dtype='int64')
hidden = fluid.layers.fc(input=image, size=32, act='relu')
predict = fluid.layers.fc(input=hidden, size=10, act='softmax')
......@@ -112,13 +112,13 @@ class TestGradientClip(unittest.TestCase):
self.check_clip_result(out, out_clip)
def check_sparse_gradient_clip(self, place):
prog = fluid.framework.Program()
startup_program = fluid.framework.Program()
prog = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(
main_program=prog, startup_program=startup_program):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
data = fluid.data(
name="words", shape=[-1, 1], dtype="int64", lod_level=1)
label = fluid.data(name="label", shape=[-1, 1], dtype="int64")
cost = bow_net(data, label, self.word_dict_len)
self.backward_and_optimize(cost)
......@@ -172,7 +172,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
self.clip_gradient = func
self.check_gradient_clip(fluid.CPUPlace())
# test whether the ouput is right when use 'minimize(grad_clip)'
# test whether the output is right when using grad_clip
def test_new_gradient_clip(self):
def func(params_grads):
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
......@@ -192,9 +192,10 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
clip = fluid.clip.GradientClipByGlobalNorm(
clip_norm=5.0, need_clip=fileter_func)
fluid.clip.set_gradient_clip(clip)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
# if 'set_gradient_clip' and 'minimize(grad_clip)' together, 'set_gradient_clip' will be ineffective
sgd_optimizer.minimize(cost, grad_clip=clip)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01,
grad_clip=clip)
# if 'set_gradient_clip' and 'optimizer(grad_clip=clip)' are used together, 'set_gradient_clip' will be ineffective
sgd_optimizer.minimize(cost)
# 'set_gradient_clip' must before 'minimize', otherwise, 'set_gradient_clip' will be ineffective
fluid.clip.set_gradient_clip(clip)
......@@ -232,24 +233,10 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
clip = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm, need_clip="test")
# the type of minimize(grad_clip=) must be an instance of GradientClipBase's derived class
with self.assertRaises(TypeError):
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
loss = fluid.layers.reduce_mean(x)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
sgd_optimizer.minimize(loss, grad_clip="test")
# the type of RecomputeOptimizer.minimize(grad_clip=) must be an instance of GradientClipBase's derived class
# the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class
with self.assertRaises(TypeError):
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
loss = fluid.layers.reduce_mean(x)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
recompute_optimizer = fluid.optimizer.RecomputeOptimizer(
sgd_optimizer)
recompute_optimizer._set_checkpoints([x])
recompute_optimizer.minimize(loss, grad_clip="test")
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1,
grad_clip="test")
class TestGradientClipByNorm(TestGradientClip):
......@@ -271,7 +258,7 @@ class TestGradientClipByNorm(TestGradientClip):
a=u, b=v, rtol=1e-5, atol=1e-8),
"gradient clip by norm has wrong results!")
# test whether the ouput is right when use 'minimize(grad_clip)'
# test whether the output is right when using grad_clip
def test_gradient_clip(self):
self.check_gradient_clip(fluid.CPUPlace())
......@@ -319,7 +306,7 @@ class TestGradientClipByValue(TestGradientClip):
a=u, b=v, rtol=1e-6, atol=1e-8),
"gradient clip by value has wrong results!")
# test whether the ouput is right when use 'minimize(grad_clip)'
# test whether the output is right when using grad_clip
def test_gradient_clip(self):
self.check_gradient_clip(fluid.CPUPlace())
......@@ -357,7 +344,9 @@ class TestDygraphGradientClip(unittest.TestCase):
loss = fluid.layers.reduce_mean(out)
loss.backward()
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=0.0, parameter_list=linear.parameters())
learning_rate=0.0,
parameter_list=linear.parameters(),
grad_clip=fluid.clip.GradientClipByGlobalNorm(0.1))
self.check_clip_result(loss, sgd_optimizer)
def check_clip_result(self, loss, optimizer):
......@@ -384,7 +373,7 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
np.array([3, 4]).astype("float32"), name="y")
assert len(self.clip1([(x, x), (x, y), (x, None)])) == 2
# get params and grads from network
opt, params_grads = optimizer.minimize(loss, grad_clip=self.clip2)
opt, params_grads = optimizer.minimize(loss)
_, grads = zip(*params_grads)
params_grads = self.clip2(params_grads)
_, grads_clip = zip(*params_grads)
......@@ -426,7 +415,7 @@ class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
assert len(self.clip([(x, None)])) == 0
# get params and grads from network
self.clip([(fluid.dygraph.to_variable(np.array([2, 3])), None)])
params_grads = optimizer.backward(loss)
opt, params_grads = optimizer.minimize(loss)
_, grads = zip(*params_grads)
params_grads = self.clip(params_grads)
_, grads_clip = zip(*params_grads)
......@@ -460,7 +449,7 @@ class TestDygraphGradientClipByValue(TestDygraphGradientClip):
x = fluid.dygraph.to_variable(np.array([2, 3]).astype("float32"))
assert len(self.clip([(x, None)])) == 0
# get params and grads from network
params_grads = optimizer.backward(loss)
opt, params_grads = optimizer.minimize(loss)
_, grads = zip(*params_grads)
params_grads = self.clip(params_grads)
_, grads_clip = zip(*params_grads)
......
......@@ -329,9 +329,9 @@ class TestImperativeAutoPrune(unittest.TestCase):
place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
model = MyLayer(size, vocab_size, size)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters())
grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters(), grad_clip=grad_clip)
indices = fluid.dygraph.to_variable(indices)
embed = fluid.dygraph.to_variable(embed)
......@@ -339,7 +339,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
loss = model.embed_linear0(indices)
loss.backward()
_, params_grads = optimizer.minimize(loss, grad_clip=grad_clip)
_, params_grads = optimizer.minimize(loss)
for items in params_grads:
assert items[0].name is not model.embed1.weight.name
assert items[0].name is not model.linear_1.weight.name
......@@ -348,9 +348,9 @@ class TestImperativeAutoPrune(unittest.TestCase):
with fluid.dygraph.guard(place):
model = MyLayer2(size, vocab_size, size)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters())
grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters(), grad_clip=grad_clip)
indices = fluid.dygraph.to_variable(indices)
emebd = fluid.dygraph.to_variable(embed)
......@@ -358,7 +358,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
loss = model.embed_linear0(indices)
loss.backward()
optimizer.minimize(loss, grad_clip=grad_clip)
optimizer.minimize(loss)
for items in params_grads:
assert items[0].name is not model.embed1.weight.name
assert items[0].name is not model.linear_1.weight.name
......
......@@ -58,14 +58,15 @@ class TestSimpleNet(unittest.TestCase):
simplenet = SimpleNet(20, 32, dtype)
adam = SGDOptimizer(
learning_rate=0.001,
parameter_list=simplenet.parameters())
parameter_list=simplenet.parameters(
)) # grad_clip=grad_clip
input_emb, emb = simplenet(input)
self.assertTrue(emb.weight.gradient() is None)
self.assertTrue(input_emb.gradient() is None)
input_emb.backward(backward_strategy)
adam.minimize(input_emb) # grad_clip=grad_clip
adam.minimize(input_emb)
self.assertTrue(emb.weight.gradient() is not None)
emb.clear_gradients()
......@@ -92,14 +93,15 @@ class TestSimpleNet(unittest.TestCase):
simplenet = SimpleNet(20, 32, "float32")
adam = SGDOptimizer(
learning_rate=0.001,
parameter_list=simplenet.parameters())
parameter_list=simplenet.parameters(),
grad_clip=grad_clip)
input_emb, emb = simplenet(input)
self.assertTrue(emb.weight.gradient() is None)
self.assertTrue(input_emb.gradient() is None)
input_emb.backward(backward_strategy)
adam.minimize(input_emb, grad_clip=grad_clip)
adam.minimize(input_emb)
self.assertTrue(emb.weight.gradient() is not None)
emb.clear_gradients()
......