Unverified commit 994438b1, authored by Qi Li, committed by GitHub

change clip grad api, test=develop (#27767)

Parent 365c2c9c
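For quick orientation: this commit renames the public gradient-clip classes (GradientClipByValue / GradientClipByNorm / GradientClipByGlobalNorm become ClipGradByValue / ClipGradByNorm / ClipGradByGlobalNorm) and replaces the old need_clip callback with a boolean need_clip flag on ParamAttr. A minimal migration sketch under the new API, assembled from the docstring examples in this diff (layer sizes and learning rate are illustrative):

    import paddle

    # Exclude the bias from clipping via ParamAttr(need_clip=False)
    # instead of passing a filter function to the clip object.
    linear = paddle.nn.Linear(in_features=10, out_features=10,
                              weight_attr=paddle.ParamAttr(need_clip=True),
                              bias_attr=paddle.ParamAttr(need_clip=False))

    x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
    loss = paddle.mean(linear(x))
    loss.backward()

    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)  # new class name
    sgd = paddle.optimizer.SGD(learning_rate=0.1,
                               parameters=linear.parameters(),
                               grad_clip=clip)
    sgd.step()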
...@@ -26,8 +26,8 @@ from . import name_scope ...@@ -26,8 +26,8 @@ from . import name_scope
from .dygraph import base as imperative_base from .dygraph import base as imperative_base
__all__ = [ __all__ = [
'set_gradient_clip', 'ErrorClipByValue', 'GradientClipByValue', 'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue',
'GradientClipByNorm', 'GradientClipByGlobalNorm' 'ClipGradByNorm', 'ClipGradByGlobalNorm'
] ]
...@@ -115,16 +115,9 @@ def error_clip_callback(block, context): ...@@ -115,16 +115,9 @@ def error_clip_callback(block, context):
error_clip._append_clip_op(block, grad_n) error_clip._append_clip_op(block, grad_n)
class GradientClipBase(object): class ClipGradBase(object):
def __init__(self, need_clip=None): def __init__(self):
if need_clip is not None and not callable(need_clip): super(ClipGradBase, self).__init__()
raise TypeError(
"The type of need_clip must be funciton, and it can filter out "
"parameter that does't need gradient clip. This function must return "
"True or False, and True means that clipping is required. Please refer to "
"API documention of GradientClipByGlobalNorm / GradientClipByNorm "
"/GradientClipByValue.")
self._need_clip_func = need_clip
def __str__(self): def __str__(self):
raise NotImplementedError() raise NotImplementedError()
...@@ -144,7 +137,7 @@ class GradientClipBase(object): ...@@ -144,7 +137,7 @@ class GradientClipBase(object):
if getattr(p, 'gradient_clip_attr', None) is not None: if getattr(p, 'gradient_clip_attr', None) is not None:
warnings.warn( warnings.warn(
"'set_gradient_clip' will be ineffective, because you have " "'set_gradient_clip' will be ineffective, because you have "
"set 'grad_clip' in 'optimizer'. So, 'set_gradient_clip' " "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' "
"is redundant and you can remove it.") "is redundant and you can remove it.")
break break
return self._static_clip(params_grads) return self._static_clip(params_grads)
...@@ -156,7 +149,7 @@ class GradientClipBase(object): ...@@ -156,7 +149,7 @@ class GradientClipBase(object):
raise NotImplementedError() raise NotImplementedError()
class GradientClipByValue(GradientClipBase): class ClipGradByValue(ClipGradBase):
""" """
Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max]. Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].
...@@ -164,19 +157,20 @@ class GradientClipByValue(GradientClipBase): ...@@ -164,19 +157,20 @@ class GradientClipByValue(GradientClipBase):
- Any values greater than max are set to ``max``. - Any values greater than max are set to ``max``.
The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip`` The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
is not None, then only part of gradients can be selected for gradient clipping. If ``need_clip`` of a specific parameter is ``False`` in its ``ParamAttr``, the gradient of that parameter will not be clipped.
Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer`` Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer``
(for example: :ref:`api_paddle_optimizer_SGD`). (for example: :ref:`api_paddle_optimizer_SGD`).
Note:
``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
Args: Args:
max (float): The maximum value to clip by. max (float): The maximum value to clip by.
min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max`` min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max``
automatically. In this case, ``max`` must be greater than 0. automatically. In this case, ``max`` must be greater than 0.
need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool``
(True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None,
and gradients of all parameters in the network will be clipped.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -184,29 +178,20 @@ class GradientClipByValue(GradientClipBase): ...@@ -184,29 +178,20 @@ class GradientClipByValue(GradientClipBase):
import paddle import paddle
x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
bias_attr=paddle.ParamAttr(need_clip=False))
out = linear(x) out = linear(x)
loss = paddle.mean(out) loss = paddle.mean(out)
loss.backward() loss.backward()
# clip all parameters in network: clip = paddle.nn.ClipGradByValue(min=-1, max=1)
clip = paddle.nn.GradientClipByValue(min=-1, max=1)
# clip a part of parameters in network: (e.g. linear_0.w_0)
# pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
# def fileter_func(ParamBase):
# # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
# return ParamBase.name == "linear_0.w_0"
# # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
# return ParamBase.name == linear.weight.name
# clip = paddle.nn.GradientClipByValue(min=-1, max=1, need_clip=fileter_func)
sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
sdg.step() sdg.step()
""" """
def __init__(self, max, min=None, need_clip=None): def __init__(self, max, min=None):
super(GradientClipByValue, self).__init__(need_clip) super(ClipGradByValue, self).__init__()
if min is None: if min is None:
assert (max > 0.0) assert (max > 0.0)
min = -max min = -max
...@@ -214,7 +199,7 @@ class GradientClipByValue(GradientClipBase): ...@@ -214,7 +199,7 @@ class GradientClipByValue(GradientClipBase):
self.min = float(min) self.min = float(min)
def __str__(self): def __str__(self):
return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max) return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max)
@imperative_base.no_grad @imperative_base.no_grad
def _dygraph_clip(self, params_grads): def _dygraph_clip(self, params_grads):
...@@ -222,7 +207,7 @@ class GradientClipByValue(GradientClipBase): ...@@ -222,7 +207,7 @@ class GradientClipByValue(GradientClipBase):
for p, g in params_grads: for p, g in params_grads:
if g is None: if g is None:
continue continue
if self._need_clip_func is not None and not self._need_clip_func(p): if getattr(p, 'need_clip', True) is False:
params_and_grads.append((p, g)) params_and_grads.append((p, g))
continue continue
new_grad = layers.clip(x=g, min=self.min, max=self.max) new_grad = layers.clip(x=g, min=self.min, max=self.max)
...@@ -236,8 +221,7 @@ class GradientClipByValue(GradientClipBase): ...@@ -236,8 +221,7 @@ class GradientClipByValue(GradientClipBase):
for p, g in params_grads: for p, g in params_grads:
if g is None: if g is None:
continue continue
if self._need_clip_func is not None and not self._need_clip_func( if getattr(p, 'need_clip', True) is False:
p):
params_and_grads.append((p, g)) params_and_grads.append((p, g))
continue continue
...@@ -256,7 +240,7 @@ class GradientClipByValue(GradientClipBase): ...@@ -256,7 +240,7 @@ class GradientClipByValue(GradientClipBase):
return param, new_grad return param, new_grad
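As an aside, the elementwise clamp that ClipGradByValue applies via layers.clip is just a per-element clamp of the gradient; a tiny NumPy sketch of the same math (not Paddle code, values are made up):

    import numpy as np

    def clip_by_value(grad, min_v, max_v):
        # every element is clamped into [min_v, max_v], matching layers.clip semantics
        return np.clip(grad, min_v, max_v)

    g = np.array([-3.0, -0.5, 0.2, 4.0])
    print(clip_by_value(g, -1.0, 1.0))  # -> [-1.  -0.5  0.2  1. ]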
class GradientClipByNorm(GradientClipBase): class ClipGradByNorm(ClipGradBase):
""" """
Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` . Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .
...@@ -264,8 +248,8 @@ class GradientClipByNorm(GradientClipBase): ...@@ -264,8 +248,8 @@ class GradientClipByNorm(GradientClipBase):
- If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done. - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.
The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip`` The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
is not None, then only part of gradients can be selected for gradient clipping. If ``need_clip`` of a specific parameter is ``False`` in its ``ParamAttr``, the gradient of that parameter will not be clipped.
Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer`` Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer``
(for example: :ref:`api_paddle_optimizer_SGD`). (for example: :ref:`api_paddle_optimizer_SGD`).
...@@ -287,11 +271,12 @@ class GradientClipByNorm(GradientClipBase): ...@@ -287,11 +271,12 @@ class GradientClipByNorm(GradientClipBase):
.. math:: .. math::
norm(X) = ( \\sum_{i=1}^{n}|x\_i|^2)^{ \\frac{1}{2}} norm(X) = ( \\sum_{i=1}^{n}|x\_i|^2)^{ \\frac{1}{2}}
Note:
``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
Args: Args:
clip_norm(float): The maximum norm value. clip_norm(float): The maximum norm value.
need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool``
(True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None,
and gradients of all parameters in the network will be clipped.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -299,29 +284,20 @@ class GradientClipByNorm(GradientClipBase): ...@@ -299,29 +284,20 @@ class GradientClipByNorm(GradientClipBase):
import paddle import paddle
x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
bias_attr=paddle.ParamAttr(need_clip=False))
out = linear(x) out = linear(x)
loss = paddle.mean(out) loss = paddle.mean(out)
loss.backward() loss.backward()
# clip all parameters in network: clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
clip = paddle.nn.GradientClipByNorm(clip_norm=1.0)
# clip a part of parameters in network: (e.g. linear_0.w_0)
# pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
# def fileter_func(ParamBase):
# # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
# return ParamBase.name == "linear_0.w_0"
# # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
# return ParamBase.name == linear.weight.name
# clip = paddle.nn.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func)
sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
sdg.step() sdg.step()
""" """
def __init__(self, clip_norm, need_clip=None): def __init__(self, clip_norm):
super(GradientClipByNorm, self).__init__(need_clip) super(ClipGradByNorm, self).__init__()
self.clip_norm = float(clip_norm) self.clip_norm = float(clip_norm)
def __str__(self): def __str__(self):
...@@ -333,7 +309,7 @@ class GradientClipByNorm(GradientClipBase): ...@@ -333,7 +309,7 @@ class GradientClipByNorm(GradientClipBase):
for p, g in params_grads: for p, g in params_grads:
if g is None: if g is None:
continue continue
if self._need_clip_func is not None and not self._need_clip_func(p): if getattr(p, 'need_clip', True) is False:
params_and_grads.append((p, g)) params_and_grads.append((p, g))
continue continue
new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm) new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
...@@ -347,8 +323,7 @@ class GradientClipByNorm(GradientClipBase): ...@@ -347,8 +323,7 @@ class GradientClipByNorm(GradientClipBase):
for p, g in params_grads: for p, g in params_grads:
if g is None: if g is None:
continue continue
if self._need_clip_func is not None and not self._need_clip_func( if getattr(p, 'need_clip', True) is False:
p):
params_and_grads.append((p, g)) params_and_grads.append((p, g))
continue continue
...@@ -367,7 +342,7 @@ class GradientClipByNorm(GradientClipBase): ...@@ -367,7 +342,7 @@ class GradientClipByNorm(GradientClipBase):
return param, new_grad return param, new_grad
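Similarly, ClipGradByNorm only rescales a gradient whose l2 norm exceeds clip_norm; a NumPy sketch of the formula in the docstring above (illustrative values):

    import numpy as np

    def clip_by_norm(grad, clip_norm):
        # rescale only when the l2 norm exceeds clip_norm, otherwise leave the gradient alone
        norm = np.sqrt(np.sum(np.square(grad)))
        if norm <= clip_norm:
            return grad
        return grad * (clip_norm / norm)

    g = np.array([3.0, 4.0])      # l2 norm = 5.0
    print(clip_by_norm(g, 1.0))   # -> [0.6 0.8]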
class GradientClipByGlobalNorm(GradientClipBase): class ClipGradByGlobalNorm(ClipGradBase):
""" """
Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
:math:`t\_list` , and limit it to ``clip_norm`` . :math:`t\_list` , and limit it to ``clip_norm`` .
...@@ -376,8 +351,8 @@ class GradientClipByGlobalNorm(GradientClipBase): ...@@ -376,8 +351,8 @@ class GradientClipByGlobalNorm(GradientClipBase):
- If the global norm is less than or equal to ``clip_norm`` , nothing will be done. - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.
The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip`` The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
is not None, then only part of gradients can be selected for gradient clipping. If ``need_clip`` of a specific parameter is ``False`` in its ``ParamAttr``, the gradient of that parameter will not be clipped.
Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer`` Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer``
(for example: :ref:`api_paddle_optimizer_SGD`). (for example: :ref:`api_paddle_optimizer_SGD`).
...@@ -394,12 +369,13 @@ class GradientClipByGlobalNorm(GradientClipBase): ...@@ -394,12 +369,13 @@ class GradientClipByGlobalNorm(GradientClipBase):
global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2} global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
Note:
``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0. ``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
Args: Args:
clip_norm (float): The maximum norm value. clip_norm (float): The maximum norm value.
group_name (str, optional): The group name for this clip. Default value is ``default_group`` group_name (str, optional): The group name for this clip. Default value is ``default_group``.
need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool``
(True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None,
and gradients of all parameters in the network will be clipped.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -407,29 +383,20 @@ class GradientClipByGlobalNorm(GradientClipBase): ...@@ -407,29 +383,20 @@ class GradientClipByGlobalNorm(GradientClipBase):
import paddle import paddle
x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
bias_attr=paddle.ParamAttr(need_clip=False))
out = linear(x) out = linear(x)
loss = paddle.mean(out) loss = paddle.mean(out)
loss.backward() loss.backward()
# clip all parameters in network: clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0)
# clip a part of parameters in network: (e.g. linear_0.w_0)
# pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
# def fileter_func(ParamBase):
# # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
# return ParamBase.name == "linear_0.w_0"
# # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
# return ParamBase.name == linear.weight.name
# clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func)
sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
sdg.step() sdg.step()
""" """
def __init__(self, clip_norm, group_name="default_group", need_clip=None): def __init__(self, clip_norm, group_name="default_group"):
super(GradientClipByGlobalNorm, self).__init__(need_clip) super(ClipGradByGlobalNorm, self).__init__()
self.clip_norm = float(clip_norm) self.clip_norm = float(clip_norm)
self.group_name = group_name self.group_name = group_name
...@@ -443,7 +410,7 @@ class GradientClipByGlobalNorm(GradientClipBase): ...@@ -443,7 +410,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
for p, g in params_grads: for p, g in params_grads:
if g is None: if g is None:
continue continue
if self._need_clip_func is not None and not self._need_clip_func(p): if getattr(p, 'need_clip', True) is False:
continue continue
merge_grad = g merge_grad = g
if g.type == core.VarDesc.VarType.SELECTED_ROWS: if g.type == core.VarDesc.VarType.SELECTED_ROWS:
...@@ -469,7 +436,7 @@ class GradientClipByGlobalNorm(GradientClipBase): ...@@ -469,7 +436,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
for p, g in params_grads: for p, g in params_grads:
if g is None: if g is None:
continue continue
if self._need_clip_func is not None and not self._need_clip_func(p): if getattr(p, 'need_clip', True) is False:
params_and_grads.append((p, g)) params_and_grads.append((p, g))
continue continue
new_grad = layers.elementwise_mul(x=g, y=clip_var) new_grad = layers.elementwise_mul(x=g, y=clip_var)
...@@ -484,8 +451,7 @@ class GradientClipByGlobalNorm(GradientClipBase): ...@@ -484,8 +451,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
for p, g in params_grads: for p, g in params_grads:
if g is None: if g is None:
continue continue
if self._need_clip_func is not None and not self._need_clip_func( if getattr(p, 'need_clip', True) is False:
p):
continue continue
merge_grad = g merge_grad = g
with p.block.program._optimized_guard([p, g]): with p.block.program._optimized_guard([p, g]):
...@@ -518,8 +484,7 @@ class GradientClipByGlobalNorm(GradientClipBase): ...@@ -518,8 +484,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
for p, g in params_grads: for p, g in params_grads:
if g is None: if g is None:
continue continue
if self._need_clip_func is not None and not self._need_clip_func( if getattr(p, 'need_clip', True) is False:
p):
params_and_grads.append((p, g)) params_and_grads.append((p, g))
continue continue
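For the global-norm variant handled in the hunks above, every gradient in the group is scaled by clip_norm / max(global_norm, clip_norm), so the set is only ever shrunk, never amplified; a NumPy sketch of the docstring formula (illustrative values):

    import numpy as np

    def clip_by_global_norm(grads, clip_norm):
        # global_norm = sqrt(sum_i l2norm(grads[i])^2)
        global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
        scale = clip_norm / max(global_norm, clip_norm)
        return [g * scale for g in grads]

    grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]   # global norm = 13.0
    print(clip_by_global_norm(grads, 1.0))                  # each gradient scaled by 1/13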
...@@ -670,9 +635,9 @@ def set_gradient_clip(clip, param_list=None, program=None): ...@@ -670,9 +635,9 @@ def set_gradient_clip(clip, param_list=None, program=None):
"This method can reduce the mistakes, please " "This method can reduce the mistakes, please "
"refer to documention of 'optimizer'.") "refer to documention of 'optimizer'.")
if not isinstance(clip, GradientClipBase): if not isinstance(clip, ClipGradBase):
raise TypeError( raise TypeError(
"'clip' should be an instance of GradientClipBase's derived class") "'clip' should be an instance of ClipGradBase's derived class")
if program is None: if program is None:
program = framework.default_main_program() program = framework.default_main_program()
...@@ -708,7 +673,7 @@ def append_gradient_clip_ops(param_grads): ...@@ -708,7 +673,7 @@ def append_gradient_clip_ops(param_grads):
clip_attr = getattr(p, 'gradient_clip_attr', None) clip_attr = getattr(p, 'gradient_clip_attr', None)
if clip_attr is None: if clip_attr is None:
return param_grads return param_grads
if not isinstance(clip_attr, GradientClipBase): if not isinstance(clip_attr, ClipGradBase):
raise TypeError( raise TypeError(
"clip attribute should be an instance of GradientClipBase") "clip attribute should be an instance of GradientClipBase")
...@@ -754,6 +719,7 @@ def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict): ...@@ -754,6 +719,7 @@ def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
op._set_attr('op_role_var', correct_p_g) op._set_attr('op_role_var', correct_p_g)
ClipByValue = GradientClipByValue GradientClipBase = ClipGradBase
ClipByNorm = GradientClipByNorm GradientClipByValue = ClipGradByValue
ClipByGlobalNorm = GradientClipByGlobalNorm GradientClipByNorm = ClipGradByNorm
GradientClipByGlobalNorm = ClipGradByGlobalNorm
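The module-level assignments above keep the old names importable from paddle.fluid.clip as plain aliases of the renamed classes. A quick sanity sketch, assuming the bindings behave as the diff shows (the same class objects under both spellings):

    import paddle
    import paddle.fluid as fluid

    assert fluid.clip.GradientClipByGlobalNorm is paddle.nn.ClipGradByGlobalNorm
    assert fluid.clip.GradientClipByValue is paddle.nn.ClipGradByValue

    old_style = fluid.clip.GradientClipByNorm(clip_norm=1.0)
    new_style = paddle.nn.ClipGradByNorm(clip_norm=1.0)
    assert type(old_style) is type(new_style)   # both are ClipGradByNorm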
...@@ -5123,6 +5123,8 @@ class Parameter(Variable): ...@@ -5123,6 +5123,8 @@ class Parameter(Variable):
be applied on the parameter. Default: None be applied on the parameter. Default: None
do_model_average(bool): True if the model average strategy will do_model_average(bool): True if the model average strategy will
be applied on this parameter. be applied on this parameter.
need_clip (bool): Whether the parameter gradient needs to be clipped
in optimizer. Default is True.
""" """
def __init__(self, def __init__(self,
...@@ -5162,6 +5164,8 @@ class Parameter(Variable): ...@@ -5162,6 +5164,8 @@ class Parameter(Variable):
self.do_model_average = kwargs.get('do_model_average', None) self.do_model_average = kwargs.get('do_model_average', None)
self.need_clip = kwargs.get('need_clip', True)
self.is_distributed = False self.is_distributed = False
def __str__(self): def __str__(self):
...@@ -5194,7 +5198,7 @@ class Parameter(Variable): ...@@ -5194,7 +5198,7 @@ class Parameter(Variable):
if with_details: if with_details:
res_str = Variable.to_string(self, throw_on_error, True) res_str = Variable.to_string(self, throw_on_error, True)
additional_attr = ("trainable", "optimize_attr", "regularizer", additional_attr = ("trainable", "optimize_attr", "regularizer",
"do_model_average") "do_model_average", "need_clip")
for attr_name in additional_attr: for attr_name in additional_attr:
res_str += "%s: %s\n" % (attr_name, res_str += "%s: %s\n" % (attr_name,
cpt.to_text(getattr(self, attr_name))) cpt.to_text(getattr(self, attr_name)))
...@@ -5226,6 +5230,8 @@ class ParamBase(core.VarBase): ...@@ -5226,6 +5230,8 @@ class ParamBase(core.VarBase):
be applied on the ParamBase. Default: None be applied on the ParamBase. Default: None
do_model_average(bool): True if the model average strategy will do_model_average(bool): True if the model average strategy will
be applied on this ParamBase. be applied on this ParamBase.
need_clip (bool): Whether the parameter gradient needs to be clipped
in optimizer. Default is True.
""" """
@dygraph_only @dygraph_only
...@@ -5265,6 +5271,8 @@ class ParamBase(core.VarBase): ...@@ -5265,6 +5271,8 @@ class ParamBase(core.VarBase):
self.do_model_average = kwargs.get('do_model_average', None) self.do_model_average = kwargs.get('do_model_average', None)
self.need_clip = kwargs.get('need_clip', True)
self.is_distributed = False self.is_distributed = False
# self.block = default_main_program().global_block() # self.block = default_main_program().global_block()
......
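Both Parameter (static graph) and ParamBase (dygraph) now carry the need_clip flag, and every clip class above skips a parameter whose flag is False via getattr(p, 'need_clip', True). A small standalone helper in the same spirit (hypothetical, not part of the diff) that partitions (param, grad) pairs the way those loops do:

    def split_by_need_clip(params_grads):
        """Partition (param, grad) pairs: pairs whose param has need_clip == False
        are passed through unclipped, pairs with a None grad are dropped."""
        to_clip, pass_through = [], []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                pass_through.append((p, g))
            else:
                to_clip.append((p, g))
        return to_clip, pass_through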
...@@ -36,8 +36,8 @@ class ParamAttr(object): ...@@ -36,8 +36,8 @@ class ParamAttr(object):
Note: Note:
``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0. ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
It is recommended to set ``grad_clip`` in ``optimizer`` to clip gradient. Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` , There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
:ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` . :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
Parameters: Parameters:
...@@ -57,6 +57,7 @@ class ParamAttr(object): ...@@ -57,6 +57,7 @@ class ParamAttr(object):
trainable (bool): Whether this parameter is trainable. Default True. trainable (bool): Whether this parameter is trainable. Default True.
do_model_average (bool): Whether this parameter should do model average do_model_average (bool): Whether this parameter should do model average
when model average is enabled. Default False. when model average is enabled. Default False.
need_clip (bool): Whether the parameter gradient needs to be clipped in optimizer. Default is True.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -78,7 +79,8 @@ class ParamAttr(object): ...@@ -78,7 +79,8 @@ class ParamAttr(object):
learning_rate=1.0, learning_rate=1.0,
regularizer=None, regularizer=None,
trainable=True, trainable=True,
do_model_average=True): do_model_average=True,
need_clip=True):
if sys.version_info.major == 2: if sys.version_info.major == 2:
check_type(name, "name", (str, type(None), unicode), "ParamAttr") check_type(name, "name", (str, type(None), unicode), "ParamAttr")
...@@ -87,6 +89,7 @@ class ParamAttr(object): ...@@ -87,6 +89,7 @@ class ParamAttr(object):
check_type(learning_rate, "learning_rate", (float, int), "ParamAttr") check_type(learning_rate, "learning_rate", (float, int), "ParamAttr")
check_type(trainable, "trainable", (bool), "ParamAttr") check_type(trainable, "trainable", (bool), "ParamAttr")
check_type(do_model_average, "do_model_average", (bool), "ParamAttr") check_type(do_model_average, "do_model_average", (bool), "ParamAttr")
check_type(need_clip, "need_clip", (bool), "ParamAttr")
check_type(initializer, "initializer", (Initializer, type(None)), check_type(initializer, "initializer", (Initializer, type(None)),
"ParamAttr") "ParamAttr")
check_type(regularizer, "regularizer", check_type(regularizer, "regularizer",
...@@ -101,6 +104,7 @@ class ParamAttr(object): ...@@ -101,6 +104,7 @@ class ParamAttr(object):
self.regularizer = regularizer self.regularizer = regularizer
self.trainable = trainable self.trainable = trainable
self.do_model_average = do_model_average self.do_model_average = do_model_average
self.need_clip = need_clip
def _set_default_initializer(self, initializer): def _set_default_initializer(self, initializer):
""" """
...@@ -197,7 +201,8 @@ class ParamAttr(object): ...@@ -197,7 +201,8 @@ class ParamAttr(object):
}, },
'regularizer': self.regularizer, 'regularizer': self.regularizer,
'trainable': self.trainable, 'trainable': self.trainable,
'do_model_average': self.do_model_average 'do_model_average': self.do_model_average,
'need_clip': self.need_clip
} }
if with_initializer: if with_initializer:
kwargs['initializer'] = self.initializer kwargs['initializer'] = self.initializer
...@@ -219,9 +224,9 @@ class WeightNormParamAttr(ParamAttr): ...@@ -219,9 +224,9 @@ class WeightNormParamAttr(ParamAttr):
<https://arxiv.org/pdf/1602.07868.pdf>`_. <https://arxiv.org/pdf/1602.07868.pdf>`_.
Note: Note:
``gradient_clip`` of ``WeightNormParamAttr`` HAS BEEN DEPRECATED since 2.0. ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient. Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` , There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
:ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` . :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
...@@ -248,6 +253,7 @@ class WeightNormParamAttr(ParamAttr): ...@@ -248,6 +253,7 @@ class WeightNormParamAttr(ParamAttr):
trainable(bool, optional): Whether this parameter is trainable. Default True. trainable(bool, optional): Whether this parameter is trainable. Default True.
do_model_average(bool, optional): Whether this parameter should do model average. do_model_average(bool, optional): Whether this parameter should do model average.
Default False. Default False.
need_clip (bool, optional): Whether the parameter gradient needs to be clipped in optimizer. Default is True.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -267,7 +273,8 @@ class WeightNormParamAttr(ParamAttr): ...@@ -267,7 +273,8 @@ class WeightNormParamAttr(ParamAttr):
learning_rate=1.0, learning_rate=1.0,
regularizer=paddle.regularizer.L2Decay(0.1), regularizer=paddle.regularizer.L2Decay(0.1),
trainable=True, trainable=True,
do_model_average=False)) do_model_average=False,
need_clip=True))
""" """
# List to record the parameters reparameterized by weight normalization. # List to record the parameters reparameterized by weight normalization.
...@@ -283,12 +290,14 @@ class WeightNormParamAttr(ParamAttr): ...@@ -283,12 +290,14 @@ class WeightNormParamAttr(ParamAttr):
learning_rate=1.0, learning_rate=1.0,
regularizer=None, regularizer=None,
trainable=True, trainable=True,
do_model_average=False): do_model_average=False,
need_clip=True):
super(WeightNormParamAttr, self).__init__( super(WeightNormParamAttr, self).__init__(
name=name, name=name,
initializer=initializer, initializer=initializer,
learning_rate=learning_rate, learning_rate=learning_rate,
regularizer=regularizer, regularizer=regularizer,
trainable=trainable, trainable=trainable,
do_model_average=do_model_average) do_model_average=do_model_average,
need_clip=need_clip)
self.dim = dim self.dim = dim
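With need_clip now accepted by ParamAttr (and forwarded by WeightNormParamAttr), exclusion from clipping is declared where the parameter is defined. A small dygraph sketch; whether the flag is visible on the created parameter as shown here is an assumption based on the _to_kwargs and ParamBase changes above:

    import paddle

    attr = paddle.ParamAttr(need_clip=False)
    linear = paddle.nn.Linear(in_features=10, out_features=10, bias_attr=attr)

    # The flag propagates to the created parameter, which is what the
    # ClipGrad* classes inspect via getattr(p, 'need_clip', True).
    print(linear.bias.need_clip)     # expected: False
    print(linear.weight.need_clip)   # expected: True (default)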
...@@ -185,12 +185,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip): ...@@ -185,12 +185,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
# invoke 'set_gradient_clip' in a wrong order # invoke 'set_gradient_clip' in a wrong order
def test_wrong_API_order(self): def test_wrong_API_order(self):
def backward_func(cost): def backward_func(cost):
# no clip gradient clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
def fileter_func(param):
return param.name == "fc.w_0"
clip = fluid.clip.GradientClipByGlobalNorm(
clip_norm=5.0, need_clip=fileter_func)
fluid.clip.set_gradient_clip(clip) fluid.clip.set_gradient_clip(clip)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01, sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01,
grad_clip=clip) grad_clip=clip)
...@@ -205,11 +200,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip): ...@@ -205,11 +200,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
# if grad is None or not need clip # if grad is None or not need clip
def test_none_grad(self): def test_none_grad(self):
def fileter_func(param): clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm)
return param.name == "x"
clip = fluid.clip.GradientClipByGlobalNorm(
self.clip_norm, need_clip=fileter_func)
x = fluid.default_main_program().global_block().create_parameter( x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32") name="x", shape=[2, 3], dtype="float32")
y = fluid.default_main_program().global_block().create_parameter( y = fluid.default_main_program().global_block().create_parameter(
...@@ -228,11 +219,6 @@ class TestGradientClipByGlobalNorm(TestGradientClip): ...@@ -228,11 +219,6 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
# raise typeError # raise typeError
def test_tpyeError(self): def test_tpyeError(self):
# the type of need_clip must be an funciton
with self.assertRaises(TypeError):
clip = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm, need_clip="test")
# the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class # the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class
with self.assertRaises(TypeError): with self.assertRaises(TypeError):
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1, sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1,
...@@ -264,26 +250,22 @@ class TestGradientClipByNorm(TestGradientClip): ...@@ -264,26 +250,22 @@ class TestGradientClipByNorm(TestGradientClip):
# if grad is None or not need clip # if grad is None or not need clip
def test_none_grad(self): def test_none_grad(self):
def fileter_func(param): clip = fluid.clip.GradientClipByNorm(self.clip_norm)
return param.name == "z"
clip = fluid.clip.GradientClipByNorm(
self.clip_norm, need_clip=fileter_func)
x = fluid.default_main_program().global_block().create_parameter( x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32") name="x", shape=[2, 3], dtype="float32", need_clip=False)
y = fluid.default_main_program().global_block().create_parameter( y = fluid.default_main_program().global_block().create_parameter(
name="y", shape=[2, 3], dtype="float32") name="y", shape=[2, 3], dtype="float32", need_clip=False)
# (x, None) should not be returned # (x, None) should not be returned
params_grads = [(x, None), (x, y)] params_grads = [(x, None), (x, y)]
params_grads = clip(params_grads) params_grads = clip(params_grads)
self.assertTrue( self.assertTrue(
len(clip(params_grads)) == 1, len(clip(params_grads)) == 1,
"ClipByNorm: when grad is None, it shouldn't be returned by gradient clip!" "ClipGradByNorm: when grad is None, it shouldn't be returned by gradient clip!"
) )
self.assertTrue( self.assertTrue(
params_grads[0][1].name == 'y', params_grads[0][1].name == 'y',
"ClipByNorm: grad should not be clipped when filtered out!") "ClipGradByNorm: grad should not be clipped when filtered out!")
class TestGradientClipByValue(TestGradientClip): class TestGradientClipByValue(TestGradientClip):
...@@ -312,26 +294,22 @@ class TestGradientClipByValue(TestGradientClip): ...@@ -312,26 +294,22 @@ class TestGradientClipByValue(TestGradientClip):
# if grad is None or not need clip # if grad is None or not need clip
def test_none_grad(self): def test_none_grad(self):
def fileter_func(param): clip = fluid.clip.GradientClipByValue(self.max, self.min)
return param.name == "z"
clip = fluid.clip.GradientClipByValue(
self.max, self.min, need_clip=fileter_func)
x = fluid.default_main_program().global_block().create_parameter( x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32") name="x", shape=[2, 3], dtype="float32", need_clip=False)
y = fluid.default_main_program().global_block().create_parameter( y = fluid.default_main_program().global_block().create_parameter(
name="y", shape=[2, 3], dtype="float32") name="y", shape=[2, 3], dtype="float32", need_clip=False)
# (x, None) should not be returned # (x, None) should not be returned
params_grads = [(x, None), (x, y)] params_grads = [(x, None), (x, y)]
params_grads = clip(params_grads) params_grads = clip(params_grads)
self.assertTrue( self.assertTrue(
len(clip(params_grads)) == 1, len(clip(params_grads)) == 1,
"ClipByValue: when grad is None, it shouldn't be returned by gradient clip!" "ClipGradByValue: when grad is None, it shouldn't be returned by gradient clip!"
) )
self.assertTrue( self.assertTrue(
params_grads[0][1].name == 'y', params_grads[0][1].name == 'y',
"ClipByValue: grad should not be clipped when filtered out!") "ClipGradByValue: grad should not be clipped when filtered out!")
class TestDygraphGradientClip(unittest.TestCase): class TestDygraphGradientClip(unittest.TestCase):
...@@ -355,13 +333,9 @@ class TestDygraphGradientClip(unittest.TestCase): ...@@ -355,13 +333,9 @@ class TestDygraphGradientClip(unittest.TestCase):
class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip): class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
def setUp(self): def setUp(self):
# only clip gradient of x (ParamBase)
def fileter_func(param):
return param.name == "x"
self.clip_norm = 0.8 self.clip_norm = 0.8
self.clip1 = fluid.clip.GradientClipByGlobalNorm( self.clip1 = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm, need_clip=fileter_func) clip_norm=self.clip_norm)
self.clip2 = fluid.clip.GradientClipByGlobalNorm( self.clip2 = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm) clip_norm=self.clip_norm)
...@@ -401,13 +375,8 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip): ...@@ -401,13 +375,8 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
class TestDygraphGradientClipByNorm(TestDygraphGradientClip): class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
def setUp(self): def setUp(self):
# only clip gradient of linear_0.w_0 (ParamBase)
def fileter_func(param):
return param.name == "linear_0.w_0"
self.clip_norm = 0.8 self.clip_norm = 0.8
self.clip = fluid.clip.GradientClipByNorm( self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
clip_norm=self.clip_norm, need_clip=fileter_func)
def check_clip_result(self, loss, optimizer): def check_clip_result(self, loss, optimizer):
# if grad is None # if grad is None
...@@ -435,14 +404,9 @@ class TestDygraphGradientClipByNorm(TestDygraphGradientClip): ...@@ -435,14 +404,9 @@ class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
class TestDygraphGradientClipByValue(TestDygraphGradientClip): class TestDygraphGradientClipByValue(TestDygraphGradientClip):
def setUp(self): def setUp(self):
# only clip gradient of linear_0.w_0 (ParamBase)
def fileter_func(param):
return param.name == "linear_0.w_0"
self.max = 0.2 self.max = 0.2
self.min = 0.1 self.min = 0.1
self.clip = fluid.clip.GradientClipByValue( self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
max=self.max, min=self.min, need_clip=fileter_func)
def check_clip_result(self, loss, optimizer): def check_clip_result(self, loss, optimizer):
# if grad is None # if grad is None
......
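Pulling the fragmented test hunks above together, a hedged, self-contained version of the pass-through check (paddle.enable_static() is added here on the assumption that the original test fixture runs in static-graph mode):

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    clip = fluid.clip.GradientClipByNorm(clip_norm=1.0)
    block = fluid.default_main_program().global_block()
    x = block.create_parameter(name="x", shape=[2, 3], dtype="float32", need_clip=False)
    y = block.create_parameter(name="y", shape=[2, 3], dtype="float32", need_clip=False)

    # (x, None) is dropped; (x, y) passes through unclipped because need_clip is False
    params_grads = clip([(x, None), (x, y)])
    assert len(params_grads) == 1
    assert params_grads[0][1].name == "y"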
...@@ -31,9 +31,9 @@ __all__ += rnn.__all__ ...@@ -31,9 +31,9 @@ __all__ += rnn.__all__
__all__ += weight_norm_hook.__all__ __all__ += weight_norm_hook.__all__
# TODO: define alias in nn directory # TODO: define alias in nn directory
from .clip import GradientClipByGlobalNorm #DEFINE_ALIAS from .clip import ClipGradByGlobalNorm #DEFINE_ALIAS
from .clip import GradientClipByNorm #DEFINE_ALIAS from .clip import ClipGradByNorm #DEFINE_ALIAS
from .clip import GradientClipByValue #DEFINE_ALIAS from .clip import ClipGradByValue #DEFINE_ALIAS
# from .clip import set_gradient_clip #DEFINE_ALIAS # from .clip import set_gradient_clip #DEFINE_ALIAS
from .clip import clip #DEFINE_ALIAS from .clip import clip #DEFINE_ALIAS
from .clip import clip_by_norm #DEFINE_ALIAS from .clip import clip_by_norm #DEFINE_ALIAS
...@@ -51,13 +51,13 @@ from .decode import beam_search_decode #DEFINE_ALIAS ...@@ -51,13 +51,13 @@ from .decode import beam_search_decode #DEFINE_ALIAS
# from .decode import dynamic_decode #DEFINE_ALIAS # from .decode import dynamic_decode #DEFINE_ALIAS
from .decode import gather_tree #DEFINE_ALIAS from .decode import gather_tree #DEFINE_ALIAS
# from .input import Input #DEFINE_ALIAS # from .input import Input #DEFINE_ALIAS
from .layer.activation import ELU from .layer.activation import ELU #DEFINE_ALIAS
from .layer.activation import GELU from .layer.activation import GELU #DEFINE_ALIAS
from .layer.activation import Tanh from .layer.activation import Tanh #DEFINE_ALIAS
from .layer.activation import Hardshrink from .layer.activation import Hardshrink #DEFINE_ALIAS
from .layer.activation import Hardtanh from .layer.activation import Hardtanh #DEFINE_ALIAS
from .layer.activation import PReLU from .layer.activation import PReLU #DEFINE_ALIAS
from .layer.activation import ReLU from .layer.activation import ReLU #DEFINE_ALIAS
from .layer.activation import ReLU6 #DEFINE_ALIAS from .layer.activation import ReLU6 #DEFINE_ALIAS
from .layer.activation import SELU #DEFINE_ALIAS from .layer.activation import SELU #DEFINE_ALIAS
from .layer.activation import LeakyReLU #DEFINE_ALIAS from .layer.activation import LeakyReLU #DEFINE_ALIAS
......
...@@ -13,18 +13,18 @@ ...@@ -13,18 +13,18 @@
# limitations under the License. # limitations under the License.
# TODO: define the functions to clip gradient of parameter # TODO: define the functions to clip gradient of parameter
from ..fluid.clip import GradientClipByGlobalNorm #DEFINE_ALIAS from ..fluid.clip import ClipGradByGlobalNorm #DEFINE_ALIAS
from ..fluid.clip import GradientClipByNorm #DEFINE_ALIAS from ..fluid.clip import ClipGradByNorm #DEFINE_ALIAS
from ..fluid.clip import GradientClipByValue #DEFINE_ALIAS from ..fluid.clip import ClipGradByValue #DEFINE_ALIAS
from ..fluid.layers import clip #DEFINE_ALIAS from ..fluid.layers import clip #DEFINE_ALIAS
from ..fluid.layers import clip_by_norm #DEFINE_ALIAS from ..fluid.layers import clip_by_norm #DEFINE_ALIAS
__all__ = [ __all__ = [
# 'ErrorClipByValue', # 'ErrorClipByValue',
'GradientClipByGlobalNorm', 'ClipGradByGlobalNorm',
'GradientClipByNorm', 'ClipGradByNorm',
'GradientClipByValue', 'ClipGradByValue',
# 'set_gradient_clip', # 'set_gradient_clip',
'clip', 'clip',
'clip_by_norm' 'clip_by_norm'
......
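After this change, paddle.nn exposes only the new names (the old paddle.nn.GradientClip* imports in nn/__init__.py are replaced), while paddle.fluid.clip keeps the old spellings as aliases. A short sketch of the intended import surface:

    import paddle.nn as nn

    assert 'ClipGradByGlobalNorm' in nn.clip.__all__
    assert 'ClipGradByNorm' in nn.clip.__all__
    assert 'ClipGradByValue' in nn.clip.__all__

    clip = nn.ClipGradByValue(min=-1.0, max=1.0)
    print(clip)   # roughly: "Clip Gradient By Value, min = -1.000000, max=1.000000"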