Unverified commit 994438b1 authored by Qi Li, committed by GitHub

change clip grad api, test=develop (#27767)

Parent 365c2c9c
......@@ -26,8 +26,8 @@ from . import name_scope
from .dygraph import base as imperative_base
__all__ = [
'set_gradient_clip', 'ErrorClipByValue', 'GradientClipByValue',
'GradientClipByNorm', 'GradientClipByGlobalNorm'
'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue',
'ClipGradByNorm', 'ClipGradByGlobalNorm'
]
......@@ -115,16 +115,9 @@ def error_clip_callback(block, context):
error_clip._append_clip_op(block, grad_n)
class GradientClipBase(object):
def __init__(self, need_clip=None):
if need_clip is not None and not callable(need_clip):
raise TypeError(
"The type of need_clip must be funciton, and it can filter out "
"parameter that does't need gradient clip. This function must return "
"True or False, and True means that clipping is required. Please refer to "
"API documention of GradientClipByGlobalNorm / GradientClipByNorm "
"/GradientClipByValue.")
self._need_clip_func = need_clip
class ClipGradBase(object):
def __init__(self):
super(ClipGradBase, self).__init__()
def __str__(self):
raise NotImplementedError()
......@@ -144,7 +137,7 @@ class GradientClipBase(object):
if getattr(p, 'gradient_clip_attr', None) is not None:
warnings.warn(
"'set_gradient_clip' will be ineffective, because you have "
"set 'grad_clip' in 'optimizer'. So, 'set_gradient_clip' "
"set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' "
"is redundant and you can remove it.")
break
return self._static_clip(params_grads)
......@@ -156,7 +149,7 @@ class GradientClipBase(object):
raise NotImplementedError()
class GradientClipByValue(GradientClipBase):
class ClipGradByValue(ClipGradBase):
"""
Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].
......@@ -164,19 +157,20 @@ class GradientClipByValue(GradientClipBase):
- Any values greater than max are set to ``max``.
The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
is not None, then only part of gradients can be selected for gradient clipping.
The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer``
(for example: :ref:`api_paddle_optimizer_SGD`).
Note:
``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
Args:
max (float): The maximum value to clip by.
min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max``
automatically. In this case, ``max`` must be greater than 0.
need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool``
(True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None,
and gradients of all parameters in the network will be clipped.
Examples:
.. code-block:: python
......@@ -184,29 +178,20 @@ class GradientClipByValue(GradientClipBase):
import paddle
x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
linear = paddle.nn.Linear(10, 10)
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
bias_attr=paddle.ParamAttr(need_clip=False))
out = linear(x)
loss = paddle.mean(out)
loss.backward()
# clip all parameters in network:
clip = paddle.nn.GradientClipByValue(min=-1, max=1)
# clip a part of parameters in network: (e.g. linear_0.w_0)
# pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
# def fileter_func(ParamBase):
# # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
# return ParamBase.name == "linear_0.w_0"
# # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
# return ParamBase.name == linear.weight.name
# clip = paddle.nn.GradientClipByValue(min=-1, max=1, need_clip=fileter_func)
clip = paddle.nn.ClipGradByValue(min=-1, max=1)
sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
sgd.step()
"""
def __init__(self, max, min=None, need_clip=None):
super(GradientClipByValue, self).__init__(need_clip)
def __init__(self, max, min=None):
super(ClipGradByValue, self).__init__()
if min is None:
assert (max > 0.0)
min = -max
......@@ -214,7 +199,7 @@ class GradientClipByValue(GradientClipBase):
self.min = float(min)
def __str__(self):
return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max)
return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max)
@imperative_base.no_grad
def _dygraph_clip(self, params_grads):
......@@ -222,7 +207,7 @@ class GradientClipByValue(GradientClipBase):
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(p):
if getattr(p, 'need_clip', True) is False:
params_and_grads.append((p, g))
continue
new_grad = layers.clip(x=g, min=self.min, max=self.max)
......@@ -236,8 +221,7 @@ class GradientClipByValue(GradientClipBase):
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(
p):
if getattr(p, 'need_clip', True) is False:
params_and_grads.append((p, g))
continue
......@@ -256,7 +240,7 @@ class GradientClipByValue(GradientClipBase):
return param, new_grad
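For readers outside the Paddle codebase, here is a minimal sketch of the element-wise rule that ``ClipGradByValue`` documents, written in plain NumPy; the helper name ``clip_grad_by_value`` is hypothetical and only illustrates the semantics, not the actual operator.

import numpy as np

def clip_grad_by_value(grad, max, min=None):
    # Documented rule: values below `min` become `min`, values above `max` become `max`.
    # As in ClipGradByValue, `min` defaults to -max, in which case `max` must be positive.
    if min is None:
        assert max > 0.0
        min = -max
    return np.clip(grad, min, max)

g = np.array([-2.0, -0.5, 0.3, 1.7])
print(clip_grad_by_value(g, max=1.0))  # every element now lies in [-1.0, 1.0]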
class GradientClipByNorm(GradientClipBase):
class ClipGradByNorm(ClipGradBase):
"""
Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .
......@@ -264,8 +248,8 @@ class GradientClipByNorm(GradientClipBase):
- If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.
The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
is not None, then only part of gradients can be selected for gradient clipping.
The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer``
(for example: :ref:`api_paddle_optimizer_SGD`).
......@@ -287,11 +271,12 @@ class GradientClipByNorm(GradientClipBase):
.. math::
norm(X) = ( \\sum_{i=1}^{n}|x\_i|^2)^{ \\frac{1}{2}}
Note:
``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
Args:
clip_norm(float): The maximum norm value.
need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool``
(True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None,
and gradients of all parameters in the network will be clipped.
Examples:
.. code-block:: python
......@@ -299,29 +284,20 @@ class GradientClipByNorm(GradientClipBase):
import paddle
x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
linear = paddle.nn.Linear(10, 10)
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
bias_attr=paddle.ParamAttr(need_clip=False))
out = linear(x)
loss = paddle.mean(out)
loss.backward()
# clip all parameters in network:
clip = paddle.nn.GradientClipByNorm(clip_norm=1.0)
# clip a part of parameters in network: (e.g. linear_0.w_0)
# pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
# def fileter_func(ParamBase):
# # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
# return ParamBase.name == "linear_0.w_0"
# # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
# return ParamBase.name == linear.weight.name
# clip = paddle.nn.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func)
clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
sgd.step()
"""
def __init__(self, clip_norm, need_clip=None):
super(GradientClipByNorm, self).__init__(need_clip)
def __init__(self, clip_norm):
super(ClipGradByNorm, self).__init__()
self.clip_norm = float(clip_norm)
def __str__(self):
......@@ -333,7 +309,7 @@ class GradientClipByNorm(GradientClipBase):
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(p):
if getattr(p, 'need_clip', True) is False:
params_and_grads.append((p, g))
continue
new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
......@@ -347,8 +323,7 @@ class GradientClipByNorm(GradientClipBase):
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(
p):
if getattr(p, 'need_clip', True) is False:
params_and_grads.append((p, g))
continue
......@@ -367,7 +342,7 @@ class GradientClipByNorm(GradientClipBase):
return param, new_grad
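Similarly, a small NumPy sketch of the per-tensor rule in ``ClipGradByNorm``: the gradient is rescaled only when its L2 norm exceeds ``clip_norm``, otherwise it is returned unchanged. The helper name below is hypothetical.

import numpy as np

def clip_grad_by_norm(grad, clip_norm):
    # norm(X) = (sum_i |x_i|^2)^(1/2); rescale so the result's norm is at most clip_norm.
    norm = np.sqrt(np.sum(np.square(grad)))
    if norm <= clip_norm:
        return grad
    return grad * (clip_norm / norm)

g = np.full((2, 3), 2.0)                       # L2 norm is about 4.9
clipped = clip_grad_by_norm(g, clip_norm=1.0)
print(np.sqrt(np.sum(np.square(clipped))))     # about 1.0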
class GradientClipByGlobalNorm(GradientClipBase):
class ClipGradByGlobalNorm(ClipGradBase):
"""
Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
:math:`t\_list` , and limit it to ``clip_norm`` .
......@@ -376,8 +351,8 @@ class GradientClipByGlobalNorm(GradientClipBase):
- If the global norm is less than or equal to ``clip_norm`` , nothing will be done.
The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
is not None, then only part of gradients can be selected for gradient clipping.
The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
Gradient clip will take effect after being set in ``optimizer`` , see the document ``optimizer``
(for example: :ref:`api_paddle_optimizer_SGD`).
......@@ -394,12 +369,13 @@ class GradientClipByGlobalNorm(GradientClipBase):
global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
Note:
``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
Args:
clip_norm (float): The maximum norm value.
group_name (str, optional): The group name for this clip. Default value is ``default_group``
need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool``
(True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None,
and gradients of all parameters in the network will be clipped.
group_name (str, optional): The group name for this clip. Default value is ``default_group``.
Examples:
.. code-block:: python
......@@ -407,29 +383,20 @@ class GradientClipByGlobalNorm(GradientClipBase):
import paddle
x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
linear = paddle.nn.Linear(10, 10)
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
bias_attr=paddle.ParamAttr(need_clip=False))
out = linear(x)
loss = paddle.mean(out)
loss.backward()
# clip all parameters in network:
clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0)
# clip a part of parameters in network: (e.g. linear_0.w_0)
# pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
# def fileter_func(ParamBase):
# # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
# return ParamBase.name == "linear_0.w_0"
# # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
# return ParamBase.name == linear.weight.name
# clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
sgd.step()
"""
def __init__(self, clip_norm, group_name="default_group", need_clip=None):
super(GradientClipByGlobalNorm, self).__init__(need_clip)
def __init__(self, clip_norm, group_name="default_group"):
super(ClipGradByGlobalNorm, self).__init__()
self.clip_norm = float(clip_norm)
self.group_name = group_name
......@@ -443,7 +410,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(p):
if getattr(p, 'need_clip', True) is False:
continue
merge_grad = g
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
......@@ -469,7 +436,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(p):
if getattr(p, 'need_clip', True) is False:
params_and_grads.append((p, g))
continue
new_grad = layers.elementwise_mul(x=g, y=clip_var)
......@@ -484,8 +451,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(
p):
if getattr(p, 'need_clip', True) is False:
continue
merge_grad = g
with p.block.program._optimized_guard([p, g]):
......@@ -518,8 +484,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(
p):
if getattr(p, 'need_clip', True) is False:
params_and_grads.append((p, g))
continue
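To summarize the dygraph path above in one place, here is a condensed, hedged sketch of ``ClipGradByGlobalNorm`` in plain NumPy. ``FakeParam`` and ``clip_grad_by_global_norm`` are hypothetical stand-ins; as in the code above, gradients whose parameter has ``need_clip=False`` are excluded from the global norm and passed through unclipped.

import numpy as np

class FakeParam:
    # Hypothetical stand-in for Parameter/ParamBase; only the 'need_clip'
    # attribute consulted by the clip classes is modelled here.
    def __init__(self, name, need_clip=True):
        self.name = name
        self.need_clip = need_clip

def clip_grad_by_global_norm(params_grads, clip_norm):
    # global_norm = sqrt(sum_i ||g_i||_2^2), taken only over clippable gradients.
    sum_sq = sum(np.sum(np.square(g)) for p, g in params_grads
                 if g is not None and getattr(p, 'need_clip', True) is not False)
    global_norm = np.sqrt(sum_sq)
    scale = clip_norm / max(global_norm, clip_norm)   # no-op when global_norm <= clip_norm
    out = []
    for p, g in params_grads:
        if g is None:
            continue                                  # (param, None) pairs are dropped
        if getattr(p, 'need_clip', True) is False:
            out.append((p, g))                        # filtered params keep their raw gradient
            continue
        out.append((p, g * scale))
    return out

pg = [(FakeParam("w"), np.ones((2, 2))), (FakeParam("b", need_clip=False), np.ones(2))]
pg = clip_grad_by_global_norm(pg, clip_norm=1.0)      # the "w" gradient is scaled by 0.5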
......@@ -670,9 +635,9 @@ def set_gradient_clip(clip, param_list=None, program=None):
"This method can reduce the mistakes, please "
"refer to the documentation of 'optimizer'.")
if not isinstance(clip, GradientClipBase):
if not isinstance(clip, ClipGradBase):
raise TypeError(
"'clip' should be an instance of GradientClipBase's derived class")
"'clip' should be an instance of ClipGradBase's derived class")
if program is None:
program = framework.default_main_program()
......@@ -708,7 +673,7 @@ def append_gradient_clip_ops(param_grads):
clip_attr = getattr(p, 'gradient_clip_attr', None)
if clip_attr is None:
return param_grads
if not isinstance(clip_attr, GradientClipBase):
if not isinstance(clip_attr, ClipGradBase):
raise TypeError(
"clip attribute should be an instance of GradientClipBase")
......@@ -754,6 +719,7 @@ def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
op._set_attr('op_role_var', correct_p_g)
ClipByValue = GradientClipByValue
ClipByNorm = GradientClipByNorm
ClipByGlobalNorm = GradientClipByGlobalNorm
GradientClipBase = ClipGradBase
GradientClipByValue = ClipGradByValue
GradientClipByNorm = ClipGradByNorm
GradientClipByGlobalNorm = ClipGradByGlobalNorm
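The assignments above keep the old class names importable as aliases of the renamed classes, so existing user code keeps working. A quick hedged check, assuming a Paddle build that contains this commit:

import paddle
import paddle.fluid as fluid

# Old fluid names resolve to the same class objects as the new paddle.nn names.
assert fluid.clip.GradientClipByValue is paddle.nn.ClipGradByValue
assert fluid.clip.GradientClipByNorm is paddle.nn.ClipGradByNorm
assert fluid.clip.GradientClipByGlobalNorm is paddle.nn.ClipGradByGlobalNorm

clip = fluid.clip.GradientClipByNorm(clip_norm=1.0)   # identical to paddle.nn.ClipGradByNorm(1.0)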
......@@ -5123,6 +5123,8 @@ class Parameter(Variable):
be applied on the parameter. Default: None
do_model_average(bool): True if the model average strategy will
be applied on this parameter.
need_clip (bool): Whether the parameter gradient needs to be clipped
in the optimizer. Default is True.
"""
def __init__(self,
......@@ -5162,6 +5164,8 @@ class Parameter(Variable):
self.do_model_average = kwargs.get('do_model_average', None)
self.need_clip = kwargs.get('need_clip', True)
self.is_distributed = False
def __str__(self):
......@@ -5194,7 +5198,7 @@ class Parameter(Variable):
if with_details:
res_str = Variable.to_string(self, throw_on_error, True)
additional_attr = ("trainable", "optimize_attr", "regularizer",
"do_model_average")
"do_model_average", "need_clip")
for attr_name in additional_attr:
res_str += "%s: %s\n" % (attr_name,
cpt.to_text(getattr(self, attr_name)))
......@@ -5226,6 +5230,8 @@ class ParamBase(core.VarBase):
be applied on the ParamBase. Default: None
do_model_average(bool): True if the model average strategy will
be applied on this ParamBase.
need_clip (bool): Whether the parameter gradient needs to be clipped
in the optimizer. Default is True.
"""
@dygraph_only
......@@ -5265,6 +5271,8 @@ class ParamBase(core.VarBase):
self.do_model_average = kwargs.get('do_model_average', None)
self.need_clip = kwargs.get('need_clip', True)
self.is_distributed = False
# self.block = default_main_program().global_block()
......
......@@ -36,8 +36,8 @@ class ParamAttr(object):
Note:
``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
It is recommended to set ``grad_clip`` in ``optimizer`` to clip gradient.
There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
:ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
Parameters:
......@@ -57,6 +57,7 @@ class ParamAttr(object):
trainable (bool): Whether this parameter is trainable. Default True.
do_model_average (bool): Whether this parameter should do model average
when model average is enabled. Default False.
need_clip (bool): Whether the parameter gradient needs to be clipped in the optimizer. Default is True.
Examples:
.. code-block:: python
......@@ -78,7 +79,8 @@ class ParamAttr(object):
learning_rate=1.0,
regularizer=None,
trainable=True,
do_model_average=True):
do_model_average=True,
need_clip=True):
if sys.version_info.major == 2:
check_type(name, "name", (str, type(None), unicode), "ParamAttr")
......@@ -87,6 +89,7 @@ class ParamAttr(object):
check_type(learning_rate, "learning_rate", (float, int), "ParamAttr")
check_type(trainable, "trainable", (bool), "ParamAttr")
check_type(do_model_average, "do_model_average", (bool), "ParamAttr")
check_type(need_clip, "need_clip", (bool), "ParamAttr")
check_type(initializer, "initializer", (Initializer, type(None)),
"ParamAttr")
check_type(regularizer, "regularizer",
......@@ -101,6 +104,7 @@ class ParamAttr(object):
self.regularizer = regularizer
self.trainable = trainable
self.do_model_average = do_model_average
self.need_clip = need_clip
def _set_default_initializer(self, initializer):
"""
......@@ -197,7 +201,8 @@ class ParamAttr(object):
},
'regularizer': self.regularizer,
'trainable': self.trainable,
'do_model_average': self.do_model_average
'do_model_average': self.do_model_average,
'need_clip': self.need_clip
}
if with_initializer:
kwargs['initializer'] = self.initializer
......@@ -219,9 +224,9 @@ class WeightNormParamAttr(ParamAttr):
<https://arxiv.org/pdf/1602.07868.pdf>`_.
Note:
``gradient_clip`` of ``WeightNormParamAttr`` HAS BEEN DEPRECATED since 2.0.
It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient.
There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
:ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
......@@ -248,6 +253,7 @@ class WeightNormParamAttr(ParamAttr):
trainable(bool, optional): Whether this parameter is trainable. Default True.
do_model_average(bool, optional): Whether this parameter should do model average.
Default False.
need_clip (bool, optional): Whether the parameter gradient needs to be clipped in the optimizer. Default is True.
Examples:
.. code-block:: python
......@@ -267,7 +273,8 @@ class WeightNormParamAttr(ParamAttr):
learning_rate=1.0,
regularizer=paddle.regularizer.L2Decay(0.1),
trainable=True,
do_model_average=False))
do_model_average=False,
need_clip=True))
"""
# List to record the parameters reparameterized by weight normalization.
......@@ -283,12 +290,14 @@ class WeightNormParamAttr(ParamAttr):
learning_rate=1.0,
regularizer=None,
trainable=True,
do_model_average=False):
do_model_average=False,
need_clip=True):
super(WeightNormParamAttr, self).__init__(
name=name,
initializer=initializer,
learning_rate=learning_rate,
regularizer=regularizer,
trainable=trainable,
do_model_average=do_model_average)
do_model_average=do_model_average,
need_clip=need_clip)
self.dim = dim
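Putting the ``ParamAttr(need_clip=...)`` flag and a ``ClipGrad*`` strategy together, here is an end-to-end sketch assembled from the docstring examples added in this commit; ``need_clip=False`` exempts that parameter's gradient from whatever clip strategy is attached to the optimizer.

import paddle

x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
linear = paddle.nn.Linear(in_features=10, out_features=10,
                          weight_attr=paddle.ParamAttr(need_clip=True),   # weight gradient is clipped
                          bias_attr=paddle.ParamAttr(need_clip=False))    # bias gradient is passed through
out = linear(x)
loss = paddle.mean(out)
loss.backward()

clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
sgd = paddle.optimizer.SGD(learning_rate=0.1,
                           parameters=linear.parameters(),
                           grad_clip=clip)
sgd.step()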
......@@ -185,12 +185,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
# invoke 'set_gradient_clip' in a wrong order
def test_wrong_API_order(self):
def backward_func(cost):
# no clip gradient
def fileter_func(param):
return param.name == "fc.w_0"
clip = fluid.clip.GradientClipByGlobalNorm(
clip_norm=5.0, need_clip=fileter_func)
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
fluid.clip.set_gradient_clip(clip)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01,
grad_clip=clip)
......@@ -205,11 +200,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
# if grad is None or not need clip
def test_none_grad(self):
def fileter_func(param):
return param.name == "x"
clip = fluid.clip.GradientClipByGlobalNorm(
self.clip_norm, need_clip=fileter_func)
clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm)
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
y = fluid.default_main_program().global_block().create_parameter(
......@@ -228,11 +219,6 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
# raise typeError
def test_tpyeError(self):
# the type of need_clip must be an funciton
with self.assertRaises(TypeError):
clip = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm, need_clip="test")
# the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class
with self.assertRaises(TypeError):
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1,
......@@ -264,26 +250,22 @@ class TestGradientClipByNorm(TestGradientClip):
# if grad is None or not need clip
def test_none_grad(self):
def fileter_func(param):
return param.name == "z"
clip = fluid.clip.GradientClipByNorm(
self.clip_norm, need_clip=fileter_func)
clip = fluid.clip.GradientClipByNorm(self.clip_norm)
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
name="x", shape=[2, 3], dtype="float32", need_clip=False)
y = fluid.default_main_program().global_block().create_parameter(
name="y", shape=[2, 3], dtype="float32")
name="y", shape=[2, 3], dtype="float32", need_clip=False)
# (x, None) should not be returned
params_grads = [(x, None), (x, y)]
params_grads = clip(params_grads)
self.assertTrue(
len(clip(params_grads)) == 1,
"ClipByNorm: when grad is None, it shouldn't be returned by gradient clip!"
"ClipGradByNorm: when grad is None, it shouldn't be returned by gradient clip!"
)
self.assertTrue(
params_grads[0][1].name == 'y',
"ClipByNorm: grad should not be clipped when filtered out!")
"ClipGradByNorm: grad should not be clipped when filtered out!")
class TestGradientClipByValue(TestGradientClip):
......@@ -312,26 +294,22 @@ class TestGradientClipByValue(TestGradientClip):
# if grad is None or not need clip
def test_none_grad(self):
def fileter_func(param):
return param.name == "z"
clip = fluid.clip.GradientClipByValue(
self.max, self.min, need_clip=fileter_func)
clip = fluid.clip.GradientClipByValue(self.max, self.min)
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
name="x", shape=[2, 3], dtype="float32", need_clip=False)
y = fluid.default_main_program().global_block().create_parameter(
name="y", shape=[2, 3], dtype="float32")
name="y", shape=[2, 3], dtype="float32", need_clip=False)
# (x, None) should not be returned
params_grads = [(x, None), (x, y)]
params_grads = clip(params_grads)
self.assertTrue(
len(clip(params_grads)) == 1,
"ClipByValue: when grad is None, it shouldn't be returned by gradient clip!"
"ClipGradByValue: when grad is None, it shouldn't be returned by gradient clip!"
)
self.assertTrue(
params_grads[0][1].name == 'y',
"ClipByValue: grad should not be clipped when filtered out!")
"ClipGradByValue: grad should not be clipped when filtered out!")
class TestDygraphGradientClip(unittest.TestCase):
......@@ -355,13 +333,9 @@ class TestDygraphGradientClip(unittest.TestCase):
class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
def setUp(self):
# only clip gradient of x (ParamBase)
def fileter_func(param):
return param.name == "x"
self.clip_norm = 0.8
self.clip1 = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm, need_clip=fileter_func)
clip_norm=self.clip_norm)
self.clip2 = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm)
......@@ -401,13 +375,8 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
def setUp(self):
# only clip gradient of linear_0.w_0 (ParamBase)
def fileter_func(param):
return param.name == "linear_0.w_0"
self.clip_norm = 0.8
self.clip = fluid.clip.GradientClipByNorm(
clip_norm=self.clip_norm, need_clip=fileter_func)
self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
def check_clip_result(self, loss, optimizer):
# if grad is None
......@@ -435,14 +404,9 @@ class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
class TestDygraphGradientClipByValue(TestDygraphGradientClip):
def setUp(self):
# only clip gradient of linear_0.w_0 (ParamBase)
def fileter_func(param):
return param.name == "linear_0.w_0"
self.max = 0.2
self.min = 0.1
self.clip = fluid.clip.GradientClipByValue(
max=self.max, min=self.min, need_clip=fileter_func)
self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
def check_clip_result(self, loss, optimizer):
# if grad is None
......
......@@ -31,9 +31,9 @@ __all__ += rnn.__all__
__all__ += weight_norm_hook.__all__
# TODO: define alias in nn directory
from .clip import GradientClipByGlobalNorm #DEFINE_ALIAS
from .clip import GradientClipByNorm #DEFINE_ALIAS
from .clip import GradientClipByValue #DEFINE_ALIAS
from .clip import ClipGradByGlobalNorm #DEFINE_ALIAS
from .clip import ClipGradByNorm #DEFINE_ALIAS
from .clip import ClipGradByValue #DEFINE_ALIAS
# from .clip import set_gradient_clip #DEFINE_ALIAS
from .clip import clip #DEFINE_ALIAS
from .clip import clip_by_norm #DEFINE_ALIAS
......@@ -51,13 +51,13 @@ from .decode import beam_search_decode #DEFINE_ALIAS
# from .decode import dynamic_decode #DEFINE_ALIAS
from .decode import gather_tree #DEFINE_ALIAS
# from .input import Input #DEFINE_ALIAS
from .layer.activation import ELU
from .layer.activation import GELU
from .layer.activation import Tanh
from .layer.activation import Hardshrink
from .layer.activation import Hardtanh
from .layer.activation import PReLU
from .layer.activation import ReLU
from .layer.activation import ELU #DEFINE_ALIAS
from .layer.activation import GELU #DEFINE_ALIAS
from .layer.activation import Tanh #DEFINE_ALIAS
from .layer.activation import Hardshrink #DEFINE_ALIAS
from .layer.activation import Hardtanh #DEFINE_ALIAS
from .layer.activation import PReLU #DEFINE_ALIAS
from .layer.activation import ReLU #DEFINE_ALIAS
from .layer.activation import ReLU6 #DEFINE_ALIAS
from .layer.activation import SELU #DEFINE_ALIAS
from .layer.activation import LeakyReLU #DEFINE_ALIAS
......
......@@ -13,18 +13,18 @@
# limitations under the License.
# TODO: define the functions to clip gradient of parameter
from ..fluid.clip import GradientClipByGlobalNorm #DEFINE_ALIAS
from ..fluid.clip import GradientClipByNorm #DEFINE_ALIAS
from ..fluid.clip import GradientClipByValue #DEFINE_ALIAS
from ..fluid.clip import ClipGradByGlobalNorm #DEFINE_ALIAS
from ..fluid.clip import ClipGradByNorm #DEFINE_ALIAS
from ..fluid.clip import ClipGradByValue #DEFINE_ALIAS
from ..fluid.layers import clip #DEFINE_ALIAS
from ..fluid.layers import clip_by_norm #DEFINE_ALIAS
__all__ = [
# 'ErrorClipByValue',
'GradientClipByGlobalNorm',
'GradientClipByNorm',
'GradientClipByValue',
'ClipGradByGlobalNorm',
'ClipGradByNorm',
'ClipGradByValue',
# 'set_gradient_clip',
'clip',
'clip_by_norm'
......