From 994438b10956965a6034e541aa2578ef3dc41ac6 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Fri, 9 Oct 2020 12:28:26 +0800 Subject: [PATCH] change clip grad api, test=develop (#27767) --- python/paddle/fluid/clip.py | 158 +++++++----------- python/paddle/fluid/framework.py | 10 +- python/paddle/fluid/param_attr.py | 29 ++-- .../tests/unittests/test_gradient_clip.py | 66 ++------ python/paddle/nn/__init__.py | 20 +-- python/paddle/nn/clip.py | 12 +- 6 files changed, 121 insertions(+), 174 deletions(-) diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 0e7a9dbea25..505d6fef8fb 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -26,8 +26,8 @@ from . import name_scope from .dygraph import base as imperative_base __all__ = [ - 'set_gradient_clip', 'ErrorClipByValue', 'GradientClipByValue', - 'GradientClipByNorm', 'GradientClipByGlobalNorm' + 'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue', + 'ClipGradByNorm', 'ClipGradByGlobalNorm' ] @@ -115,16 +115,9 @@ def error_clip_callback(block, context): error_clip._append_clip_op(block, grad_n) -class GradientClipBase(object): - def __init__(self, need_clip=None): - if need_clip is not None and not callable(need_clip): - raise TypeError( - "The type of need_clip must be funciton, and it can filter out " - "parameter that does't need gradient clip. This function must return " - "True or False, and True means that clipping is required. Please refer to " - "API documention of GradientClipByGlobalNorm / GradientClipByNorm " - "/GradientClipByValue.") - self._need_clip_func = need_clip +class ClipGradBase(object): + def __init__(self): + super(ClipGradBase, self).__init__() def __str__(self): raise NotImplementedError() @@ -144,7 +137,7 @@ class GradientClipBase(object): if getattr(p, 'gradient_clip_attr', None) is not None: warnings.warn( "'set_gradient_clip' will be ineffective, because you have " - "set 'grad_clip' in 'optimizer'. So, 'set_gradient_clip' " + "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' " "is redundant and you can remove it.") break return self._static_clip(params_grads) @@ -156,7 +149,7 @@ class GradientClipBase(object): raise NotImplementedError() -class GradientClipByValue(GradientClipBase): +class ClipGradByValue(ClipGradBase): """ Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max]. @@ -164,19 +157,20 @@ class GradientClipByValue(GradientClipBase): - Any values greater than max are set to ``max``. - The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip`` - is not None, then only part of gradients can be selected for gradient clipping. + The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. + If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` (for example: :ref:`api_paddle_optimizer_SGD`). + + Note: + ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0. + Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. Args: max (float): The maximum value to clip by. min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max`` automatically. In this case, ``max`` must be greater than 0. - need_clip (function, optional): Type: function. 
This function accepts a ``Parameter`` and returns ``bool`` - (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None, - and gradients of all parameters in the network will be clipped. Examples: .. code-block:: python @@ -184,29 +178,20 @@ class GradientClipByValue(GradientClipBase): import paddle x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(10, 10) + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), + bias_attr=paddle.ParamAttr(need_clip=False)) out = linear(x) loss = paddle.mean(out) loss.backward() - # clip all parameters in network: - clip = paddle.nn.GradientClipByValue(min=-1, max=1) - - # clip a part of parameters in network: (e.g. linear_0.w_0) - # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool - # def fileter_func(ParamBase): - # # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0) - # return ParamBase.name == "linear_0.w_0" - # # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter - # return ParamBase.name == linear.weight.name - # clip = paddle.nn.GradientClipByValue(min=-1, max=1, need_clip=fileter_func) - + clip = paddle.nn.ClipGradByValue(min=-1, max=1) sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) sdg.step() """ - def __init__(self, max, min=None, need_clip=None): - super(GradientClipByValue, self).__init__(need_clip) + def __init__(self, max, min=None): + super(ClipGradByValue, self).__init__() if min is None: assert (max > 0.0) min = -max @@ -214,7 +199,7 @@ class GradientClipByValue(GradientClipBase): self.min = float(min) def __str__(self): - return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max) + return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max) @imperative_base.no_grad def _dygraph_clip(self, params_grads): @@ -222,7 +207,7 @@ class GradientClipByValue(GradientClipBase): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func(p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue new_grad = layers.clip(x=g, min=self.min, max=self.max) @@ -236,8 +221,7 @@ class GradientClipByValue(GradientClipBase): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func( - p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue @@ -256,7 +240,7 @@ class GradientClipByValue(GradientClipBase): return param, new_grad -class GradientClipByNorm(GradientClipBase): +class ClipGradByNorm(ClipGradBase): """ Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` . @@ -264,8 +248,8 @@ class GradientClipByNorm(GradientClipBase): - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done. - The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip`` - is not None, then only part of gradients can be selected for gradient clipping. + The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. 
+ If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` (for example: :ref:`api_paddle_optimizer_SGD`). @@ -287,11 +271,12 @@ class GradientClipByNorm(GradientClipBase): .. math:: norm(X) = ( \\sum_{i=1}^{n}|x\_i|^2)^{ \\frac{1}{2}} + Note: + ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0. + Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. + Args: clip_norm(float): The maximum norm value. - need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool`` - (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None, - and gradients of all parameters in the network will be clipped. Examples: .. code-block:: python @@ -299,29 +284,20 @@ class GradientClipByNorm(GradientClipBase): import paddle x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(10, 10) + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), + bias_attr=paddle.ParamAttr(need_clip=False)) out = linear(x) loss = paddle.mean(out) loss.backward() - # clip all parameters in network: - clip = paddle.nn.GradientClipByNorm(clip_norm=1.0) - - # clip a part of parameters in network: (e.g. linear_0.w_0) - # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool - # def fileter_func(ParamBase): - # # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0) - # return ParamBase.name == "linear_0.w_0" - # # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter - # return ParamBase.name == linear.weight.name - # clip = paddle.nn.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func) - + clip = paddle.nn.ClipGradByNorm(clip_norm=1.0) sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) sdg.step() """ - def __init__(self, clip_norm, need_clip=None): - super(GradientClipByNorm, self).__init__(need_clip) + def __init__(self, clip_norm): + super(ClipGradByNorm, self).__init__() self.clip_norm = float(clip_norm) def __str__(self): @@ -333,7 +309,7 @@ class GradientClipByNorm(GradientClipBase): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func(p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm) @@ -347,8 +323,7 @@ class GradientClipByNorm(GradientClipBase): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func( - p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue @@ -367,7 +342,7 @@ class GradientClipByNorm(GradientClipBase): return param, new_grad -class GradientClipByGlobalNorm(GradientClipBase): +class ClipGradByGlobalNorm(ClipGradBase): """ Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in :math:`t\_list` , and limit it to ``clip_norm`` . @@ -376,8 +351,8 @@ class GradientClipByGlobalNorm(GradientClipBase): - If the global norm is less than or equal to ``clip_norm`` , nothing will be done. 
- The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip`` - is not None, then only part of gradients can be selected for gradient clipping. + The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``. + If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` (for example: :ref:`api_paddle_optimizer_SGD`). @@ -394,12 +369,13 @@ class GradientClipByGlobalNorm(GradientClipBase): global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2} + Note: + ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0. + Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. + Args: clip_norm (float): The maximum norm value. - group_name (str, optional): The group name for this clip. Default value is ``default_group`` - need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool`` - (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None, - and gradients of all parameters in the network will be clipped. + group_name (str, optional): The group name for this clip. Default value is ``default_group``. Examples: .. code-block:: python @@ -407,29 +383,20 @@ class GradientClipByGlobalNorm(GradientClipBase): import paddle x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(10, 10) + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), + bias_attr=paddle.ParamAttr(need_clip=False)) out = linear(x) loss = paddle.mean(out) loss.backward() - # clip all parameters in network: - clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0) - - # clip a part of parameters in network: (e.g. 
linear_0.w_0) - # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool - # def fileter_func(ParamBase): - # # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0) - # return ParamBase.name == "linear_0.w_0" - # # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter - # return ParamBase.name == linear.weight.name - # clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func) - + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) sdg.step() """ - def __init__(self, clip_norm, group_name="default_group", need_clip=None): - super(GradientClipByGlobalNorm, self).__init__(need_clip) + def __init__(self, clip_norm, group_name="default_group"): + super(ClipGradByGlobalNorm, self).__init__() self.clip_norm = float(clip_norm) self.group_name = group_name @@ -443,7 +410,7 @@ class GradientClipByGlobalNorm(GradientClipBase): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func(p): + if getattr(p, 'need_clip', True) is False: continue merge_grad = g if g.type == core.VarDesc.VarType.SELECTED_ROWS: @@ -469,7 +436,7 @@ class GradientClipByGlobalNorm(GradientClipBase): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func(p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue new_grad = layers.elementwise_mul(x=g, y=clip_var) @@ -484,8 +451,7 @@ class GradientClipByGlobalNorm(GradientClipBase): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func( - p): + if getattr(p, 'need_clip', True) is False: continue merge_grad = g with p.block.program._optimized_guard([p, g]): @@ -518,8 +484,7 @@ class GradientClipByGlobalNorm(GradientClipBase): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func( - p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue @@ -670,9 +635,9 @@ def set_gradient_clip(clip, param_list=None, program=None): "This method can reduce the mistakes, please " "refer to documention of 'optimizer'.") - if not isinstance(clip, GradientClipBase): + if not isinstance(clip, ClipGradBase): raise TypeError( - "'clip' should be an instance of GradientClipBase's derived class") + "'clip' should be an instance of ClipGradBase's derived class") if program is None: program = framework.default_main_program() @@ -708,7 +673,7 @@ def append_gradient_clip_ops(param_grads): clip_attr = getattr(p, 'gradient_clip_attr', None) if clip_attr is None: return param_grads - if not isinstance(clip_attr, GradientClipBase): + if not isinstance(clip_attr, ClipGradBase): raise TypeError( "clip attribute should be an instance of GradientClipBase") @@ -754,6 +719,7 @@ def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict): op._set_attr('op_role_var', correct_p_g) -ClipByValue = GradientClipByValue -ClipByNorm = GradientClipByNorm -ClipByGlobalNorm = GradientClipByGlobalNorm +GradientClipBase = ClipGradBase +GradientClipByValue = ClipGradByValue +GradientClipByNorm = ClipGradByNorm +GradientClipByGlobalNorm = ClipGradByGlobalNorm diff --git a/python/paddle/fluid/framework.py 
b/python/paddle/fluid/framework.py index a1cf11364f8..52c1e5d5e16 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -5123,6 +5123,8 @@ class Parameter(Variable): be applied on the parameter. Default: None do_model_average(bool): True if the model average strategy will be applied on this parameter. + need_clip (bool): Whether the parameter gradient need to be cliped + in optimizer. Default is True. """ def __init__(self, @@ -5162,6 +5164,8 @@ class Parameter(Variable): self.do_model_average = kwargs.get('do_model_average', None) + self.need_clip = kwargs.get('need_clip', True) + self.is_distributed = False def __str__(self): @@ -5194,7 +5198,7 @@ class Parameter(Variable): if with_details: res_str = Variable.to_string(self, throw_on_error, True) additional_attr = ("trainable", "optimize_attr", "regularizer", - "do_model_average") + "do_model_average", "need_clip") for attr_name in additional_attr: res_str += "%s: %s\n" % (attr_name, cpt.to_text(getattr(self, attr_name))) @@ -5226,6 +5230,8 @@ class ParamBase(core.VarBase): be applied on the ParamBase. Default: None do_model_average(bool): True if the model average strategy will be applied on this ParamBase. + need_clip (bool): Whether the parameter gradient need to be cliped + in optimizer. Default is True. """ @dygraph_only @@ -5265,6 +5271,8 @@ class ParamBase(core.VarBase): self.do_model_average = kwargs.get('do_model_average', None) + self.need_clip = kwargs.get('need_clip', True) + self.is_distributed = False # self.block = default_main_program().global_block() diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 83f54fc8208..bf042393706 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -36,8 +36,8 @@ class ParamAttr(object): Note: ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0. - It is recommended to set ``grad_clip`` in ``optimizer`` to clip gradient. - There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` , + Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. + There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` . Parameters: @@ -57,6 +57,7 @@ class ParamAttr(object): trainable (bool): Whether this parameter is trainable. Default True. do_model_average (bool): Whether this parameter should do model average when model average is enabled. Default False. + need_clip (bool): Whether the parameter gradient need to be cliped in optimizer. Default is True. Examples: .. 
code-block:: python @@ -78,7 +79,8 @@ class ParamAttr(object): learning_rate=1.0, regularizer=None, trainable=True, - do_model_average=True): + do_model_average=True, + need_clip=True): if sys.version_info.major == 2: check_type(name, "name", (str, type(None), unicode), "ParamAttr") @@ -87,6 +89,7 @@ class ParamAttr(object): check_type(learning_rate, "learning_rate", (float, int), "ParamAttr") check_type(trainable, "trainable", (bool), "ParamAttr") check_type(do_model_average, "do_model_average", (bool), "ParamAttr") + check_type(need_clip, "need_clip", (bool), "ParamAttr") check_type(initializer, "initializer", (Initializer, type(None)), "ParamAttr") check_type(regularizer, "regularizer", @@ -101,6 +104,7 @@ class ParamAttr(object): self.regularizer = regularizer self.trainable = trainable self.do_model_average = do_model_average + self.need_clip = need_clip def _set_default_initializer(self, initializer): """ @@ -197,7 +201,8 @@ class ParamAttr(object): }, 'regularizer': self.regularizer, 'trainable': self.trainable, - 'do_model_average': self.do_model_average + 'do_model_average': self.do_model_average, + 'need_clip': self.need_clip } if with_initializer: kwargs['initializer'] = self.initializer @@ -219,9 +224,9 @@ class WeightNormParamAttr(ParamAttr): `_. Note: - ``gradient_clip`` of ``WeightNormParamAttr`` HAS BEEN DEPRECATED since 2.0. - It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient. - There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` , + ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0. + Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. + There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` . @@ -248,6 +253,7 @@ class WeightNormParamAttr(ParamAttr): trainable(bool, optional): Whether this parameter is trainable. Default True. do_model_average(bool, optional): Whether this parameter should do model average. Default False. + need_clip (bool, optional): Whether the parameter gradient need to be cliped in optimizer. Default is True. Examples: .. code-block:: python @@ -267,7 +273,8 @@ class WeightNormParamAttr(ParamAttr): learning_rate=1.0, regularizer=paddle.regularizer.L2Decay(0.1), trainable=True, - do_model_average=False)) + do_model_average=False, + need_clip=True)) """ # List to record the parameters reparameterized by weight normalization. 
@@ -283,12 +290,14 @@ class WeightNormParamAttr(ParamAttr): learning_rate=1.0, regularizer=None, trainable=True, - do_model_average=False): + do_model_average=False, + need_clip=True): super(WeightNormParamAttr, self).__init__( name=name, initializer=initializer, learning_rate=learning_rate, regularizer=regularizer, trainable=trainable, - do_model_average=do_model_average) + do_model_average=do_model_average, + need_clip=need_clip) self.dim = dim diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index cc54e680c75..f258e830b5f 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -185,12 +185,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip): # invoke 'set_gradient_clip' in a wrong order def test_wrong_API_order(self): def backward_func(cost): - # no clip gradient - def fileter_func(param): - return param.name == "fc.w_0" - - clip = fluid.clip.GradientClipByGlobalNorm( - clip_norm=5.0, need_clip=fileter_func) + clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0) fluid.clip.set_gradient_clip(clip) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01, grad_clip=clip) @@ -205,11 +200,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip): # if grad is None or not need clip def test_none_grad(self): - def fileter_func(param): - return param.name == "x" - - clip = fluid.clip.GradientClipByGlobalNorm( - self.clip_norm, need_clip=fileter_func) + clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm) x = fluid.default_main_program().global_block().create_parameter( name="x", shape=[2, 3], dtype="float32") y = fluid.default_main_program().global_block().create_parameter( @@ -228,11 +219,6 @@ class TestGradientClipByGlobalNorm(TestGradientClip): # raise typeError def test_tpyeError(self): - # the type of need_clip must be an funciton - with self.assertRaises(TypeError): - clip = fluid.clip.GradientClipByGlobalNorm( - clip_norm=self.clip_norm, need_clip="test") - # the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class with self.assertRaises(TypeError): sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1, @@ -264,26 +250,22 @@ class TestGradientClipByNorm(TestGradientClip): # if grad is None or not need clip def test_none_grad(self): - def fileter_func(param): - return param.name == "z" - - clip = fluid.clip.GradientClipByNorm( - self.clip_norm, need_clip=fileter_func) + clip = fluid.clip.GradientClipByNorm(self.clip_norm) x = fluid.default_main_program().global_block().create_parameter( - name="x", shape=[2, 3], dtype="float32") + name="x", shape=[2, 3], dtype="float32", need_clip=False) y = fluid.default_main_program().global_block().create_parameter( - name="y", shape=[2, 3], dtype="float32") + name="y", shape=[2, 3], dtype="float32", need_clip=False) # (x, None) should not be returned params_grads = [(x, None), (x, y)] params_grads = clip(params_grads) self.assertTrue( len(clip(params_grads)) == 1, - "ClipByNorm: when grad is None, it shouldn't be returned by gradient clip!" + "ClipGradByNorm: when grad is None, it shouldn't be returned by gradient clip!" 
) self.assertTrue( params_grads[0][1].name == 'y', - "ClipByNorm: grad should not be clipped when filtered out!") + "ClipGradByNorm: grad should not be clipped when filtered out!") class TestGradientClipByValue(TestGradientClip): @@ -312,26 +294,22 @@ class TestGradientClipByValue(TestGradientClip): # if grad is None or not need clip def test_none_grad(self): - def fileter_func(param): - return param.name == "z" - - clip = fluid.clip.GradientClipByValue( - self.max, self.min, need_clip=fileter_func) + clip = fluid.clip.GradientClipByValue(self.max, self.min) x = fluid.default_main_program().global_block().create_parameter( - name="x", shape=[2, 3], dtype="float32") + name="x", shape=[2, 3], dtype="float32", need_clip=False) y = fluid.default_main_program().global_block().create_parameter( - name="y", shape=[2, 3], dtype="float32") + name="y", shape=[2, 3], dtype="float32", need_clip=False) # (x, None) should not be returned params_grads = [(x, None), (x, y)] params_grads = clip(params_grads) self.assertTrue( len(clip(params_grads)) == 1, - "ClipByValue: when grad is None, it shouldn't be returned by gradient clip!" + "ClipGradByValue: when grad is None, it shouldn't be returned by gradient clip!" ) self.assertTrue( params_grads[0][1].name == 'y', - "ClipByValue: grad should not be clipped when filtered out!") + "ClipGradByValue: grad should not be clipped when filtered out!") class TestDygraphGradientClip(unittest.TestCase): @@ -355,13 +333,9 @@ class TestDygraphGradientClip(unittest.TestCase): class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip): def setUp(self): - # only clip gradient of x (ParamBase) - def fileter_func(param): - return param.name == "x" - self.clip_norm = 0.8 self.clip1 = fluid.clip.GradientClipByGlobalNorm( - clip_norm=self.clip_norm, need_clip=fileter_func) + clip_norm=self.clip_norm) self.clip2 = fluid.clip.GradientClipByGlobalNorm( clip_norm=self.clip_norm) @@ -401,13 +375,8 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip): class TestDygraphGradientClipByNorm(TestDygraphGradientClip): def setUp(self): - # only clip gradient of linear_0.w_0 (ParamBase) - def fileter_func(param): - return param.name == "linear_0.w_0" - self.clip_norm = 0.8 - self.clip = fluid.clip.GradientClipByNorm( - clip_norm=self.clip_norm, need_clip=fileter_func) + self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm) def check_clip_result(self, loss, optimizer): # if grad is None @@ -435,14 +404,9 @@ class TestDygraphGradientClipByNorm(TestDygraphGradientClip): class TestDygraphGradientClipByValue(TestDygraphGradientClip): def setUp(self): - # only clip gradient of linear_0.w_0 (ParamBase) - def fileter_func(param): - return param.name == "linear_0.w_0" - self.max = 0.2 self.min = 0.1 - self.clip = fluid.clip.GradientClipByValue( - max=self.max, min=self.min, need_clip=fileter_func) + self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min) def check_clip_result(self, loss, optimizer): # if grad is None diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 2452f196987..82fec5c0faa 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -31,9 +31,9 @@ __all__ += rnn.__all__ __all__ += weight_norm_hook.__all__ # TODO: define alias in nn directory -from .clip import GradientClipByGlobalNorm #DEFINE_ALIAS -from .clip import GradientClipByNorm #DEFINE_ALIAS -from .clip import GradientClipByValue #DEFINE_ALIAS +from .clip import ClipGradByGlobalNorm #DEFINE_ALIAS +from .clip import 
ClipGradByNorm #DEFINE_ALIAS +from .clip import ClipGradByValue #DEFINE_ALIAS # from .clip import set_gradient_clip #DEFINE_ALIAS from .clip import clip #DEFINE_ALIAS from .clip import clip_by_norm #DEFINE_ALIAS @@ -51,13 +51,13 @@ from .decode import beam_search_decode #DEFINE_ALIAS # from .decode import dynamic_decode #DEFINE_ALIAS from .decode import gather_tree #DEFINE_ALIAS # from .input import Input #DEFINE_ALIAS -from .layer.activation import ELU -from .layer.activation import GELU -from .layer.activation import Tanh -from .layer.activation import Hardshrink -from .layer.activation import Hardtanh -from .layer.activation import PReLU -from .layer.activation import ReLU +from .layer.activation import ELU #DEFINE_ALIAS +from .layer.activation import GELU #DEFINE_ALIAS +from .layer.activation import Tanh #DEFINE_ALIAS +from .layer.activation import Hardshrink #DEFINE_ALIAS +from .layer.activation import Hardtanh #DEFINE_ALIAS +from .layer.activation import PReLU #DEFINE_ALIAS +from .layer.activation import ReLU #DEFINE_ALIAS from .layer.activation import ReLU6 #DEFINE_ALIAS from .layer.activation import SELU #DEFINE_ALIAS from .layer.activation import LeakyReLU #DEFINE_ALIAS diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py index a50dad628cf..9fd1241bd83 100644 --- a/python/paddle/nn/clip.py +++ b/python/paddle/nn/clip.py @@ -13,18 +13,18 @@ # limitations under the License. # TODO: define the functions to clip gradient of parameter -from ..fluid.clip import GradientClipByGlobalNorm #DEFINE_ALIAS -from ..fluid.clip import GradientClipByNorm #DEFINE_ALIAS -from ..fluid.clip import GradientClipByValue #DEFINE_ALIAS +from ..fluid.clip import ClipGradByGlobalNorm #DEFINE_ALIAS +from ..fluid.clip import ClipGradByNorm #DEFINE_ALIAS +from ..fluid.clip import ClipGradByValue #DEFINE_ALIAS from ..fluid.layers import clip #DEFINE_ALIAS from ..fluid.layers import clip_by_norm #DEFINE_ALIAS __all__ = [ # 'ErrorClipByValue', - 'GradientClipByGlobalNorm', - 'GradientClipByNorm', - 'GradientClipByValue', + 'ClipGradByGlobalNorm', + 'ClipGradByNorm', + 'ClipGradByValue', # 'set_gradient_clip', 'clip', 'clip_by_norm' -- GitLab
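
For reference, the snippet below pulls together the migrated usage that the new docstrings in this patch describe: the renamed paddle.nn.ClipGradByGlobalNorm class combined with the new need_clip flag on paddle.ParamAttr. It is a minimal sketch assembled from the docstring examples above; the layer sizes, clip_norm, and learning rate are illustrative values, not requirements of the API.

    import paddle

    # need_clip now lives on ParamAttr: the weight keeps the default
    # need_clip=True and will be clipped, while the bias opts out.
    linear = paddle.nn.Linear(
        in_features=10,
        out_features=10,
        weight_attr=paddle.ParamAttr(need_clip=True),
        bias_attr=paddle.ParamAttr(need_clip=False))

    x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
    loss = paddle.mean(linear(x))
    loss.backward()

    # The renamed clip class no longer accepts a need_clip callable;
    # which gradients get clipped is decided per parameter via ParamAttr.
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
    sgd = paddle.optimizer.SGD(learning_rate=0.1,
                               parameters=linear.parameters(),
                               grad_clip=clip)
    sgd.step()

Attaching the flag to ParamAttr keeps the clipping decision next to the parameter definition instead of in a filter function passed to the clip object, which is what the deprecation notes added in clip.py point users toward.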