From 7fda333ac1a5bcb1d1b1557562885edb69fb0f02 Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Thu, 2 Apr 2020 14:09:31 +0800 Subject: [PATCH] add new method of gradient_clip, better to use,test=develop (#23224) --- python/paddle/fluid/__init__.py | 2 - python/paddle/fluid/clip.py | 306 ++++++++++--- python/paddle/fluid/dygraph_grad_clip.py | 289 ------------ python/paddle/fluid/framework.py | 15 +- python/paddle/fluid/optimizer.py | 57 ++- python/paddle/fluid/param_attr.py | 6 +- .../tests/unittests/test_dist_transpiler.py | 11 +- .../unittests/test_grad_clip_minimize.py | 14 +- .../tests/unittests/test_gradient_clip.py | 417 +++++++++++++++--- .../unittests/test_imperative_auto_prune.py | 4 +- .../test_imperative_selected_rows.py | 5 +- 11 files changed, 665 insertions(+), 461 deletions(-) delete mode 100644 python/paddle/fluid/dygraph_grad_clip.py diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 8546b313986..04a70c5282c 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -75,7 +75,6 @@ from .transpiler import DistributeTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig from .lod_tensor import create_lod_tensor, create_random_int_lodtensor from . import clip -from . import dygraph_grad_clip from . import profiler from . import unique_name from . import parallel_executor @@ -122,7 +121,6 @@ __all__ = framework.__all__ + executor.__all__ + \ 'WeightNormParamAttr', 'DataFeeder', 'clip', - 'dygraph_grad_clip', 'profiler', 'unique_name', 'Scope', diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index cc2bbf5071f..8647c2576ef 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -16,19 +16,18 @@ from __future__ import print_function import copy import six +import warnings import functools from . import layers from . import framework from . import core from . import name_scope +from .dygraph import base as imperative_base __all__ = [ - 'set_gradient_clip', - 'ErrorClipByValue', - 'GradientClipByValue', - 'GradientClipByNorm', - 'GradientClipByGlobalNorm', + 'set_gradient_clip', 'ErrorClipByValue', 'GradientClipByValue', + 'GradientClipByNorm', 'GradientClipByGlobalNorm' ] @@ -116,29 +115,51 @@ def error_clip_callback(block, context): error_clip._append_clip_op(block, grad_n) -class BaseGradientClipAttr(object): +class GradientClipBase(object): + def __init__(self, need_clip=None): + if need_clip is not None and not callable(need_clip): + raise TypeError( + "The type of need_clip must be funciton, and it can filter out " + "parameter that does't need gradient clip. This function must return " + "True or False, and True means that clipping is required. Please refer to " + "API documention of GradientClipByGlobalNorm / GradientClipByNorm " + "/GradientClipByValue.") + self._need_clip_func = need_clip + def __str__(self): raise NotImplementedError() - def _process_context(self, context, param, grad): - raise NotImplementedError() - - def _create_operators(self, param, grad): - raise NotImplementedError() + @imperative_base.no_grad + def _dygraph_clip(self, params_grads): + raise NotImplementedError + def _static_clip(self, params_grads): + raise NotImplementedError -class NullGradientClipAttr(BaseGradientClipAttr): - def __str__(self): - return "Null" + def __call__(self, params_grads): + assert len( + params_grads + ) > 0, "The number of trainable parameters should be greater than 0." 
+ if framework.in_dygraph_mode(): + return self._dygraph_clip(params_grads) + else: + for p, g in params_grads: + if getattr(p, 'gradient_clip_attr', None) is not None: + warnings.warn( + "'set_gradient_clip' will be ineffective, because you have " + "pass 'grad_clip' into 'minimize'. So, 'set_gradient_clip' " + "is redundant and you can remove it.") + break + return self._static_clip(params_grads) def _process_context(self, context, param, grad): - pass + raise NotImplementedError() def _create_operators(self, param, grad): - return param, grad + raise NotImplementedError() -class GradientClipByValue(BaseGradientClipAttr): +class GradientClipByValue(GradientClipBase): """ Clips gradient values to the range [min, max]. @@ -168,17 +189,46 @@ class GradientClipByValue(BaseGradientClipAttr): input=x, size=1, param_attr=w_param_attrs) """ - def __init__(self, max, min=None): - max = float(max) + def __init__(self, max, min=None, need_clip=None): + super(GradientClipByValue, self).__init__(need_clip) if min is None: + assert (max > 0.0) min = -max - else: - min = float(min) - self.max = max - self.min = min + self.max = float(max) + self.min = float(min) def __str__(self): - return "ByValue, min=%f, max=%f" % (self.min, self.max) + return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max) + + @imperative_base.no_grad + def _dygraph_clip(self, params_grads): + params_and_grads = [] + for p, g in params_grads: + if g is None: + continue + if self._need_clip_func is not None and not self._need_clip_func(p): + params_and_grads.append((p, g)) + continue + new_grad = layers.clip(x=g, min=self.min, max=self.max) + params_and_grads.append((p, new_grad)) + return params_and_grads + + def _static_clip(self, params_grads): + params_and_grads = [] + with framework.name_scope('gradient_clip'): + for p, g in params_grads: + if g is None: + continue + if self._need_clip_func is not None and not self._need_clip_func( + p): + params_and_grads.append((p, g)) + continue + + with p.block.program._optimized_guard([p, g]): + new_grad = layers.clip(x=g, min=self.min, max=self.max) + params_and_grads.append((p, new_grad)) + _correct_clip_op_role_var(params_and_grads) + return params_and_grads def _process_context(self, context, param, grad): pass @@ -188,7 +238,7 @@ class GradientClipByValue(BaseGradientClipAttr): return param, new_grad -class GradientClipByNorm(BaseGradientClipAttr): +class GradientClipByNorm(GradientClipBase): """ Convert the input multidimensional Tensor :math:`X` to a multidimensional Tensor whose L2 norm does not exceed the given two-norm maximum ( :math:`clip\_norm` ). 
@@ -268,11 +318,42 @@ class GradientClipByNorm(BaseGradientClipAttr): """ - def __init__(self, clip_norm): - self.clip_norm = clip_norm + def __init__(self, clip_norm, need_clip=None): + super(GradientClipByNorm, self).__init__(need_clip) + self.clip_norm = float(clip_norm) def __str__(self): - return "ByNorm, clip_norm=%f" % self.clip_norm + return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm + + @imperative_base.no_grad + def _dygraph_clip(self, params_grads): + params_and_grads = [] + for p, g in params_grads: + if g is None: + continue + if self._need_clip_func is not None and not self._need_clip_func(p): + params_and_grads.append((p, g)) + continue + new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm) + params_and_grads.append((p, new_grad)) + return params_and_grads + + def _static_clip(self, params_grads): + params_and_grads = [] + with framework.name_scope('gradient_clip'): + for p, g in params_grads: + if g is None: + continue + if self._need_clip_func is not None and not self._need_clip_func( + p): + params_and_grads.append((p, g)) + continue + + with p.block.program._optimized_guard([p, g]): + new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm) + params_and_grads.append((p, new_grad)) + _correct_clip_op_role_var(params_and_grads) + return params_and_grads def _process_context(self, context, param, grad): pass @@ -282,7 +363,7 @@ class GradientClipByNorm(BaseGradientClipAttr): return param, new_grad -class GradientClipByGlobalNorm(BaseGradientClipAttr): +class GradientClipByGlobalNorm(GradientClipBase): """ Clips values of multiple tensors by the ratio of the sum of their norms. @@ -371,16 +452,104 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): """ - def __init__(self, clip_norm, group_name="default_group"): - if not isinstance(group_name, six.string_types): - raise TypeError("'group_name' must be a %s." 
% (six.string_types)) - - self.clip_norm = clip_norm + def __init__(self, clip_norm, group_name="default_group", need_clip=None): + super(GradientClipByGlobalNorm, self).__init__(need_clip) + self.clip_norm = float(clip_norm) self.group_name = group_name def __str__(self): - return "ByGlobalNorm, group_name=%s, clip_norm=%f" % (self.group_name, - self.clip_norm) + return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm) + + @imperative_base.no_grad + def _dygraph_clip(self, params_grads): + params_and_grads = [] + sum_square_list = [] + for p, g in params_grads: + if g is None: + continue + if self._need_clip_func is not None and not self._need_clip_func(p): + continue + merge_grad = g + if g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = layers.merge_selected_rows(g) + merge_grad = layers.get_tensor_from_selected_rows(merge_grad) + square = layers.square(merge_grad) + sum_square = layers.reduce_sum(square) + sum_square_list.append(sum_square) + + # all parameters have been filterd out + if len(sum_square_list) == 0: + return params_grads + + global_norm_var = layers.concat(sum_square_list) + global_norm_var = layers.reduce_sum(global_norm_var) + global_norm_var = layers.sqrt(global_norm_var) + max_global_norm = layers.fill_constant( + shape=[1], dtype='float32', value=self.clip_norm) + clip_var = layers.elementwise_div( + x=max_global_norm, + y=layers.elementwise_max( + x=global_norm_var, y=max_global_norm)) + for p, g in params_grads: + if g is None: + continue + if self._need_clip_func is not None and not self._need_clip_func(p): + params_and_grads.append((p, g)) + continue + new_grad = layers.elementwise_mul(x=g, y=clip_var) + params_and_grads.append((p, new_grad)) + + return params_and_grads + + def _static_clip(self, params_grads): + params_and_grads = [] + sum_square_list = [] + with framework.name_scope('gradient_clip'): + for p, g in params_grads: + if g is None: + continue + if self._need_clip_func is not None and not self._need_clip_func( + p): + continue + merge_grad = g + with p.block.program._optimized_guard([p, g]): + if g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = layers.merge_selected_rows(g) + merge_grad = layers.get_tensor_from_selected_rows( + merge_grad) + + square = layers.square(merge_grad) + sum_square = layers.reduce_sum(input=square) + sum_square_list.append(sum_square) + + # all parameters have been filterd out + if len(sum_square_list) == 0: + return params_grads + + with p.block.program._optimized_guard([p, g]): + global_norm_var = layers.sums(sum_square_list) + global_norm_var = layers.sqrt(x=global_norm_var) + max_global_norm = layers.fill_constant( + shape=[1], dtype="float32", value=self.clip_norm) + scale_var = layers.elementwise_div( + x=max_global_norm, + y=layers.elementwise_max( + x=max_global_norm, y=global_norm_var)) + + for p, g in params_grads: + if g is None: + continue + if self._need_clip_func is not None and not self._need_clip_func( + p): + params_and_grads.append((p, g)) + continue + + with p.block.program._optimized_guard([p, g]): + new_grad = layers.elementwise_mul(x=g, y=scale_var) + params_and_grads.append((p, new_grad)) + + _correct_clip_op_role_var(params_and_grads) + return params_and_grads def _process_context(self, context, param, grad): if self.group_name not in context: @@ -486,12 +655,28 @@ def set_gradient_clip(clip, param_list=None, program=None): sgd = fluid.optimizer.SGD(learning_rate=1e-3) sgd.minimize(loss) """ - if not isinstance(clip, BaseGradientClipAttr): + 
warnings.warn("Caution! 'set_gradient_clip' is not recommended " + "and may be deprecated in future! " + "We recommend a new strategy: clip gradient by " + "'optimizer.minimize(loss, grad_clip=clip)'. " + "This method can reduce the mistakes, please " + "see documention of 'optimzier.minimize'.") + + if not isinstance(clip, GradientClipBase): raise TypeError( - "'clip' should be an instance of BaseGradientClipAttr's derived class" - ) + "'clip' should be an instance of GradientClipBase's derived class") if program is None: program = framework.default_main_program() + + for op in program.block(0).ops: + if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr( + "op_namescope"): + warnings.warn( + "'minimize' has been invoked before, this will make 'set_gradient_clip' " + "be ineffective! Please invoke 'set_gradient_clip' before 'minimize'." + ) + break + if param_list is None: param_list = program.block(0).all_parameters() if all(isinstance(elem, six.string_types) for elem in param_list): @@ -511,46 +696,45 @@ def append_gradient_clip_ops(param_grads): if g is None: continue with p.block.program._optimized_guard( - [p, g]), framework.name_scope('append_clip_@CLIP'): - clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr()) + [p, g]), framework.name_scope('gradient_clip_@CLIP'): + clip_attr = getattr(p, 'gradient_clip_attr', None) if clip_attr is None: - clip_attr = NullGradientClipAttr() - if not isinstance(clip_attr, BaseGradientClipAttr): + return param_grads + if not isinstance(clip_attr, GradientClipBase): raise TypeError( - "clip attribute should be an instance of BaseGradientClipAttr" - ) + "clip attribute should be an instance of GradientClipBase") clip_attr._process_context(context=context, param=p, grad=g) res = [] - param_new_grad_dict = dict() for p, g in param_grads: if g is None: continue with p.block.program._optimized_guard( - [p, g]), framework.name_scope('append_graident_clip_@CLIP'): + [p, g]), framework.name_scope('graident_clip_@CLIP'): param, new_grad = clip_attr._create_operators(param=p, grad=g) - param_new_grad_dict[param.name] = new_grad.name res.append([param, new_grad]) - # change wrong mapping relation between param & grad in clip op - clip_flag = '@CLIP' - block_id_list = [] - for p, g in param_grads: - if g is None: - continue - block_id = p.block.idx - if block_id in block_id_list: + _correct_clip_op_role_var(res) + return res + + +# change wrong mapping relation between param & grad in clip op +def _correct_clip_op_role_var(params_grads): + for param, grad in params_grads: + if grad is None: continue - block_id_list.append(block_id) - for op in p.block.program.global_block().ops: - if 'op_namescope' in op.all_attrs() and clip_flag in op.attr( + for op in param.block.program.global_block().ops: + if 'op_namescope' in op.all_attrs() and "gradient_clip" in op.attr( "op_namescope"): if op.attr('op_role_var'): param_name = op.attr('op_role_var')[0] - correct_p_g = [param_name, param_new_grad_dict[param_name]] + index = 0 + for i in range(len(params_grads)): + if params_grads[i][0].name == param_name: + index = i + correct_p_g = [param_name, params_grads[index][1].name] op._set_attr('op_role_var', correct_p_g) - return res ClipByValue = GradientClipByValue diff --git a/python/paddle/fluid/dygraph_grad_clip.py b/python/paddle/fluid/dygraph_grad_clip.py deleted file mode 100644 index c90795e09f9..00000000000 --- a/python/paddle/fluid/dygraph_grad_clip.py +++ /dev/null @@ -1,289 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import copy -import six - -import functools - -from . import layers -from . import framework -from . import core -from .dygraph import base as imperative_base - -__all__ = [ - 'GradClipByValue', - 'GradClipByNorm', - 'GradClipByGlobalNorm', -] - - -class GradClipBase(object): - def __str__(self): - raise NotImplementedError() - - def _clip(self, para_and_grad): - raise NotImplementedError - - @imperative_base.no_grad - def __call__(self, para_and_grad): - return self._clip(para_and_grad) - - -class GradClipByValue(GradClipBase): - """ - Clips gradient values to the range [min_value, max_value]. - - Given a gradient g, this operation clips its value to min_value and max_value. - - - Any values less than min_value are set to min_value. - - Any values greater than max_value are set to max_value. - - Args: - max_value (float): The maximum value to clip by. - min (float, optional): The minimum value to clip by. if not set by user, \ - will be set to -max_value(max_value MUST be positive) by framework. - - Examples: - .. code-block:: python - - import numpy as np - import paddle - import paddle.fluid as fluid - - from paddle.fluid.dygraph.base import to_variable - from paddle.fluid.dygraph.nn import Linear - - from paddle.fluid.clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm - - from paddle.fluid.optimizer import SGDOptimizer - - with fluid.dygraph.guard(): - value_clip = GradClipByValue( -1.0, 1.0 ) - sgd = SGDOptimizer(learning_rate=1.0) - - init_value = np.random.uniform( -1, 1, (10, 10)).astype('float32') - - linear = Linear( 10, 10) - - out = linear( to_variable(init_value) ) - - loss = fluid.layers.reduce_mean( out ) - - loss.backward() - sgd.minimize(loss, grad_clip = value_clip) - - """ - - @imperative_base.no_grad - def __init__(self, min_value, max_value=None): - - if min_value is None: - assert (max_value > 0.0) - min_value = -max_value - else: - min_value = float(min_value) - self.max_value = max_value - self.min_value = min_value - - def __str__(self): - return "ClipByValue, min = %f, max=%f" % (self.min_value, - self.max_value) - - def _clip(self, para_and_grad): - out = [] - for p, g in para_and_grad: - if g is None: - out.append((p, g)) - continue - - new_grad = layers.clip(x=g, min=self.min_value, max=self.max_value) - - out.append((p, new_grad)) - - return out - - -class GradClipByNorm(GradClipBase): - """ - Clips tensor values to a maximum L2-norm. - - This operator limits the L2 norm of the input :math:`X` within :math:`max\_norm`. - If the L2 norm of :math:`X` is less than or equal to :math:`max\_norm`, :math:`Out` - will be the same as :math:`X`. If the L2 norm of :math:`X` is greater than - :math:`max\_norm`, :math:`X` will be linearly scaled to make the L2 norm of - :math:`Out` equal to :math:`max\_norm`, as shown in the following formula: - - .. 
math:: - - Out = \\frac{max\_norm * X}{norm(X)}, - - where :math:`norm(X)` represents the L2 norm of :math:`X`. - - Args: - clip_norm (float): The maximum norm value - - Examples: - .. code-block:: python - - import numpy as np - import paddle - import paddle.fluid as fluid - - from paddle.fluid.dygraph.base import to_variable - from paddle.fluid.dygraph.nn import Linear - - from paddle.fluid.clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm - - from paddle.fluid.optimizer import SGDOptimizer - - with fluid.dygraph.guard(): - norm_clip = GradClipByNorm( 5.0 ) - sgd = SGDOptimizer(learning_rate=1.0) - - init_value = np.random.uniform( -1, 1, (10, 10)).astype('float32') - - linear = Linear( 10, 10) - - out = linear( to_variable(init_value) ) - - loss = fluid.layers.reduce_mean( out ) - - loss.backward() - sgd.minimize(loss, grad_clip = norm_clip) - - """ - - @imperative_base.no_grad - def __init__(self, clip_norm): - self.clip_norm = clip_norm - - def __str__(self): - return "ClipByNorm, clip_norm=%f" % self.clip_norm - - def _clip(self, para_and_grad): - out = [] - - for p, g in para_and_grad: - if g is None: - out.append((p, g)) - continue - new_g = layers.clip_by_norm(x=g, max_norm=self.clip_norm) - - out.append((p, new_g)) - - return out - - -class GradClipByGlobalNorm(GradClipBase): - """ - Clips values of multiple tensors by the ratio of the sum of their norms. - - Given a list of tensors t_list, and a clipping ratio max_global_norm, this - operation returns a list of clipped tensors list_clipped. - - To perform the clipping, the values :math:`t\_list[i]` are set to: - - .. math:: - - t\_list[i] = t\_list[i] * \\frac{max\_global\_norm}{\max(global\_norm, max\_global\_norm)} - - where: - - .. math:: - - global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2} - - If :math:`max\_global\_norm > global\_norm` then the entries in t_list remain as they are, - otherwise they're all shrunk by the global ratio. - - Args: - max_global_norm (float): The maximum norm value. - dtype (str, optional): The type of max_global_norm. Default: "float32". - - Examples: - .. 
code-block:: python - - import numpy as np - import paddle - import paddle.fluid as fluid - - from paddle.fluid.dygraph.base import to_variable - from paddle.fluid.dygraph.nn import Linear - - from paddle.fluid.dygraph_grad_clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm - - from paddle.fluid.optimizer import SGDOptimizer - - with fluid.dygraph.guard(): - gloabl_norm_clip = GradClipByGlobalNorm( 5.0 ) - sgd = SGDOptimizer(learning_rate=1.0) - - init_value = np.random.uniform( -1, 1, (10, 10)).astype('float32') - - linear = Linear( 10, 10) - - out = linear( to_variable(init_value) ) - - loss = fluid.layers.reduce_mean( out ) - - loss.backward() - sgd.minimize(loss, grad_clip = gloabl_norm_clip) - - - """ - - @imperative_base.no_grad - def __init__(self, max_global_norm, dtype='float32'): - self.max_global_norm = layers.fill_constant( - shape=[1], dtype=dtype, value=max_global_norm) - - def __str__(self): - return "ClipByGlobalNorm, max_global_norm=%f" % (self.max_global_norm) - - def _clip(self, para_and_grad): - - out = [] - - norm_arr = [] - for p, g in para_and_grad: - if g is None: - continue - merge_grad = g - if g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(g) - merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - power = layers.square(merge_grad) - sum_t = layers.reduce_sum(power) - norm_arr.append(sum_t) - - norm_global = layers.concat(norm_arr) - norm_global = layers.reduce_sum(norm_global) - norm_global = layers.sqrt(norm_global) - - clip_scale = self.max_global_norm / (layers.elementwise_max( - x=norm_global, y=self.max_global_norm)) - - for p, g in para_and_grad: - if g is None: - out.append((p, g)) - continue - - new_grad = g * clip_scale - - out.append((p, new_grad)) - - return out diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 5dd2e0fe53c..54433e5c079 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2409,7 +2409,6 @@ class Block(object): trainable = v.trainable optimize_attr = v.optimize_attr regularizer = v.regularizer - gradient_clip_attr = v.gradient_clip_attr error_clip = v.error_clip elif type(v) == Variable: var_type = "Variable" @@ -2432,7 +2431,6 @@ class Block(object): trainable=trainable, optimize_attr=optimize_attr, regularizer=regularizer, - gradient_clip_attr=gradient_clip_attr, error_clip=error_clip) else: var = Parameter( @@ -2445,7 +2443,6 @@ class Block(object): trainable=trainable, optimize_attr=optimize_attr, regularizer=regularizer, - gradient_clip_attr=gradient_clip_attr, error_clip=error_clip) elif var_type == "Variable": var = Variable( @@ -2723,7 +2720,6 @@ class Block(object): trainable=p.trainable, optimize_attr=p.optimize_attr, regularizer=p.regularizer, - gradient_clip_attr=p.gradient_clip_attr, error_clip=p.error_clip, name=v.name) else: @@ -2737,7 +2733,6 @@ class Block(object): trainable=p.trainable, optimize_attr=p.optimize_attr, regularizer=p.regularizer, - gradient_clip_attr=p.gradient_clip_attr, error_clip=p.error_clip, name=v.name) self.vars[new_p.name] = new_p @@ -4646,8 +4641,6 @@ class Parameter(Variable): Default: {'learning_rate': 1.0} regularizer(WeightDecayRegularizer): The Regularizer which will be applied on the parameter. Default: None - gradient_clip_attr(BaseGradientClipAttr): The gradient clip strategy - which will be applied on the parameter. Default: None do_model_average(bool): True if the model average strategy will be applied on this parameter. 
""" @@ -4687,8 +4680,6 @@ class Parameter(Variable): self.regularizer = kwargs.get('regularizer', None) - self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None) - self.do_model_average = kwargs.get('do_model_average', None) self.is_distributed = False @@ -4723,7 +4714,7 @@ class Parameter(Variable): if with_details: res_str = Variable.to_string(self, throw_on_error, True) additional_attr = ("trainable", "optimize_attr", "regularizer", - "gradient_clip_attr", "do_model_average") + "do_model_average") for attr_name in additional_attr: res_str += "%s: %s\n" % (attr_name, cpt.to_text(getattr(self, attr_name))) @@ -4752,8 +4743,6 @@ class ParamBase(core.VarBase): Default: {'learning_rate': 1.0} regularizer(WeightDecayRegularizer): The Regularizer which will be applied on the ParamBase. Default: None - gradient_clip_attr(BaseGradientClipAttr): The gradient clip strategy - which will be applied on the ParamBase. Default: None do_model_average(bool): True if the model average strategy will be applied on this ParamBase. """ @@ -4792,8 +4781,6 @@ class ParamBase(core.VarBase): self.regularizer = kwargs.get('regularizer', None) - self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None) - self.do_model_average = kwargs.get('do_model_average', None) self.is_distributed = False diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index f46a52b6663..a1ba7abc6ae 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -24,7 +24,7 @@ from . import framework from . import layers from . import unique_name from .backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name -from .clip import append_gradient_clip_ops, error_clip_callback +from .clip import GradientClipBase, error_clip_callback, append_gradient_clip_ops from .framework import program_guard from .initializer import Constant from .layer_helper import LayerHelper @@ -109,6 +109,8 @@ class Optimizer(object): self._opti_name_list = [] self._accumulators_holder = {} self._param_device_map = dict() + # if pass grad_clip into minimize, it will not be None + self._grad_clip = None @framework.dygraph_only def state_dict(self): @@ -690,12 +692,17 @@ class Optimizer(object): # ... optimizer.apply_gradients(params_grads) """ + params_grads = sorted(params_grads, key=lambda x: x[0].name) params_grads, table_param_and_grad, table_optimize_op = \ self._process_distribute_lookuptable(params_grads) - params_grads = append_gradient_clip_ops(params_grads) + # 'minimize(grad_clip)' or 'set_gradient_clip' + if self._grad_clip is not None: + params_grads = self._grad_clip(params_grads) + else: + params_grads = append_gradient_clip_ops(params_grads) # Add regularization if any params_grads = append_regularization_ops(params_grads, @@ -712,19 +719,19 @@ class Optimizer(object): """ Second part of `minimize`, appending optimization operators for given `params_grads` pairs. - Args: loss (Variable): loss variable to run optimizations. startup_program (Program): startup_program for initializing parameters in `parameter_list`. params_grads (list): list of (param, grad) pair to do optimization. - Returns: list: A list of operators appended to the current program. 
""" if framework.in_dygraph_mode(): with program_guard(framework.default_main_program(), framework.default_startup_program()): + if self._grad_clip is not None: + params_grads = self._grad_clip(params_grads) params_grads = append_regularization_ops(params_grads, self.regularization) optimize_ops = self._create_optimization_pass(params_grads) @@ -809,16 +816,19 @@ class Optimizer(object): Please refer to the example of current Optimizer. """ assert isinstance(loss, Variable), "The loss should be an Variable." + if grad_clip is not None: + if not isinstance(grad_clip, GradientClipBase): + raise TypeError( + "'grad_clip' should be an instance of GradientClipBase's derived class" + ) + self._grad_clip = grad_clip + params_grads = self.backward( loss, startup_program=startup_program, parameter_list=parameter_list, no_grad_set=no_grad_set) - if grad_clip is not None and framework.in_dygraph_mode(): - # TODO(hongyu): FIX later, this is only for dygraph, should be work for static mode - params_grads = grad_clip(params_grads) - optimize_ops = self.apply_optimize( loss, startup_program=startup_program, params_grads=params_grads) @@ -1148,6 +1158,7 @@ class DGCMomentumOptimizer(Optimizer): self.regular_type, self.regular_coeff = self._get_regularization_param( self.regularization) + self._grad_clip = None def _get_regularization_param(self, regularization): regular_type = 0 @@ -1404,24 +1415,28 @@ class DGCMomentumOptimizer(Optimizer): dgc_op._set_attr(op_maker.kOpRoleVarAttrName(), [param_var.name, grad_var.name]) + @imperative_base.no_grad def apply_gradients(self, params_grads): params_grads = sorted(params_grads, key=lambda x: x[0].name) - params_grads, table_param_and_grad, table_optimize_op = \ self._process_distribute_lookuptable(params_grads) not_dgc_params_grads = [] dgc_params_grads = [] + # DGC clip and regularization in optimizer.backward for param, grad in params_grads: if not self._is_use_dgc(param, grad): not_dgc_params_grads.append((param, grad)) else: dgc_params_grads.append((param, grad)) - # DGC clip and regularization in local - not_dgc_params_grads = append_gradient_clip_ops(not_dgc_params_grads) + # 'minimize(grad_clip)' or 'set_gradient_clip' + if self._grad_clip is not None: + not_dgc_params_grads = self._grad_clip(not_dgc_params_grads) + else: + not_dgc_params_grads = append_gradient_clip_ops( + not_dgc_params_grads) - # Add regularization if any not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads, self.regularization) @@ -3942,16 +3957,13 @@ class RecomputeOptimizer(Optimizer): def apply_optimize(self, loss, startup_program, params_grads): """ call the apply_optimize function of self._optimizer - Args: loss (Variable): loss variable to run optimizations. startup_program (Program): startup_program for initializing parameters in `parameter_list`. params_grads (list): list of (param, grad) pair to do optimization. - Examples: .. code-block:: python - import paddle.fluid as fluid def mlp(input_x, input_y, hid_dim=128, label_dim=2): @@ -3979,7 +3991,6 @@ class RecomputeOptimizer(Optimizer): cost, startup_program=None, params_grads=params_grads) print("Finished apply_optimize") - """ return self._optimizer.apply_optimize( @@ -3991,24 +4002,24 @@ class RecomputeOptimizer(Optimizer): parameter_list=None, no_grad_set=None, grad_clip=None): - - assert (isinstance(loss, Variable)), "The loss should be an Variable." + assert isinstance(loss, Variable), "The loss should be an Variable." 
assert (self._checkpoints is not None ), "You should call _set_checkpoints first" if framework.in_dygraph_mode(): raise NotImplementedError( "DyGraph current does not support recompute") - + if grad_clip is not None: + if not isinstance(grad_clip, GradientClipBase): + raise TypeError( + "'grad_clip' should be an instance of GradientClipBase's derived class" + ) + self._optimizer._grad_clip = grad_clip params_grads = self.backward( loss, startup_program=startup_program, parameter_list=parameter_list, no_grad_set=no_grad_set) - if grad_clip: - # TODO(guru4elephant): should add grad_clip for static graph - pass - optimize_ops = self.apply_optimize( loss, startup_program=startup_program, params_grads=params_grads) diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index dae6d99ee77..6edc4c5998c 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -15,6 +15,7 @@ from __future__ import print_function import six +import warnings from .initializer import Initializer, Xavier, Constant from .regularizer import WeightDecayRegularizer @@ -68,7 +69,6 @@ class ParamAttr(object): learning_rate=1.0, regularizer=None, trainable=True, - gradient_clip=None, do_model_average=True): self.name = name if isinstance(self.name, six.string_types) and self.name == "": @@ -78,7 +78,6 @@ class ParamAttr(object): self.learning_rate = learning_rate self.regularizer = regularizer self.trainable = trainable - self.gradient_clip = gradient_clip self.do_model_average = do_model_average def _set_default_initializer(self, initializer): @@ -176,7 +175,6 @@ class ParamAttr(object): }, 'regularizer': self.regularizer, 'trainable': self.trainable, - 'gradient_clip_attr': self.gradient_clip, 'do_model_average': self.do_model_average } if with_initializer: @@ -248,7 +246,6 @@ class WeightNormParamAttr(ParamAttr): learning_rate=1.0, regularizer=None, trainable=True, - gradient_clip=None, do_model_average=False): super(WeightNormParamAttr, self).__init__( name=name, @@ -256,6 +253,5 @@ class WeightNormParamAttr(ParamAttr): learning_rate=learning_rate, regularizer=regularizer, trainable=trainable, - gradient_clip=gradient_clip, do_model_average=do_model_average) self.dim = dim diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index d2e30391d25..746d29b69b9 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -476,15 +476,18 @@ class TestL2Decay(TranspilerTest): size=1000, act=None, param_attr=fluid.ParamAttr( - name='fc_w', - regularizer=fluid.regularizer.L2Decay(), - gradient_clip=fluid.clip.GradientClipByValue(0.1)), + name='fc_w', regularizer=fluid.regularizer.L2Decay()), bias_attr=fluid.ParamAttr(name='fc_b')) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) avg_cost = fluid.layers.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) - sgd_optimizer.minimize(avg_cost) + + def filter(param): + return param.name == "fc_w" + + clip = fluid.clip.GradientClipByValue(0.1, need_clip=filter) + sgd_optimizer.minimize(avg_cost, grad_clip=clip) def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) diff --git a/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py b/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py index fb80b5c1d2c..39a5b9391e0 100644 --- 
a/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py +++ b/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py @@ -25,7 +25,7 @@ from paddle.fluid import core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph_grad_clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm +from paddle.fluid.clip import GradientClipByValue, GradientClipByNorm, GradientClipByGlobalNorm class TestGradClipByGlobalNorm(unittest.TestCase): @@ -65,7 +65,7 @@ class TestGradClipByGlobalNorm(unittest.TestCase): def get_dygrap_global_norm_result(self): with fluid.dygraph.guard(): - gloabl_norm_clip = GradClipByGlobalNorm(self.max_global_norm) + gloabl_norm_clip = GradientClipByGlobalNorm(self.max_global_norm) p_g_var = [] for p, g in self.para_and_grad: new_p = to_variable(p) @@ -135,7 +135,7 @@ class TestGradClipByNorm(unittest.TestCase): def get_dygrap_norm_result(self): with fluid.dygraph.guard(): - norm_clip = GradClipByNorm(self.max_norm) + norm_clip = GradientClipByNorm(self.max_norm) p_g_var = [] for p, g in self.para_and_grad: new_p = to_variable(p) @@ -200,8 +200,8 @@ class TestGradClipByValue(unittest.TestCase): def get_dygrap_clip_result(self): with fluid.dygraph.guard(): - - value_clip = GradClipByValue(self.min_value, self.max_value) + value_clip = GradientClipByValue( + max=self.max_value, min=self.min_value) p_g_var = [] for p, g in self.para_and_grad: new_p = to_variable(p) @@ -225,7 +225,7 @@ class TestGradClipByValue(unittest.TestCase): for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g): self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8)) - def test_clip_by_norm_2(self): + def test_clip_by_value_2(self): self.init_value() self.init_scale = 0.2 @@ -236,7 +236,7 @@ class TestGradClipByValue(unittest.TestCase): for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g): self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8)) - def test_clip_by_norm_3(self): + def test_clip_by_value_3(self): self.init_value() self.init_scale = 0.5 diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index 470187e6421..362b527d456 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -1,10 +1,10 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -54,26 +54,32 @@ class TestGradientClip(unittest.TestCase): self.BATCH_SIZE = 2 reader = fake_imdb_reader(self.word_dict_len, self.BATCH_SIZE * 100) self.train_data = paddle.batch(reader, batch_size=self.BATCH_SIZE) + self.init() + + def init(self): + pass def get_places(self): - places = [core.CPUPlace()] + places = [fluid.CPUPlace()] if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) + places.append(fluid.CUDAPlace(0)) return places - def check_operators(self, place): - CLIP = 1 + def clip_gradient(self, params_grads): + pass - prog = fluid.framework.Program() - startup_program = fluid.framework.Program() + def check_clip_result(self, out, out_clip): + pass + + def check_gradient_clip(self, place): + prog = fluid.Program() + startup_program = fluid.Program() with fluid.program_guard( main_program=prog, startup_program=startup_program): - image = fluid.layers.data(name='x', shape=[784], dtype='float32') - label = fluid.layers.data(name='y', shape=[1], dtype='int64') - - hidden1 = fluid.layers.fc(input=image, size=128, act='relu') - hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') - predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') + image = fluid.data(name='x', shape=[-1, 784], dtype='float32') + label = fluid.data(name='y', shape=[-1, 1], dtype='int64') + hidden = fluid.layers.fc(input=image, size=32, act='relu') + predict = fluid.layers.fc(input=hidden, size=10, act='softmax') cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(cost) @@ -84,45 +90,26 @@ class TestGradientClip(unittest.TestCase): p_g = fluid.backward.append_backward(loss=avg_cost) p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip) + p_g = sorted(p_g, key=lambda x: x[0].name) + p_g_clip = sorted(p_g_clip, key=lambda x: x[0].name) with fluid.program_guard( main_program=prog_clip, startup_program=startup_program): - fluid.clip.set_gradient_clip( - fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP)) - p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip) + p_g_clip = self.clip_gradient(p_g_clip) grad_list = [elem[1] for elem in p_g] grad_clip_list = [elem[1] for elem in p_g_clip] - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=8192), - batch_size=128) - + train_reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=3) exe = fluid.Executor(place) feeder = fluid.DataFeeder(feed_list=[image, label], place=place) exe.run(startup_program) - count = 0 - for data in train_reader(): - count += 1 - if count > 5: - break - out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list) - out_clip = exe.run(prog_clip, - feed=feeder.feed(data), - fetch_list=grad_clip_list) - global_norm = 0 - for v in out: - global_norm += np.sum(np.power(v, 2)) - global_norm = np.sqrt(global_norm) - - global_norm_clip = 0 - for v in out_clip: - global_norm_clip += np.sum(np.power(v, 2)) - global_norm_clip = np.sqrt(global_norm_clip) - - assert np.isclose( - a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3) + data = next(train_reader()) + out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list) + out_clip = exe.run(prog_clip, + feed=feeder.feed(data), + fetch_list=grad_clip_list) + 
self.check_clip_result(out, out_clip) def check_sparse_gradient_clip(self, place): prog = fluid.framework.Program() @@ -134,11 +121,7 @@ class TestGradientClip(unittest.TestCase): label = fluid.layers.data(name="label", shape=[1], dtype="int64") cost = bow_net(data, label, self.word_dict_len) - fluid.clip.set_gradient_clip( - clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)) - - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01) - sgd_optimizer.minimize(cost) + self.backward_and_optimize(cost) exe = fluid.Executor(place) feeder = fluid.DataFeeder(feed_list=[data, label], place=place) @@ -150,13 +133,345 @@ class TestGradientClip(unittest.TestCase): print(val) self.assertFalse(np.isnan(val)) - def test_operators(self): - self.check_operators(core.CPUPlace()) + def backward_and_optimize(cost): + pass + + +class TestGradientClipByGlobalNorm(TestGradientClip): + def init(self): + self.clip_norm = 0.2 + + def clip_gradient(self, params_grads): + clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) + print(clip) + return clip(params_grads) + + def check_clip_result(self, out, out_clip): + global_norm = 0 + for v in out: + global_norm += np.sum(np.power(v, 2)) + global_norm = np.sqrt(global_norm) + scale = self.clip_norm / np.maximum(self.clip_norm, global_norm) + res = [] + for i in range(len(out)): + out[i] = scale * out[i] + + for u, v in zip(out, out_clip): + self.assertTrue( + np.allclose( + a=u, b=v, rtol=1e-5, atol=1e-8), + "gradient clip by global norm has wrong results!") + + # test whether the ouput is right when use 'set_gradient_clip' + def test_old_gradient_clip(self): + def func(params_grads): + clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) + fluid.clip.set_gradient_clip(clip) + return fluid.clip.append_gradient_clip_ops(params_grads) + + self.clip_gradient = func + self.check_gradient_clip(fluid.CPUPlace()) + + # test whether the ouput is right when use 'minimize(grad_clip)' + def test_new_gradient_clip(self): + def func(params_grads): + clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) + print(clip) + return clip(params_grads) - def test_sparse_gradient_clip(self): + self.clip_gradient = func + self.check_gradient_clip(fluid.CPUPlace()) + + # invoke 'set_gradient_clip' in a wrong order + def test_wrong_API_order(self): + def backward_func(cost): + # no clip gradient + def fileter_func(param): + return param.name == "fc.w_0" + + clip = fluid.clip.GradientClipByGlobalNorm( + clip_norm=5.0, need_clip=fileter_func) + fluid.clip.set_gradient_clip(clip) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01) + # if 'set_gradient_clip' and 'minimize(grad_clip)' together, 'set_gradient_clip' will be ineffective + sgd_optimizer.minimize(cost, grad_clip=clip) + # 'set_gradient_clip' must before 'minimize', otherwise, 'set_gradient_clip' will be ineffective + fluid.clip.set_gradient_clip(clip) + + self.backward_and_optimize = backward_func for place in self.get_places(): self.check_sparse_gradient_clip(place) + # if grad is None or not need clip + def test_none_grad(self): + def fileter_func(param): + return param.name == "x" + + clip = fluid.clip.GradientClipByGlobalNorm( + self.clip_norm, need_clip=fileter_func) + x = fluid.default_main_program().global_block().create_parameter( + name="x", shape=[2, 3], dtype="float32") + y = fluid.default_main_program().global_block().create_parameter( + name="y", shape=[2, 3], dtype="float32") + + # (x, None) should not be returned + params_grads = [(x, None), (x, y), (y, x)] + 
params_grads = clip(params_grads) + self.assertTrue( + len(clip(params_grads)) == 2, + "ClipByGlobalNorm: when grad is None, it shouldn't be returned by gradient clip!" + ) + self.assertTrue( + params_grads[0][1].name != 'y', + "ClipByGlobalNorm: param_grad (x, y) should be clipped!") + + # raise typeError + def test_tpyeError(self): + # the type of need_clip must be an funciton + with self.assertRaises(TypeError): + clip = fluid.clip.GradientClipByGlobalNorm( + clip_norm=self.clip_norm, need_clip="test") + + # the type of minimize(grad_clip=) must be an instance of GradientClipBase's derived class + with self.assertRaises(TypeError): + x = fluid.default_main_program().global_block().create_parameter( + name="x", shape=[2, 3], dtype="float32") + loss = fluid.layers.reduce_mean(x) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) + sgd_optimizer.minimize(loss, grad_clip="test") + + # the type of RecomputeOptimizer.minimize(grad_clip=) must be an instance of GradientClipBase's derived class + with self.assertRaises(TypeError): + x = fluid.default_main_program().global_block().create_parameter( + name="x", shape=[2, 3], dtype="float32") + loss = fluid.layers.reduce_mean(x) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) + recompute_optimizer = fluid.optimizer.RecomputeOptimizer( + sgd_optimizer) + recompute_optimizer._set_checkpoints([x]) + recompute_optimizer.minimize(loss, grad_clip="test") + + +class TestGradientClipByNorm(TestGradientClip): + def init(self): + self.clip_norm = 0.2 + + def clip_gradient(self, params_grads): + clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm) + print(clip) + return clip(params_grads) + + def check_clip_result(self, out, out_clip): + for u, v in zip(out, out_clip): + norm = np.sqrt(np.sum(np.power(u, 2))) + scale = self.clip_norm / np.maximum(self.clip_norm, norm) + u = u * scale + self.assertTrue( + np.allclose( + a=u, b=v, rtol=1e-5, atol=1e-8), + "gradient clip by norm has wrong results!") + + # test whether the ouput is right when use 'minimize(grad_clip)' + def test_gradient_clip(self): + self.check_gradient_clip(fluid.CPUPlace()) + + # if grad is None or not need clip + def test_none_grad(self): + def fileter_func(param): + return param.name == "z" + + clip = fluid.clip.GradientClipByNorm( + self.clip_norm, need_clip=fileter_func) + x = fluid.default_main_program().global_block().create_parameter( + name="x", shape=[2, 3], dtype="float32") + y = fluid.default_main_program().global_block().create_parameter( + name="y", shape=[2, 3], dtype="float32") + + # (x, None) should not be returned + params_grads = [(x, None), (x, y)] + params_grads = clip(params_grads) + self.assertTrue( + len(clip(params_grads)) == 1, + "ClipByNorm: when grad is None, it shouldn't be returned by gradient clip!" 
+ ) + self.assertTrue( + params_grads[0][1].name == 'y', + "ClipByNorm: grad should not be clipped when filtered out!") + + +class TestGradientClipByValue(TestGradientClip): + def init(self): + self.max = 0.2 + self.min = 0.1 + + def clip_gradient(self, params_grads): + clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min) + print(clip) + return clip(params_grads) + + def check_clip_result(self, out, out_clip): + for i, v in enumerate(out): + out[i] = np.clip(v, self.min, self.max) + for u, v in zip(out, out_clip): + u = np.clip(u, self.min, self.max) + self.assertTrue( + np.allclose( + a=u, b=v, rtol=1e-6, atol=1e-8), + "gradient clip by value has wrong results!") + + # test whether the ouput is right when use 'minimize(grad_clip)' + def test_gradient_clip(self): + self.check_gradient_clip(fluid.CPUPlace()) + + # if grad is None or not need clip + def test_none_grad(self): + def fileter_func(param): + return param.name == "z" + + clip = fluid.clip.GradientClipByValue( + self.max, self.min, need_clip=fileter_func) + x = fluid.default_main_program().global_block().create_parameter( + name="x", shape=[2, 3], dtype="float32") + y = fluid.default_main_program().global_block().create_parameter( + name="y", shape=[2, 3], dtype="float32") + + # (x, None) should not be returned + params_grads = [(x, None), (x, y)] + params_grads = clip(params_grads) + self.assertTrue( + len(clip(params_grads)) == 1, + "ClipByValue: when grad is None, it shouldn't be returned by gradient clip!" + ) + self.assertTrue( + params_grads[0][1].name == 'y', + "ClipByValue: grad should not be clipped when filtered out!") + + +class TestDygraphGradientClip(unittest.TestCase): + def test_gradient_clip(self): + with fluid.dygraph.guard(): + linear = fluid.dygraph.Linear(5, 5) + inputs = fluid.layers.uniform_random( + [16, 5], min=-10, max=10).astype('float32') + out = linear(fluid.dygraph.to_variable(inputs)) + loss = fluid.layers.reduce_mean(out) + loss.backward() + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=0.0, parameter_list=linear.parameters()) + self.check_clip_result(loss, sgd_optimizer) + + def check_clip_result(self, loss, optimizer): + pass + + +class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip): + def setUp(self): + # only clip gradient of x (ParamBase) + def fileter_func(param): + return param.name == "x" + + self.clip_norm = 0.8 + self.clip1 = fluid.clip.GradientClipByGlobalNorm( + clip_norm=self.clip_norm, need_clip=fileter_func) + self.clip2 = fluid.clip.GradientClipByGlobalNorm( + clip_norm=self.clip_norm) + + def check_clip_result(self, loss, optimizer): + # if grad is None + x = fluid.dygraph.to_variable( + np.array([2, 3]).astype("float32"), name="x") + y = fluid.dygraph.to_variable( + np.array([3, 4]).astype("float32"), name="y") + assert len(self.clip1([(x, x), (x, y), (x, None)])) == 2 + # get params and grads from network + opt, params_grads = optimizer.minimize(loss, grad_clip=self.clip2) + _, grads = zip(*params_grads) + params_grads = self.clip2(params_grads) + _, grads_clip = zip(*params_grads) + + global_norm = 0 + for u in grads: + u = u.numpy() + global_norm += np.sum(np.power(u, 2)) + global_norm = np.sqrt(global_norm) + + global_norm_clip = 0 + for v in grads_clip: + v = v.numpy() + global_norm_clip += np.sum(np.power(v, 2)) + global_norm_clip = np.sqrt(global_norm_clip) + + a = np.minimum(global_norm, self.clip_norm) + b = global_norm_clip + self.assertTrue( + np.isclose( + a=a, b=b, rtol=1e-6, atol=1e-8), + "gradient clip by global norm has wrong results, 
expetcd:%f, but recieved:%f" + % (a, b)) + + +class TestDygraphGradientClipByNorm(TestDygraphGradientClip): + def setUp(self): + # only clip gradient of linear_0.w_0 (ParamBase) + def fileter_func(param): + return param.name == "linear_0.w_0" + + self.clip_norm = 0.8 + self.clip = fluid.clip.GradientClipByNorm( + clip_norm=self.clip_norm, need_clip=fileter_func) + + def check_clip_result(self, loss, optimizer): + # if grad is None + x = fluid.dygraph.to_variable(np.array([2, 3]).astype("float32")) + assert len(self.clip([(x, None)])) == 0 + # get params and grads from network + self.clip([(fluid.dygraph.to_variable(np.array([2, 3])), None)]) + params_grads = optimizer.backward(loss) + _, grads = zip(*params_grads) + params_grads = self.clip(params_grads) + _, grads_clip = zip(*params_grads) + + for u, v in zip(grads, grads_clip): + u = u.numpy() + v = v.numpy() + a = np.sqrt(np.sum(np.power(u, 2))) + a = np.minimum(a, self.clip_norm) + b = np.sqrt(np.sum(np.power(v, 2))) + self.assertTrue( + np.isclose( + a=a, b=b, rtol=1e-6, atol=1e-8), + "gradient clip by norm has wrong results, expetcd:%f, but recieved:%f" + % (a, b)) + + +class TestDygraphGradientClipByValue(TestDygraphGradientClip): + def setUp(self): + # only clip gradient of linear_0.w_0 (ParamBase) + def fileter_func(param): + return param.name == "linear_0.w_0" + + self.max = 0.2 + self.min = 0.1 + self.clip = fluid.clip.GradientClipByValue( + max=self.max, min=self.min, need_clip=fileter_func) + + def check_clip_result(self, loss, optimizer): + # if grad is None + x = fluid.dygraph.to_variable(np.array([2, 3]).astype("float32")) + assert len(self.clip([(x, None)])) == 0 + # get params and grads from network + params_grads = optimizer.backward(loss) + _, grads = zip(*params_grads) + params_grads = self.clip(params_grads) + _, grads_clip = zip(*params_grads) + for u, v in zip(grads, grads_clip): + u = np.clip(u.numpy(), self.min, self.max) + v = v.numpy() + self.assertTrue( + np.allclose( + a=u, b=v, rtol=1e-6, atol=1e-8), + "gradient clip by value has wrong results!") + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py index 24981d4b6ab..3d9b4e2ef27 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -331,7 +331,7 @@ class TestImperativeAutoPrune(unittest.TestCase): model = MyLayer(size, vocab_size, size) optimizer = fluid.optimizer.AdamOptimizer( 0.001, parameter_list=model.parameters()) - grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001) + grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001) indices = fluid.dygraph.to_variable(indices) embed = fluid.dygraph.to_variable(embed) @@ -350,7 +350,7 @@ class TestImperativeAutoPrune(unittest.TestCase): model = MyLayer2(size, vocab_size, size) optimizer = fluid.optimizer.AdamOptimizer( 0.001, parameter_list=model.parameters()) - grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001) + grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001) indices = fluid.dygraph.to_variable(indices) emebd = fluid.dygraph.to_variable(embed) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py index da3240b35aa..727651f9cf5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py +++ 
b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py @@ -49,7 +49,7 @@ class TestSimpleNet(unittest.TestCase): with fluid.dygraph.guard(place): backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy.sort_sum_gradient = sort_sum_gradient - # grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0) + # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0) input_word = np.array([[1, 2], [2, 1]]).astype('int64') input = to_variable(input_word) @@ -83,8 +83,7 @@ class TestSimpleNet(unittest.TestCase): with fluid.dygraph.guard(place): backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy.sort_sum_gradient = sort_sum_gradient - grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm( - 5.0) + grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0) input_word = np.array([[1, 2], [2, 1]]).astype('int64') input = to_variable(input_word) -- GitLab
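Usage sketch: a minimal dygraph example of the grad_clip argument this patch adds to Optimizer.minimize, closely mirroring TestDygraphGradientClip in the tests above. The need_clip filter and the clip_norm/learning_rate values are illustrative assumptions for this sketch, not taken from the patch itself.

    import numpy as np
    import paddle.fluid as fluid

    # Illustrative filter (assumption): clip only the Linear layer's weight,
    # which the unit tests above name "linear_0.w_0"; other gradients pass
    # through unchanged.
    def need_clip(param):
        return param.name == "linear_0.w_0"

    with fluid.dygraph.guard():
        linear = fluid.dygraph.Linear(5, 5)
        inputs = fluid.dygraph.to_variable(
            np.random.uniform(-10, 10, (16, 5)).astype('float32'))
        loss = fluid.layers.reduce_mean(linear(inputs))
        loss.backward()

        # Limit the global L2 norm of the selected gradients to 0.8.
        clip = fluid.clip.GradientClipByGlobalNorm(
            clip_norm=0.8, need_clip=need_clip)
        sgd = fluid.optimizer.SGD(
            learning_rate=0.1, parameter_list=linear.parameters())
        # Clipping is now passed to minimize() instead of being attached to
        # ParamAttr(gradient_clip=...) or registered via set_gradient_clip().
        sgd.minimize(loss, grad_clip=clip)

The same clip object can be passed to minimize() in static-graph mode; set_gradient_clip() still works after this patch but emits a warning that the grad_clip argument is the recommended path.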