Unverified commit 7fda333a, authored by Zhou Wei, committed by GitHub

Add a new, easier-to-use gradient clipping method, test=develop (#23224)

Parent b7b0b359
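For context, a rough sketch of the usage this commit enables in static graph mode, assembled from the test changes further below; the network, layer sizes, and learning rate are illustrative only, not part of the commit:

import paddle.fluid as fluid

x = fluid.data(name='x', shape=[-1, 784], dtype='float32')
y = fluid.data(name='y', shape=[-1, 1], dtype='int64')
predict = fluid.layers.fc(input=x, size=10, act='softmax')
loss = fluid.layers.mean(fluid.layers.cross_entropy(input=predict, label=y))

# clip all gradients so their global L2 norm does not exceed 1.0
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
sgd = fluid.optimizer.SGD(learning_rate=0.1)
# the clip strategy is now passed directly to minimize instead of set_gradient_clip
sgd.minimize(loss, grad_clip=clip)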
...@@ -75,7 +75,6 @@ from .transpiler import DistributeTranspiler, \
memory_optimize, release_memory, DistributeTranspilerConfig
from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
from . import clip
from . import dygraph_grad_clip
from . import profiler
from . import unique_name
from . import parallel_executor
...@@ -122,7 +121,6 @@ __all__ = framework.__all__ + executor.__all__ + \
'WeightNormParamAttr',
'DataFeeder',
'clip',
'dygraph_grad_clip',
'profiler',
'unique_name',
'Scope',
......
...@@ -16,19 +16,18 @@ from __future__ import print_function
import copy
import six
import warnings
import functools
from . import layers
from . import framework
from . import core
from . import name_scope
from .dygraph import base as imperative_base
__all__ = [
'set_gradient_clip', 'ErrorClipByValue', 'GradientClipByValue',
'GradientClipByNorm', 'GradientClipByGlobalNorm'
]
...@@ -116,29 +115,51 @@ def error_clip_callback(block, context):
error_clip._append_clip_op(block, grad_n)
class GradientClipBase(object):
def __init__(self, need_clip=None):
if need_clip is not None and not callable(need_clip):
raise TypeError(
"The type of need_clip must be funciton, and it can filter out "
"parameter that does't need gradient clip. This function must return "
"True or False, and True means that clipping is required. Please refer to "
"API documention of GradientClipByGlobalNorm / GradientClipByNorm "
"/GradientClipByValue.")
self._need_clip_func = need_clip
def __str__(self):
raise NotImplementedError()
@imperative_base.no_grad
def _dygraph_clip(self, params_grads):
raise NotImplementedError
def _static_clip(self, params_grads):
raise NotImplementedError
def __call__(self, params_grads):
assert len(
params_grads
) > 0, "The number of trainable parameters should be greater than 0."
if framework.in_dygraph_mode():
return self._dygraph_clip(params_grads)
else:
for p, g in params_grads:
if getattr(p, 'gradient_clip_attr', None) is not None:
warnings.warn(
"'set_gradient_clip' will be ineffective, because you have "
"pass 'grad_clip' into 'minimize'. So, 'set_gradient_clip' "
"is redundant and you can remove it.")
break
return self._static_clip(params_grads)
def _process_context(self, context, param, grad):
raise NotImplementedError()
def _create_operators(self, param, grad):
raise NotImplementedError()
class GradientClipByValue(GradientClipBase):
"""
Clips gradient values to the range [min, max].
...@@ -168,17 +189,46 @@ class GradientClipByValue(BaseGradientClipAttr):
input=x, size=1, param_attr=w_param_attrs)
"""
def __init__(self, max, min=None, need_clip=None):
super(GradientClipByValue, self).__init__(need_clip)
if min is None:
assert (max > 0.0)
min = -max
self.max = float(max)
self.min = float(min)
def __str__(self):
return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max)
@imperative_base.no_grad
def _dygraph_clip(self, params_grads):
params_and_grads = []
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(p):
params_and_grads.append((p, g))
continue
new_grad = layers.clip(x=g, min=self.min, max=self.max)
params_and_grads.append((p, new_grad))
return params_and_grads
def _static_clip(self, params_grads):
params_and_grads = []
with framework.name_scope('gradient_clip'):
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(
p):
params_and_grads.append((p, g))
continue
with p.block.program._optimized_guard([p, g]):
new_grad = layers.clip(x=g, min=self.min, max=self.max)
params_and_grads.append((p, new_grad))
_correct_clip_op_role_var(params_and_grads)
return params_and_grads
def _process_context(self, context, param, grad):
pass
...@@ -188,7 +238,7 @@ class GradientClipByValue(BaseGradientClipAttr):
return param, new_grad
class GradientClipByNorm(GradientClipBase):
"""
Convert the input multidimensional Tensor :math:`X` to a multidimensional Tensor whose L2 norm does not exceed the given two-norm maximum ( :math:`clip\_norm` ).
...@@ -268,11 +318,42 @@ class GradientClipByNorm(BaseGradientClipAttr):
"""
def __init__(self, clip_norm, need_clip=None):
super(GradientClipByNorm, self).__init__(need_clip)
self.clip_norm = float(clip_norm)
def __str__(self):
return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm
@imperative_base.no_grad
def _dygraph_clip(self, params_grads):
params_and_grads = []
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(p):
params_and_grads.append((p, g))
continue
new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
params_and_grads.append((p, new_grad))
return params_and_grads
def _static_clip(self, params_grads):
params_and_grads = []
with framework.name_scope('gradient_clip'):
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(
p):
params_and_grads.append((p, g))
continue
with p.block.program._optimized_guard([p, g]):
new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
params_and_grads.append((p, new_grad))
_correct_clip_op_role_var(params_and_grads)
return params_and_grads
def _process_context(self, context, param, grad):
pass
...@@ -282,7 +363,7 @@ class GradientClipByNorm(BaseGradientClipAttr):
return param, new_grad
class GradientClipByGlobalNorm(GradientClipBase):
"""
Clips values of multiple tensors by the ratio of the sum of their norms.
...@@ -371,16 +452,104 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
"""
def __init__(self, clip_norm, group_name="default_group", need_clip=None):
super(GradientClipByGlobalNorm, self).__init__(need_clip)
self.clip_norm = float(clip_norm)
self.group_name = group_name
def __str__(self):
return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm)
@imperative_base.no_grad
def _dygraph_clip(self, params_grads):
params_and_grads = []
sum_square_list = []
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(p):
continue
merge_grad = g
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.merge_selected_rows(g)
merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
square = layers.square(merge_grad)
sum_square = layers.reduce_sum(square)
sum_square_list.append(sum_square)
# all parameters have been filtered out
if len(sum_square_list) == 0:
return params_grads
global_norm_var = layers.concat(sum_square_list)
global_norm_var = layers.reduce_sum(global_norm_var)
global_norm_var = layers.sqrt(global_norm_var)
max_global_norm = layers.fill_constant(
shape=[1], dtype='float32', value=self.clip_norm)
clip_var = layers.elementwise_div(
x=max_global_norm,
y=layers.elementwise_max(
x=global_norm_var, y=max_global_norm))
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(p):
params_and_grads.append((p, g))
continue
new_grad = layers.elementwise_mul(x=g, y=clip_var)
params_and_grads.append((p, new_grad))
return params_and_grads
def _static_clip(self, params_grads):
params_and_grads = []
sum_square_list = []
with framework.name_scope('gradient_clip'):
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(
p):
continue
merge_grad = g
with p.block.program._optimized_guard([p, g]):
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.merge_selected_rows(g)
merge_grad = layers.get_tensor_from_selected_rows(
merge_grad)
square = layers.square(merge_grad)
sum_square = layers.reduce_sum(input=square)
sum_square_list.append(sum_square)
# all parameters have been filtered out
if len(sum_square_list) == 0:
return params_grads
with p.block.program._optimized_guard([p, g]):
global_norm_var = layers.sums(sum_square_list)
global_norm_var = layers.sqrt(x=global_norm_var)
max_global_norm = layers.fill_constant(
shape=[1], dtype="float32", value=self.clip_norm)
scale_var = layers.elementwise_div(
x=max_global_norm,
y=layers.elementwise_max(
x=max_global_norm, y=global_norm_var))
for p, g in params_grads:
if g is None:
continue
if self._need_clip_func is not None and not self._need_clip_func(
p):
params_and_grads.append((p, g))
continue
with p.block.program._optimized_guard([p, g]):
new_grad = layers.elementwise_mul(x=g, y=scale_var)
params_and_grads.append((p, new_grad))
_correct_clip_op_role_var(params_and_grads)
return params_and_grads
def _process_context(self, context, param, grad):
if self.group_name not in context:
...@@ -486,12 +655,28 @@ def set_gradient_clip(clip, param_list=None, program=None):
sgd = fluid.optimizer.SGD(learning_rate=1e-3)
sgd.minimize(loss)
"""
warnings.warn("Caution! 'set_gradient_clip' is not recommended "
"and may be deprecated in the future! "
"We recommend a new strategy: clip gradients by "
"'optimizer.minimize(loss, grad_clip=clip)'. "
"This method can reduce mistakes, please "
"see the documentation of 'optimizer.minimize'.")
if not isinstance(clip, GradientClipBase):
raise TypeError(
"'clip' should be an instance of GradientClipBase's derived class")
if program is None:
program = framework.default_main_program()
for op in program.block(0).ops:
if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr(
"op_namescope"):
warnings.warn(
"'minimize' has been invoked before, this will make 'set_gradient_clip' "
"be ineffective! Please invoke 'set_gradient_clip' before 'minimize'."
)
break
if param_list is None:
param_list = program.block(0).all_parameters()
if all(isinstance(elem, six.string_types) for elem in param_list):
...@@ -511,46 +696,45 @@ def append_gradient_clip_ops(param_grads):
if g is None:
continue
with p.block.program._optimized_guard(
[p, g]), framework.name_scope('gradient_clip_@CLIP'):
clip_attr = getattr(p, 'gradient_clip_attr', None)
if clip_attr is None:
return param_grads
if not isinstance(clip_attr, GradientClipBase):
raise TypeError(
"clip attribute should be an instance of GradientClipBase")
clip_attr._process_context(context=context, param=p, grad=g)
res = []
for p, g in param_grads:
if g is None:
continue
with p.block.program._optimized_guard(
[p, g]), framework.name_scope('gradient_clip_@CLIP'):
param, new_grad = clip_attr._create_operators(param=p, grad=g)
res.append([param, new_grad])
_correct_clip_op_role_var(res)
return res
# change wrong mapping relation between param & grad in clip op
def _correct_clip_op_role_var(params_grads):
for param, grad in params_grads:
if grad is None:
continue
for op in param.block.program.global_block().ops:
if 'op_namescope' in op.all_attrs() and "gradient_clip" in op.attr(
"op_namescope"):
if op.attr('op_role_var'):
param_name = op.attr('op_role_var')[0]
index = 0
for i in range(len(params_grads)):
if params_grads[i][0].name == param_name:
index = i
correct_p_g = [param_name, params_grads[index][1].name]
op._set_attr('op_role_var', correct_p_g)
ClipByValue = GradientClipByValue
......
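For reference, the scaling that GradientClipByGlobalNorm builds out of Paddle layers above reduces to one scale factor shared by all gradients. A minimal NumPy sketch of that arithmetic (illustration only, not part of the commit):

import numpy as np

def clip_by_global_norm(grads, clip_norm):
    # global L2 norm over all gradients, as in _dygraph_clip/_static_clip above
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    # scale = clip_norm / max(global_norm, clip_norm), which is <= 1.0 by construction
    scale = clip_norm / max(global_norm, clip_norm)
    return [g * scale for g in grads]

grads = [np.ones((2, 2), dtype=np.float32), 2 * np.ones((3,), dtype=np.float32)]
clipped = clip_by_global_norm(grads, clip_norm=1.0)
# if global_norm <= clip_norm the gradients pass through unchanged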
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import copy
import six
import functools
from . import layers
from . import framework
from . import core
from .dygraph import base as imperative_base
__all__ = [
'GradClipByValue',
'GradClipByNorm',
'GradClipByGlobalNorm',
]
class GradClipBase(object):
def __str__(self):
raise NotImplementedError()
def _clip(self, para_and_grad):
raise NotImplementedError
@imperative_base.no_grad
def __call__(self, para_and_grad):
return self._clip(para_and_grad)
class GradClipByValue(GradClipBase):
"""
Clips gradient values to the range [min_value, max_value].
Given a gradient g, this operation clips its value to min_value and max_value.
- Any values less than min_value are set to min_value.
- Any values greater than max_value are set to max_value.
Args:
max_value (float): The maximum value to clip by.
min (float, optional): The minimum value to clip by. if not set by user, \
will be set to -max_value(max_value MUST be positive) by framework.
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
from paddle.fluid.optimizer import SGDOptimizer
with fluid.dygraph.guard():
value_clip = GradClipByValue( -1.0, 1.0 )
sgd = SGDOptimizer(learning_rate=1.0)
init_value = np.random.uniform( -1, 1, (10, 10)).astype('float32')
linear = Linear( 10, 10)
out = linear( to_variable(init_value) )
loss = fluid.layers.reduce_mean( out )
loss.backward()
sgd.minimize(loss, grad_clip = value_clip)
"""
@imperative_base.no_grad
def __init__(self, min_value, max_value=None):
if min_value is None:
assert (max_value > 0.0)
min_value = -max_value
else:
min_value = float(min_value)
self.max_value = max_value
self.min_value = min_value
def __str__(self):
return "ClipByValue, min = %f, max=%f" % (self.min_value,
self.max_value)
def _clip(self, para_and_grad):
out = []
for p, g in para_and_grad:
if g is None:
out.append((p, g))
continue
new_grad = layers.clip(x=g, min=self.min_value, max=self.max_value)
out.append((p, new_grad))
return out
class GradClipByNorm(GradClipBase):
"""
Clips tensor values to a maximum L2-norm.
This operator limits the L2 norm of the input :math:`X` within :math:`max\_norm`.
If the L2 norm of :math:`X` is less than or equal to :math:`max\_norm`, :math:`Out`
will be the same as :math:`X`. If the L2 norm of :math:`X` is greater than
:math:`max\_norm`, :math:`X` will be linearly scaled to make the L2 norm of
:math:`Out` equal to :math:`max\_norm`, as shown in the following formula:
.. math::
Out = \\frac{max\_norm * X}{norm(X)},
where :math:`norm(X)` represents the L2 norm of :math:`X`.
Args:
clip_norm (float): The maximum norm value
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
from paddle.fluid.optimizer import SGDOptimizer
with fluid.dygraph.guard():
norm_clip = GradClipByNorm( 5.0 )
sgd = SGDOptimizer(learning_rate=1.0)
init_value = np.random.uniform( -1, 1, (10, 10)).astype('float32')
linear = Linear( 10, 10)
out = linear( to_variable(init_value) )
loss = fluid.layers.reduce_mean( out )
loss.backward()
sgd.minimize(loss, grad_clip = norm_clip)
"""
@imperative_base.no_grad
def __init__(self, clip_norm):
self.clip_norm = clip_norm
def __str__(self):
return "ClipByNorm, clip_norm=%f" % self.clip_norm
def _clip(self, para_and_grad):
out = []
for p, g in para_and_grad:
if g is None:
out.append((p, g))
continue
new_g = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
out.append((p, new_g))
return out
class GradClipByGlobalNorm(GradClipBase):
"""
Clips values of multiple tensors by the ratio of the sum of their norms.
Given a list of tensors t_list, and a clipping ratio max_global_norm, this
operation returns a list of clipped tensors list_clipped.
To perform the clipping, the values :math:`t\_list[i]` are set to:
.. math::
t\_list[i] = t\_list[i] * \\frac{max\_global\_norm}{\max(global\_norm, max\_global\_norm)}
where:
.. math::
global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
If :math:`max\_global\_norm > global\_norm` then the entries in t_list remain as they are,
otherwise they're all shrunk by the global ratio.
Args:
max_global_norm (float): The maximum norm value.
dtype (str, optional): The type of max_global_norm. Default: "float32".
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.dygraph_grad_clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
from paddle.fluid.optimizer import SGDOptimizer
with fluid.dygraph.guard():
gloabl_norm_clip = GradClipByGlobalNorm( 5.0 )
sgd = SGDOptimizer(learning_rate=1.0)
init_value = np.random.uniform( -1, 1, (10, 10)).astype('float32')
linear = Linear( 10, 10)
out = linear( to_variable(init_value) )
loss = fluid.layers.reduce_mean( out )
loss.backward()
sgd.minimize(loss, grad_clip = gloabl_norm_clip)
"""
@imperative_base.no_grad
def __init__(self, max_global_norm, dtype='float32'):
self.max_global_norm = layers.fill_constant(
shape=[1], dtype=dtype, value=max_global_norm)
def __str__(self):
return "ClipByGlobalNorm, max_global_norm=%f" % (self.max_global_norm)
def _clip(self, para_and_grad):
out = []
norm_arr = []
for p, g in para_and_grad:
if g is None:
continue
merge_grad = g
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.merge_selected_rows(g)
merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
power = layers.square(merge_grad)
sum_t = layers.reduce_sum(power)
norm_arr.append(sum_t)
norm_global = layers.concat(norm_arr)
norm_global = layers.reduce_sum(norm_global)
norm_global = layers.sqrt(norm_global)
clip_scale = self.max_global_norm / (layers.elementwise_max(
x=norm_global, y=self.max_global_norm))
for p, g in para_and_grad:
if g is None:
out.append((p, g))
continue
new_grad = g * clip_scale
out.append((p, new_grad))
return out
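The dygraph_grad_clip module shown above is deleted by this commit. A rough sketch of the equivalent dygraph workflow using the unified classes in fluid.clip, adapted from the new test cases further below (layer sizes and learning rate are illustrative):

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.clip import GradientClipByGlobalNorm

with fluid.dygraph.guard():
    linear = fluid.dygraph.Linear(10, 10)
    inputs = fluid.dygraph.to_variable(
        np.random.uniform(-1, 1, (10, 10)).astype('float32'))
    loss = fluid.layers.reduce_mean(linear(inputs))
    loss.backward()
    sgd = fluid.optimizer.SGD(learning_rate=1.0,
                              parameter_list=linear.parameters())
    # GradClipByGlobalNorm from the removed module is replaced by
    # GradientClipByGlobalNorm, passed to minimize via grad_clip
    sgd.minimize(loss, grad_clip=GradientClipByGlobalNorm(clip_norm=5.0))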
...@@ -2409,7 +2409,6 @@ class Block(object):
trainable = v.trainable
optimize_attr = v.optimize_attr
regularizer = v.regularizer
gradient_clip_attr = v.gradient_clip_attr
error_clip = v.error_clip
elif type(v) == Variable:
var_type = "Variable"
...@@ -2432,7 +2431,6 @@ class Block(object):
trainable=trainable,
optimize_attr=optimize_attr,
regularizer=regularizer,
gradient_clip_attr=gradient_clip_attr,
error_clip=error_clip)
else:
var = Parameter(
...@@ -2445,7 +2443,6 @@ class Block(object):
trainable=trainable,
optimize_attr=optimize_attr,
regularizer=regularizer,
gradient_clip_attr=gradient_clip_attr,
error_clip=error_clip)
elif var_type == "Variable":
var = Variable(
...@@ -2723,7 +2720,6 @@ class Block(object):
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name)
else:
...@@ -2737,7 +2733,6 @@ class Block(object):
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name)
self.vars[new_p.name] = new_p
...@@ -4646,8 +4641,6 @@ class Parameter(Variable):
Default: {'learning_rate': 1.0}
regularizer(WeightDecayRegularizer): The Regularizer which will
be applied on the parameter. Default: None
gradient_clip_attr(BaseGradientClipAttr): The gradient clip strategy
which will be applied on the parameter. Default: None
do_model_average(bool): True if the model average strategy will
be applied on this parameter.
"""
...@@ -4687,8 +4680,6 @@ class Parameter(Variable):
self.regularizer = kwargs.get('regularizer', None)
self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
self.do_model_average = kwargs.get('do_model_average', None)
self.is_distributed = False
...@@ -4723,7 +4714,7 @@ class Parameter(Variable):
if with_details:
res_str = Variable.to_string(self, throw_on_error, True)
additional_attr = ("trainable", "optimize_attr", "regularizer",
"do_model_average")
for attr_name in additional_attr:
res_str += "%s: %s\n" % (attr_name,
cpt.to_text(getattr(self, attr_name)))
...@@ -4752,8 +4743,6 @@ class ParamBase(core.VarBase):
Default: {'learning_rate': 1.0}
regularizer(WeightDecayRegularizer): The Regularizer which will
be applied on the ParamBase. Default: None
gradient_clip_attr(BaseGradientClipAttr): The gradient clip strategy
which will be applied on the ParamBase. Default: None
do_model_average(bool): True if the model average strategy will
be applied on this ParamBase.
"""
...@@ -4792,8 +4781,6 @@ class ParamBase(core.VarBase):
self.regularizer = kwargs.get('regularizer', None)
self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
self.do_model_average = kwargs.get('do_model_average', None)
self.is_distributed = False
......
...@@ -24,7 +24,7 @@ from . import framework
from . import layers
from . import unique_name
from .backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
from .clip import GradientClipBase, error_clip_callback, append_gradient_clip_ops
from .framework import program_guard
from .initializer import Constant
from .layer_helper import LayerHelper
...@@ -109,6 +109,8 @@ class Optimizer(object):
self._opti_name_list = []
self._accumulators_holder = {}
self._param_device_map = dict()
# if grad_clip is passed into minimize, it will not be None
self._grad_clip = None
@framework.dygraph_only
def state_dict(self):
...@@ -690,12 +692,17 @@ class Optimizer(object):
# ...
optimizer.apply_gradients(params_grads)
"""
params_grads = sorted(params_grads, key=lambda x: x[0].name)
params_grads, table_param_and_grad, table_optimize_op = \
self._process_distribute_lookuptable(params_grads)
# 'minimize(grad_clip)' or 'set_gradient_clip'
if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads)
else:
params_grads = append_gradient_clip_ops(params_grads)
# Add regularization if any
params_grads = append_regularization_ops(params_grads,
...@@ -712,19 +719,19 @@ class Optimizer(object):
"""
Second part of `minimize`, appending optimization operators for
given `params_grads` pairs.
Args:
loss (Variable): loss variable to run optimizations.
startup_program (Program): startup_program for initializing parameters
in `parameter_list`.
params_grads (list): list of (param, grad) pair to do optimization.
Returns:
list: A list of operators appended to the current program.
"""
if framework.in_dygraph_mode():
with program_guard(framework.default_main_program(),
framework.default_startup_program()):
if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads)
params_grads = append_regularization_ops(params_grads,
self.regularization)
optimize_ops = self._create_optimization_pass(params_grads)
...@@ -809,16 +816,19 @@ class Optimizer(object):
Please refer to the example of current Optimizer.
"""
assert isinstance(loss, Variable), "The loss should be a Variable."
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
raise TypeError(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
self._grad_clip = grad_clip
params_grads = self.backward(
loss,
startup_program=startup_program,
parameter_list=parameter_list,
no_grad_set=no_grad_set)
if grad_clip is not None and framework.in_dygraph_mode():
# TODO(hongyu): FIX later, this is only for dygraph, should be work for static mode
params_grads = grad_clip(params_grads)
optimize_ops = self.apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads)
...@@ -1148,6 +1158,7 @@ class DGCMomentumOptimizer(Optimizer):
self.regular_type, self.regular_coeff = self._get_regularization_param(
self.regularization)
self._grad_clip = None
def _get_regularization_param(self, regularization):
regular_type = 0
...@@ -1404,24 +1415,28 @@ class DGCMomentumOptimizer(Optimizer):
dgc_op._set_attr(op_maker.kOpRoleVarAttrName(),
[param_var.name, grad_var.name])
@imperative_base.no_grad
def apply_gradients(self, params_grads):
params_grads = sorted(params_grads, key=lambda x: x[0].name)
params_grads, table_param_and_grad, table_optimize_op = \
self._process_distribute_lookuptable(params_grads)
not_dgc_params_grads = []
dgc_params_grads = []
# DGC clip and regularization in optimizer.backward
for param, grad in params_grads:
if not self._is_use_dgc(param, grad):
not_dgc_params_grads.append((param, grad))
else:
dgc_params_grads.append((param, grad))
# 'minimize(grad_clip)' or 'set_gradient_clip'
if self._grad_clip is not None:
not_dgc_params_grads = self._grad_clip(not_dgc_params_grads)
else:
not_dgc_params_grads = append_gradient_clip_ops(
not_dgc_params_grads)
# Add regularization if any
not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads,
self.regularization)
...@@ -3942,16 +3957,13 @@ class RecomputeOptimizer(Optimizer):
def apply_optimize(self, loss, startup_program, params_grads):
"""
call the apply_optimize function of self._optimizer
Args:
loss (Variable): loss variable to run optimizations.
startup_program (Program): startup_program for initializing parameters
in `parameter_list`.
params_grads (list): list of (param, grad) pair to do optimization.
Examples:
.. code-block:: python
import paddle.fluid as fluid
def mlp(input_x, input_y, hid_dim=128, label_dim=2):
...@@ -3979,7 +3991,6 @@ class RecomputeOptimizer(Optimizer):
cost, startup_program=None, params_grads=params_grads)
print("Finished apply_optimize")
"""
return self._optimizer.apply_optimize(
...@@ -3991,24 +4002,24 @@ class RecomputeOptimizer(Optimizer):
parameter_list=None,
no_grad_set=None,
grad_clip=None):
assert isinstance(loss, Variable), "The loss should be a Variable."
assert (self._checkpoints is not None
), "You should call _set_checkpoints first"
if framework.in_dygraph_mode():
raise NotImplementedError(
"DyGraph does not currently support recompute")
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
raise TypeError(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
self._optimizer._grad_clip = grad_clip
params_grads = self.backward(
loss,
startup_program=startup_program,
parameter_list=parameter_list,
no_grad_set=no_grad_set)
if grad_clip:
# TODO(guru4elephant): should add grad_clip for static graph
pass
optimize_ops = self.apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads)
......
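Putting the optimizer changes above together: when both the legacy set_gradient_clip and the new grad_clip argument are used, the clip passed to minimize() is the one that actually runs, and set_gradient_clip only triggers the warnings added in clip.py. A small sketch of that interaction (the network and hyperparameters are illustrative, not from the commit):

import paddle.fluid as fluid

x = fluid.data(name='x', shape=[-1, 8], dtype='float32')
loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))

clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
sgd = fluid.optimizer.SGD(learning_rate=0.01)

fluid.clip.set_gradient_clip(clip)   # legacy API; the new warnings mark it as redundant here
sgd.minimize(loss, grad_clip=clip)   # the grad_clip passed here is what actually clips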
...@@ -15,6 +15,7 @@
from __future__ import print_function
import six
import warnings
from .initializer import Initializer, Xavier, Constant
from .regularizer import WeightDecayRegularizer
...@@ -68,7 +69,6 @@ class ParamAttr(object):
learning_rate=1.0,
regularizer=None,
trainable=True,
gradient_clip=None,
do_model_average=True):
self.name = name
if isinstance(self.name, six.string_types) and self.name == "":
...@@ -78,7 +78,6 @@ class ParamAttr(object):
self.learning_rate = learning_rate
self.regularizer = regularizer
self.trainable = trainable
self.gradient_clip = gradient_clip
self.do_model_average = do_model_average
def _set_default_initializer(self, initializer):
...@@ -176,7 +175,6 @@ class ParamAttr(object):
},
'regularizer': self.regularizer,
'trainable': self.trainable,
'gradient_clip_attr': self.gradient_clip,
'do_model_average': self.do_model_average
}
if with_initializer:
...@@ -248,7 +246,6 @@ class WeightNormParamAttr(ParamAttr):
learning_rate=1.0,
regularizer=None,
trainable=True,
gradient_clip=None,
do_model_average=False):
super(WeightNormParamAttr, self).__init__(
name=name,
...@@ -256,6 +253,5 @@ class WeightNormParamAttr(ParamAttr):
learning_rate=learning_rate,
regularizer=regularizer,
trainable=trainable,
gradient_clip=gradient_clip,
do_model_average=do_model_average)
self.dim = dim
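Since ParamAttr and WeightNormParamAttr no longer accept a gradient_clip argument, per-parameter clipping is now expressed through the need_clip callback of the clip classes, as the transpiler test change below also shows. A hedged sketch of that migration (the parameter name 'fc_w' is taken from that test; the helper name is illustrative):

import paddle.fluid as fluid

# previously: fluid.ParamAttr(name='fc_w', gradient_clip=fluid.clip.GradientClipByValue(0.1))
# now: select the parameters to clip with need_clip and pass the clip object to minimize
def only_fc_w(param):
    return param.name == "fc_w"

clip = fluid.clip.GradientClipByValue(0.1, need_clip=only_fc_w)
# later: sgd_optimizer.minimize(avg_cost, grad_clip=clip)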
...@@ -476,15 +476,18 @@ class TestL2Decay(TranspilerTest):
size=1000,
act=None,
param_attr=fluid.ParamAttr(
name='fc_w', regularizer=fluid.regularizer.L2Decay()),
regularizer=fluid.regularizer.L2Decay(),
gradient_clip=fluid.clip.GradientClipByValue(0.1)),
bias_attr=fluid.ParamAttr(name='fc_b'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
sgd_optimizer.minimize(avg_cost)
def filter(param):
return param.name == "fc_w"
clip = fluid.clip.GradientClipByValue(0.1, need_clip=filter)
sgd_optimizer.minimize(avg_cost, grad_clip=clip)
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
......
...@@ -25,7 +25,7 @@ from paddle.fluid import core
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.clip import GradientClipByValue, GradientClipByNorm, GradientClipByGlobalNorm
class TestGradClipByGlobalNorm(unittest.TestCase):
...@@ -65,7 +65,7 @@ class TestGradClipByGlobalNorm(unittest.TestCase):
def get_dygrap_global_norm_result(self):
with fluid.dygraph.guard():
gloabl_norm_clip = GradientClipByGlobalNorm(self.max_global_norm)
p_g_var = []
for p, g in self.para_and_grad:
new_p = to_variable(p)
...@@ -135,7 +135,7 @@ class TestGradClipByNorm(unittest.TestCase):
def get_dygrap_norm_result(self):
with fluid.dygraph.guard():
norm_clip = GradientClipByNorm(self.max_norm)
p_g_var = []
for p, g in self.para_and_grad:
new_p = to_variable(p)
...@@ -200,8 +200,8 @@ class TestGradClipByValue(unittest.TestCase):
def get_dygrap_clip_result(self):
with fluid.dygraph.guard():
value_clip = GradientClipByValue(
max=self.max_value, min=self.min_value)
p_g_var = []
for p, g in self.para_and_grad:
new_p = to_variable(p)
...@@ -225,7 +225,7 @@ class TestGradClipByValue(unittest.TestCase):
for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))
def test_clip_by_value_2(self):
self.init_value()
self.init_scale = 0.2
...@@ -236,7 +236,7 @@ class TestGradClipByValue(unittest.TestCase):
for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))
def test_clip_by_value_3(self):
self.init_value()
self.init_scale = 0.5
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
...@@ -54,26 +54,32 @@ class TestGradientClip(unittest.TestCase):
self.BATCH_SIZE = 2
reader = fake_imdb_reader(self.word_dict_len, self.BATCH_SIZE * 100)
self.train_data = paddle.batch(reader, batch_size=self.BATCH_SIZE)
self.init()
def init(self):
pass
def get_places(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
return places
def clip_gradient(self, params_grads):
pass
def check_clip_result(self, out, out_clip):
pass
def check_gradient_clip(self, place):
prog = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(
main_program=prog, startup_program=startup_program):
image = fluid.data(name='x', shape=[-1, 784], dtype='float32')
label = fluid.data(name='y', shape=[-1, 1], dtype='int64')
hidden = fluid.layers.fc(input=image, size=32, act='relu')
predict = fluid.layers.fc(input=hidden, size=10, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(cost)
...@@ -84,45 +90,26 @@ class TestGradientClip(unittest.TestCase):
p_g = fluid.backward.append_backward(loss=avg_cost)
p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
p_g = sorted(p_g, key=lambda x: x[0].name)
p_g_clip = sorted(p_g_clip, key=lambda x: x[0].name)
with fluid.program_guard(
main_program=prog_clip, startup_program=startup_program):
p_g_clip = self.clip_gradient(p_g_clip)
grad_list = [elem[1] for elem in p_g]
grad_clip_list = [elem[1] for elem in p_g_clip]
train_reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=3)
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
exe.run(startup_program)
data = next(train_reader())
out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list)
out_clip = exe.run(prog_clip,
feed=feeder.feed(data),
fetch_list=grad_clip_list)
self.check_clip_result(out, out_clip)
def check_sparse_gradient_clip(self, place):
prog = fluid.framework.Program()
...@@ -134,11 +121,7 @@ class TestGradientClip(unittest.TestCase):
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
cost = bow_net(data, label, self.word_dict_len)
self.backward_and_optimize(cost)
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
...@@ -150,13 +133,345 @@ class TestGradientClip(unittest.TestCase):
print(val)
self.assertFalse(np.isnan(val))
def backward_and_optimize(cost):
pass
class TestGradientClipByGlobalNorm(TestGradientClip):
def init(self):
self.clip_norm = 0.2
def clip_gradient(self, params_grads):
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
print(clip)
return clip(params_grads)
def check_clip_result(self, out, out_clip):
global_norm = 0
for v in out:
global_norm += np.sum(np.power(v, 2))
global_norm = np.sqrt(global_norm)
scale = self.clip_norm / np.maximum(self.clip_norm, global_norm)
res = []
for i in range(len(out)):
out[i] = scale * out[i]
for u, v in zip(out, out_clip):
self.assertTrue(
np.allclose(
a=u, b=v, rtol=1e-5, atol=1e-8),
"gradient clip by global norm has wrong results!")
# test whether the output is right when using 'set_gradient_clip'
def test_old_gradient_clip(self):
def func(params_grads):
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
fluid.clip.set_gradient_clip(clip)
return fluid.clip.append_gradient_clip_ops(params_grads)
self.clip_gradient = func
self.check_gradient_clip(fluid.CPUPlace())
# test whether the output is right when using 'minimize(grad_clip)'
def test_new_gradient_clip(self):
def func(params_grads):
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
print(clip)
return clip(params_grads)
self.clip_gradient = func
self.check_gradient_clip(fluid.CPUPlace())
# invoke 'set_gradient_clip' in a wrong order
def test_wrong_API_order(self):
def backward_func(cost):
# no clip gradient
def fileter_func(param):
return param.name == "fc.w_0"
clip = fluid.clip.GradientClipByGlobalNorm(
clip_norm=5.0, need_clip=fileter_func)
fluid.clip.set_gradient_clip(clip)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
# if 'set_gradient_clip' and 'minimize(grad_clip)' together, 'set_gradient_clip' will be ineffective
sgd_optimizer.minimize(cost, grad_clip=clip)
# 'set_gradient_clip' must before 'minimize', otherwise, 'set_gradient_clip' will be ineffective
fluid.clip.set_gradient_clip(clip)
self.backward_and_optimize = backward_func
for place in self.get_places():
self.check_sparse_gradient_clip(place)
# if grad is None or not need clip
def test_none_grad(self):
def fileter_func(param):
return param.name == "x"
clip = fluid.clip.GradientClipByGlobalNorm(
self.clip_norm, need_clip=fileter_func)
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
y = fluid.default_main_program().global_block().create_parameter(
name="y", shape=[2, 3], dtype="float32")
# (x, None) should not be returned
params_grads = [(x, None), (x, y), (y, x)]
params_grads = clip(params_grads)
self.assertTrue(
len(clip(params_grads)) == 2,
"ClipByGlobalNorm: when grad is None, it shouldn't be returned by gradient clip!"
)
self.assertTrue(
params_grads[0][1].name != 'y',
"ClipByGlobalNorm: param_grad (x, y) should be clipped!")
# raise typeError
def test_typeError(self):
# the type of need_clip must be a function
with self.assertRaises(TypeError):
clip = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm, need_clip="test")
# the type of minimize(grad_clip=) must be an instance of GradientClipBase's derived class
with self.assertRaises(TypeError):
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
loss = fluid.layers.reduce_mean(x)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
sgd_optimizer.minimize(loss, grad_clip="test")
# the type of RecomputeOptimizer.minimize(grad_clip=) must be an instance of GradientClipBase's derived class
with self.assertRaises(TypeError):
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
loss = fluid.layers.reduce_mean(x)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
recompute_optimizer = fluid.optimizer.RecomputeOptimizer(
sgd_optimizer)
recompute_optimizer._set_checkpoints([x])
recompute_optimizer.minimize(loss, grad_clip="test")
class TestGradientClipByNorm(TestGradientClip):
def init(self):
self.clip_norm = 0.2
def clip_gradient(self, params_grads):
clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
print(clip)
return clip(params_grads)
def check_clip_result(self, out, out_clip):
for u, v in zip(out, out_clip):
norm = np.sqrt(np.sum(np.power(u, 2)))
scale = self.clip_norm / np.maximum(self.clip_norm, norm)
u = u * scale
self.assertTrue(
np.allclose(
a=u, b=v, rtol=1e-5, atol=1e-8),
"gradient clip by norm has wrong results!")
# test whether the output is right when using 'minimize(grad_clip)'
def test_gradient_clip(self):
self.check_gradient_clip(fluid.CPUPlace())
# if grad is None or not need clip
def test_none_grad(self):
def fileter_func(param):
return param.name == "z"
clip = fluid.clip.GradientClipByNorm(
self.clip_norm, need_clip=fileter_func)
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
y = fluid.default_main_program().global_block().create_parameter(
name="y", shape=[2, 3], dtype="float32")
# (x, None) should not be returned
params_grads = [(x, None), (x, y)]
params_grads = clip(params_grads)
self.assertTrue(
len(clip(params_grads)) == 1,
"ClipByNorm: when grad is None, it shouldn't be returned by gradient clip!"
)
self.assertTrue(
params_grads[0][1].name == 'y',
"ClipByNorm: grad should not be clipped when filtered out!")
class TestGradientClipByValue(TestGradientClip):
def init(self):
self.max = 0.2
self.min = 0.1
def clip_gradient(self, params_grads):
clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
print(clip)
return clip(params_grads)
def check_clip_result(self, out, out_clip):
for i, v in enumerate(out):
out[i] = np.clip(v, self.min, self.max)
for u, v in zip(out, out_clip):
u = np.clip(u, self.min, self.max)
self.assertTrue(
np.allclose(
a=u, b=v, rtol=1e-6, atol=1e-8),
"gradient clip by value has wrong results!")
# test whether the output is right when using 'minimize(grad_clip)'
def test_gradient_clip(self):
self.check_gradient_clip(fluid.CPUPlace())
# if grad is None, or the parameter does not need to be clipped
def test_none_grad(self):
def filter_func(param):
return param.name == "z"
clip = fluid.clip.GradientClipByValue(
self.max, self.min, need_clip=filter_func)
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
y = fluid.default_main_program().global_block().create_parameter(
name="y", shape=[2, 3], dtype="float32")
# (x, None) should not be returned
params_grads = [(x, None), (x, y)]
params_grads = clip(params_grads)
self.assertTrue(
len(params_grads) == 1,
"ClipByValue: when grad is None, it shouldn't be returned by gradient clip!"
)
self.assertTrue(
params_grads[0][1].name == 'y',
"ClipByValue: grad should not be clipped when filtered out!")
class TestDygraphGradientClip(unittest.TestCase):
def test_gradient_clip(self):
with fluid.dygraph.guard():
linear = fluid.dygraph.Linear(5, 5)
inputs = fluid.layers.uniform_random(
[16, 5], min=-10, max=10).astype('float32')
out = linear(fluid.dygraph.to_variable(inputs))
loss = fluid.layers.reduce_mean(out)
loss.backward()
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=0.0, parameter_list=linear.parameters())
self.check_clip_result(loss, sgd_optimizer)
def check_clip_result(self, loss, optimizer):
pass
class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
def setUp(self):
# only clip gradient of x (ParamBase)
def filter_func(param):
return param.name == "x"
self.clip_norm = 0.8
self.clip1 = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm, need_clip=filter_func)
self.clip2 = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm)
def check_clip_result(self, loss, optimizer):
# if grad is None
x = fluid.dygraph.to_variable(
np.array([2, 3]).astype("float32"), name="x")
y = fluid.dygraph.to_variable(
np.array([3, 4]).astype("float32"), name="y")
assert len(self.clip1([(x, x), (x, y), (x, None)])) == 2
# get params and grads from network
opt, params_grads = optimizer.minimize(loss, grad_clip=self.clip2)
_, grads = zip(*params_grads)
params_grads = self.clip2(params_grads)
_, grads_clip = zip(*params_grads)
global_norm = 0
for u in grads:
u = u.numpy()
global_norm += np.sum(np.power(u, 2))
global_norm = np.sqrt(global_norm)
global_norm_clip = 0
for v in grads_clip:
v = v.numpy()
global_norm_clip += np.sum(np.power(v, 2))
global_norm_clip = np.sqrt(global_norm_clip)
a = np.minimum(global_norm, self.clip_norm)
b = global_norm_clip
self.assertTrue(
np.isclose(
a=a, b=b, rtol=1e-6, atol=1e-8),
"gradient clip by global norm has wrong results, expetcd:%f, but recieved:%f"
% (a, b))
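The expected value above follows the global-norm rule: the L2 norm is taken over all gradients together and every gradient is scaled by the same factor, so the clipped global norm equals min(global_norm, clip_norm). A small numpy sketch with two arbitrary gradients:

g1 = np.array([3.0, 4.0], dtype="float32")
g2 = np.array([0.0, 12.0], dtype="float32")
clip_norm = 0.8
global_norm = np.sqrt(sum(np.sum(np.power(g, 2)) for g in [g1, g2]))  # sqrt(169) = 13
scale = clip_norm / np.maximum(clip_norm, global_norm)
g1_clip, g2_clip = g1 * scale, g2 * scale  # clipped global norm is 0.8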
class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
def setUp(self):
# only clip gradient of linear_0.w_0 (ParamBase)
def filter_func(param):
return param.name == "linear_0.w_0"
self.clip_norm = 0.8
self.clip = fluid.clip.GradientClipByNorm(
clip_norm=self.clip_norm, need_clip=filter_func)
def check_clip_result(self, loss, optimizer):
# if grad is None
x = fluid.dygraph.to_variable(np.array([2, 3]).astype("float32"))
assert len(self.clip([(x, None)])) == 0
self.clip([(fluid.dygraph.to_variable(np.array([2, 3])), None)])
# get params and grads from network
params_grads = optimizer.backward(loss)
_, grads = zip(*params_grads)
params_grads = self.clip(params_grads)
_, grads_clip = zip(*params_grads)
for u, v in zip(grads, grads_clip):
u = u.numpy()
v = v.numpy()
a = np.sqrt(np.sum(np.power(u, 2)))
a = np.minimum(a, self.clip_norm)
b = np.sqrt(np.sum(np.power(v, 2)))
self.assertTrue(
np.isclose(
a=a, b=b, rtol=1e-6, atol=1e-8),
"gradient clip by norm has wrong results, expetcd:%f, but recieved:%f"
% (a, b))
class TestDygraphGradientClipByValue(TestDygraphGradientClip):
def setUp(self):
# only clip gradient of linear_0.w_0 (ParamBase)
def filter_func(param):
return param.name == "linear_0.w_0"
self.max = 0.2
self.min = 0.1
self.clip = fluid.clip.GradientClipByValue(
max=self.max, min=self.min, need_clip=filter_func)
def check_clip_result(self, loss, optimizer):
# if grad is None
x = fluid.dygraph.to_variable(np.array([2, 3]).astype("float32"))
assert len(self.clip([(x, None)])) == 0
# get params and grads from network
params_grads = optimizer.backward(loss)
_, grads = zip(*params_grads)
params_grads = self.clip(params_grads)
_, grads_clip = zip(*params_grads)
for u, v in zip(grads, grads_clip):
u = np.clip(u.numpy(), self.min, self.max)
v = v.numpy()
self.assertTrue(
np.allclose(
a=u, b=v, rtol=1e-6, atol=1e-8),
"gradient clip by value has wrong results!")
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -331,7 +331,7 @@ class TestImperativeAutoPrune(unittest.TestCase): ...@@ -331,7 +331,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
model = MyLayer(size, vocab_size, size) model = MyLayer(size, vocab_size, size)
optimizer = fluid.optimizer.AdamOptimizer( optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters()) 0.001, parameter_list=model.parameters())
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001) grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
indices = fluid.dygraph.to_variable(indices) indices = fluid.dygraph.to_variable(indices)
embed = fluid.dygraph.to_variable(embed) embed = fluid.dygraph.to_variable(embed)
...@@ -350,7 +350,7 @@ class TestImperativeAutoPrune(unittest.TestCase): ...@@ -350,7 +350,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
model = MyLayer2(size, vocab_size, size) model = MyLayer2(size, vocab_size, size)
optimizer = fluid.optimizer.AdamOptimizer( optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters()) 0.001, parameter_list=model.parameters())
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001) grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
indices = fluid.dygraph.to_variable(indices) indices = fluid.dygraph.to_variable(indices)
emebd = fluid.dygraph.to_variable(embed) emebd = fluid.dygraph.to_variable(embed)
......
...@@ -49,7 +49,7 @@ class TestSimpleNet(unittest.TestCase): ...@@ -49,7 +49,7 @@ class TestSimpleNet(unittest.TestCase):
with fluid.dygraph.guard(place): with fluid.dygraph.guard(place):
backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = sort_sum_gradient backward_strategy.sort_sum_gradient = sort_sum_gradient
# grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0) # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64') input_word = np.array([[1, 2], [2, 1]]).astype('int64')
input = to_variable(input_word) input = to_variable(input_word)
...@@ -83,8 +83,7 @@ class TestSimpleNet(unittest.TestCase): ...@@ -83,8 +83,7 @@ class TestSimpleNet(unittest.TestCase):
with fluid.dygraph.guard(place): with fluid.dygraph.guard(place):
backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = sort_sum_gradient backward_strategy.sort_sum_gradient = sort_sum_gradient
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm( grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64') input_word = np.array([[1, 2], [2, 1]]).astype('int64')
input = to_variable(input_word) input = to_variable(input_word)
......