Commit 994438b1 (unverified)
Authored Oct 09, 2020 by Qi Li; committed via GitHub on Oct 09, 2020.
change clip grad api, test=develop (#27767)
Parent commit: 365c2c9c
Showing 6 changed files with 121 additions and 174 deletions (+121 −174).
python/paddle/fluid/clip.py                                 +62  −96
python/paddle/fluid/framework.py                             +9   −1
python/paddle/fluid/param_attr.py                           +19  −10
python/paddle/fluid/tests/unittests/test_gradient_clip.py   +15  −51
python/paddle/nn/__init__.py                                +10  −10
python/paddle/nn/clip.py                                     +6   −6
python/paddle/fluid/clip.py  (+62 −96)

@@ -26,8 +26,8 @@ from . import name_scope
 from .dygraph import base as imperative_base

 __all__ = [
-    'set_gradient_clip', 'ErrorClipByValue', 'GradientClipByValue',
-    'GradientClipByNorm', 'GradientClipByGlobalNorm'
+    'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue',
+    'ClipGradByNorm', 'ClipGradByGlobalNorm'
 ]

@@ -115,16 +115,9 @@ def error_clip_callback(block, context):
         error_clip._append_clip_op(block, grad_n)


-class GradientClipBase(object):
-    def __init__(self, need_clip=None):
-        if need_clip is not None and not callable(need_clip):
-            raise TypeError(
-                "The type of need_clip must be funciton, and it can filter out "
-                "parameter that does't need gradient clip. This function must return "
-                "True or False, and True means that clipping is required. Please refer to "
-                "API documention of GradientClipByGlobalNorm / GradientClipByNorm "
-                "/GradientClipByValue.")
-        self._need_clip_func = need_clip
+class ClipGradBase(object):
+    def __init__(self):
+        super(ClipGradBase, self).__init__()

     def __str__(self):
         raise NotImplementedError()

@@ -144,7 +137,7 @@ class GradientClipBase(object):
                 if getattr(p, 'gradient_clip_attr', None) is not None:
                     warnings.warn(
                         "'set_gradient_clip' will be ineffective, because you have "
-                        "set 'grad_clip' in 'optimizer'. So, 'set_gradient_clip' "
+                        "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' "
                         "is redundant and you can remove it.")
                     break
             return self._static_clip(params_grads)

@@ -156,7 +149,7 @@ class GradientClipBase(object):
         raise NotImplementedError()


-class GradientClipByValue(GradientClipBase):
+class ClipGradByValue(ClipGradBase):
     """
     Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].

@@ -164,19 +157,20 @@ class GradientClipByValue(GradientClipBase):
     - Any values greater than max are set to ``max``.

-    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters
-    in ``Program`` . If ``need_clip`` is not None, then only part of gradients can be selected for gradient clipping.
+    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters
+    set in ``optimizer``. If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

     Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
     (for example: :ref:`api_paddle_optimizer_SGD`).

+    Note:
+        ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+
     Args:
         max (float): The maximum value to clip by.
         min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max``
             automatically. In this case, ``max`` must be greater than 0.
-        need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool``
-            (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None,
-            and gradients of all parameters in the network will be clipped.

     Examples:
         .. code-block:: python

@@ -184,29 +178,20 @@ class GradientClipByValue(GradientClipBase):
             import paddle

             x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-            linear = paddle.nn.Linear(10, 10)
+            linear = paddle.nn.Linear(in_features=10, out_features=10,
+                                      weight_attr=paddle.ParamAttr(need_clip=True),
+                                      bias_attr=paddle.ParamAttr(need_clip=False))
             out = linear(x)
             loss = paddle.mean(out)
             loss.backward()

-            # clip all parameters in network:
-            clip = paddle.nn.GradientClipByValue(min=-1, max=1)
-
-            # clip a part of parameters in network: (e.g. linear_0.w_0)
-            # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
-            # def fileter_func(ParamBase):
-            #     # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
-            #     return ParamBase.name == "linear_0.w_0"
-            #     # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
-            #     return ParamBase.name == linear.weight.name
-            # clip = paddle.nn.GradientClipByValue(min=-1, max=1, need_clip=fileter_func)
+            clip = paddle.nn.ClipGradByValue(min=-1, max=1)
             sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
             sdg.step()
     """

-    def __init__(self, max, min=None, need_clip=None):
-        super(GradientClipByValue, self).__init__(need_clip)
+    def __init__(self, max, min=None):
+        super(ClipGradByValue, self).__init__()
         if min is None:
             assert (max > 0.0)
             min = -max

@@ -214,7 +199,7 @@ class GradientClipByValue(GradientClipBase):
         self.min = float(min)

     def __str__(self):
-        return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max)
+        return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max)

     @imperative_base.no_grad
     def _dygraph_clip(self, params_grads):

@@ -222,7 +207,7 @@ class GradientClipByValue(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue
             new_grad = layers.clip(x=g, min=self.min, max=self.max)

@@ -236,8 +221,7 @@ class GradientClipByValue(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue

@@ -256,7 +240,7 @@ class GradientClipByValue(GradientClipBase):
         return param, new_grad


-class GradientClipByNorm(GradientClipBase):
+class ClipGradByNorm(ClipGradBase):
     """
     Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .

@@ -264,8 +248,8 @@ class GradientClipByNorm(GradientClipBase):
     - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.

-    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters
-    in ``Program`` . If ``need_clip`` is not None, then only part of gradients can be selected for gradient clipping.
+    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters
+    set in ``optimizer``. If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

     Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
     (for example: :ref:`api_paddle_optimizer_SGD`).

@@ -287,11 +271,12 @@ class GradientClipByNorm(GradientClipBase):
     .. math::
        norm(X) = ( \\sum_{i=1}^{n}|x\_i|^2)^{ \\frac{1}{2}}

+    Note:
+        ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+
     Args:
         clip_norm(float): The maximum norm value.
-        need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool``
-            (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None,
-            and gradients of all parameters in the network will be clipped.

     Examples:
         .. code-block:: python

@@ -299,29 +284,20 @@ class GradientClipByNorm(GradientClipBase):
            import paddle

            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-           linear = paddle.nn.Linear(10, 10)
+           linear = paddle.nn.Linear(in_features=10, out_features=10,
+                                     weight_attr=paddle.ParamAttr(need_clip=True),
+                                     bias_attr=paddle.ParamAttr(need_clip=False))
            out = linear(x)
            loss = paddle.mean(out)
            loss.backward()

-           # clip all parameters in network:
-           clip = paddle.nn.GradientClipByNorm(clip_norm=1.0)
-
-           # clip a part of parameters in network: (e.g. linear_0.w_0)
-           # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
-           # def fileter_func(ParamBase):
-           #     # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
-           #     return ParamBase.name == "linear_0.w_0"
-           #     # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
-           #     return ParamBase.name == linear.weight.name
-           # clip = paddle.nn.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func)
+           clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            sdg.step()
     """

-    def __init__(self, clip_norm, need_clip=None):
-        super(GradientClipByNorm, self).__init__(need_clip)
+    def __init__(self, clip_norm):
+        super(ClipGradByNorm, self).__init__()
         self.clip_norm = float(clip_norm)

     def __str__(self):

@@ -333,7 +309,7 @@ class GradientClipByNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue
             new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)

@@ -347,8 +323,7 @@ class GradientClipByNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue

@@ -367,7 +342,7 @@ class GradientClipByNorm(GradientClipBase):
         return param, new_grad


-class GradientClipByGlobalNorm(GradientClipBase):
+class ClipGradByGlobalNorm(ClipGradBase):
     """
     Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
     :math:`t\_list` , and limit it to ``clip_norm`` .

@@ -376,8 +351,8 @@ class GradientClipByGlobalNorm(GradientClipBase):
     - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.

-    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters
-    in ``Program`` . If ``need_clip`` is not None, then only part of gradients can be selected for gradient clipping.
+    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters
+    set in ``optimizer``. If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

     Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
     (for example: :ref:`api_paddle_optimizer_SGD`).

@@ -394,12 +369,13 @@ class GradientClipByGlobalNorm(GradientClipBase):
        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}

+    Note:
+        ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+
     Args:
         clip_norm (float): The maximum norm value.
-        group_name (str, optional): The group name for this clip. Default value is ``default_group``
-        need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool``
-            (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None,
-            and gradients of all parameters in the network will be clipped.
+        group_name (str, optional): The group name for this clip. Default value is ``default_group``.

     Examples:
         .. code-block:: python

@@ -407,29 +383,20 @@ class GradientClipByGlobalNorm(GradientClipBase):
            import paddle

            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-           linear = paddle.nn.Linear(10, 10)
+           linear = paddle.nn.Linear(in_features=10, out_features=10,
+                                     weight_attr=paddle.ParamAttr(need_clip=True),
+                                     bias_attr=paddle.ParamAttr(need_clip=False))
            out = linear(x)
            loss = paddle.mean(out)
            loss.backward()

-           # clip all parameters in network:
-           clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0)
-
-           # clip a part of parameters in network: (e.g. linear_0.w_0)
-           # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
-           # def fileter_func(ParamBase):
-           #     # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
-           #     return ParamBase.name == "linear_0.w_0"
-           #     # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
-           #     return ParamBase.name == linear.weight.name
-           # clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func)
+           clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            sdg.step()
     """

-    def __init__(self, clip_norm, group_name="default_group", need_clip=None):
-        super(GradientClipByGlobalNorm, self).__init__(need_clip)
+    def __init__(self, clip_norm, group_name="default_group"):
+        super(ClipGradByGlobalNorm, self).__init__()
         self.clip_norm = float(clip_norm)
         self.group_name = group_name

@@ -443,7 +410,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 continue
             merge_grad = g
             if g.type == core.VarDesc.VarType.SELECTED_ROWS:

@@ -469,7 +436,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue
             new_grad = layers.elementwise_mul(x=g, y=clip_var)

@@ -484,8 +451,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 continue
             merge_grad = g
             with p.block.program._optimized_guard([p, g]):

@@ -518,8 +484,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue

@@ -670,9 +635,9 @@ def set_gradient_clip(clip, param_list=None, program=None):
         "This method can reduce the mistakes, please "
         "refer to documention of 'optimizer'.")

-    if not isinstance(clip, GradientClipBase):
+    if not isinstance(clip, ClipGradBase):
         raise TypeError(
-            "'clip' should be an instance of GradientClipBase's derived class")
+            "'clip' should be an instance of ClipGradBase's derived class")
     if program is None:
         program = framework.default_main_program()

@@ -708,7 +673,7 @@ def append_gradient_clip_ops(param_grads):
         clip_attr = getattr(p, 'gradient_clip_attr', None)
         if clip_attr is None:
             return param_grads
-        if not isinstance(clip_attr, GradientClipBase):
+        if not isinstance(clip_attr, ClipGradBase):
             raise TypeError(
                 "clip attribute should be an instance of GradientClipBase")

@@ -754,6 +719,7 @@ def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
                 op._set_attr('op_role_var', correct_p_g)


-ClipByValue = GradientClipByValue
-ClipByNorm = GradientClipByNorm
-ClipByGlobalNorm = GradientClipByGlobalNorm
+GradientClipBase = ClipGradBase
+GradientClipByValue = ClipGradByValue
+GradientClipByNorm = ClipGradByNorm
+GradientClipByGlobalNorm = ClipGradByGlobalNorm
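Taken together, the clip.py changes rename the gradient-clip classes and replace the per-clip need_clip callable with a per-parameter flag. A minimal usage sketch consistent with the updated docstrings (dygraph mode assumed; layer sizes and variable names are illustrative only):

import paddle

# Parameters opt in or out of clipping through ParamAttr(need_clip=...),
# instead of passing a filter function to the clip object.
linear = paddle.nn.Linear(
    in_features=10, out_features=10,
    weight_attr=paddle.ParamAttr(need_clip=True),   # weight gradient gets clipped
    bias_attr=paddle.ParamAttr(need_clip=False))    # bias gradient passes through

x = paddle.uniform([4, 10], min=-1.0, max=1.0, dtype='float32')
loss = paddle.mean(linear(x))
loss.backward()

# The clip object now only carries the clipping bounds.
clip = paddle.nn.ClipGradByValue(min=-1.0, max=1.0)
sgd = paddle.optimizer.SGD(learning_rate=0.1,
                           parameters=linear.parameters(),
                           grad_clip=clip)
sgd.step()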
python/paddle/fluid/framework.py  (+9 −1)

@@ -5123,6 +5123,8 @@ class Parameter(Variable):
             be applied on the parameter. Default: None
         do_model_average(bool): True if the model average strategy will
             be applied on this parameter.
+        need_clip (bool): Whether the parameter gradient need to be cliped
+            in optimizer. Default is True.
     """

     def __init__(self,

@@ -5162,6 +5164,8 @@ class Parameter(Variable):
         self.do_model_average = kwargs.get('do_model_average', None)

+        self.need_clip = kwargs.get('need_clip', True)
+
         self.is_distributed = False

     def __str__(self):

@@ -5194,7 +5198,7 @@ class Parameter(Variable):
         if with_details:
             res_str = Variable.to_string(self, throw_on_error, True)
             additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "do_model_average")
+                               "do_model_average", "need_clip")
             for attr_name in additional_attr:
                 res_str += "%s: %s\n" % (attr_name,
                                          cpt.to_text(getattr(self, attr_name)))

@@ -5226,6 +5230,8 @@ class ParamBase(core.VarBase):
             be applied on the ParamBase. Default: None
         do_model_average(bool): True if the model average strategy will
             be applied on this ParamBase.
+        need_clip (bool): Whether the parameter gradient need to be cliped
+            in optimizer. Default is True.
     """

     @dygraph_only

@@ -5265,6 +5271,8 @@ class ParamBase(core.VarBase):
         self.do_model_average = kwargs.get('do_model_average', None)

+        self.need_clip = kwargs.get('need_clip', True)
+
         self.is_distributed = False

         # self.block = default_main_program().global_block()
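The framework.py change is what makes the per-parameter flag visible to the clip classes: both Parameter and ParamBase now carry need_clip (default True). A simplified sketch of the selection logic the rewritten _dygraph_clip/_static_clip methods apply, as an illustration rather than the exact implementation:

def split_by_need_clip(params_grads):
    """Split (param, grad) pairs the way the new clip classes do:
    pairs with no gradient are dropped, pairs whose parameter was created
    with need_clip=False pass through unclipped, the rest get clipped."""
    to_clip, passthrough = [], []
    for p, g in params_grads:
        if g is None:
            continue
        if getattr(p, 'need_clip', True) is False:
            passthrough.append((p, g))
        else:
            to_clip.append((p, g))
    return to_clip, passthrough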
python/paddle/fluid/param_attr.py  (+19 −10)

@@ -36,8 +36,8 @@ class ParamAttr(object):
     Note:
         ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
-        It is recommended to set ``grad_clip`` in ``optimizer`` to clip gradient.
-        There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+        There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
         :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .

     Parameters:

@@ -57,6 +57,7 @@ class ParamAttr(object):
         trainable (bool): Whether this parameter is trainable. Default True.
         do_model_average (bool): Whether this parameter should do model average
             when model average is enabled. Default False.
+        need_clip (bool): Whether the parameter gradient need to be cliped in optimizer. Default is True.

     Examples:
         .. code-block:: python

@@ -78,7 +79,8 @@ class ParamAttr(object):
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 do_model_average=True):
+                 do_model_average=True,
+                 need_clip=True):
         if sys.version_info.major == 2:
             check_type(name, "name", (str, type(None), unicode), "ParamAttr")

@@ -87,6 +89,7 @@ class ParamAttr(object):
         check_type(learning_rate, "learning_rate", (float, int), "ParamAttr")
         check_type(trainable, "trainable", (bool), "ParamAttr")
         check_type(do_model_average, "do_model_average", (bool), "ParamAttr")
+        check_type(need_clip, "need_clip", (bool), "ParamAttr")
         check_type(initializer, "initializer", (Initializer, type(None)),
                    "ParamAttr")
         check_type(regularizer, "regularizer",

@@ -101,6 +104,7 @@ class ParamAttr(object):
         self.regularizer = regularizer
         self.trainable = trainable
         self.do_model_average = do_model_average
+        self.need_clip = need_clip

     def _set_default_initializer(self, initializer):
         """

@@ -197,7 +201,8 @@ class ParamAttr(object):
             },
             'regularizer': self.regularizer,
             'trainable': self.trainable,
-            'do_model_average': self.do_model_average
+            'do_model_average': self.do_model_average,
+            'need_clip': self.need_clip
         }
         if with_initializer:
             kwargs['initializer'] = self.initializer

@@ -219,9 +224,9 @@ class WeightNormParamAttr(ParamAttr):
         <https://arxiv.org/pdf/1602.07868.pdf>`_.

     Note:
-        ``gradient_clip`` of ``WeightNormParamAttr`` HAS BEEN DEPRECATED since 2.0.
-        It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient.
-        There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
+        ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+        There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
         :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .

@@ -248,6 +253,7 @@ class WeightNormParamAttr(ParamAttr):
         trainable(bool, optional): Whether this parameter is trainable. Default True.
         do_model_average(bool, optional): Whether this parameter should do model average.
             Default False.
+        need_clip (bool, optional): Whether the parameter gradient need to be cliped in optimizer. Default is True.

     Examples:
         .. code-block:: python

@@ -267,7 +273,8 @@ class WeightNormParamAttr(ParamAttr):
                 learning_rate=1.0,
                 regularizer=paddle.regularizer.L2Decay(0.1),
                 trainable=True,
-                do_model_average=False))
+                do_model_average=False,
+                need_clip=True))
     """
     # List to record the parameters reparameterized by weight normalization.

@@ -283,12 +290,14 @@ class WeightNormParamAttr(ParamAttr):
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 do_model_average=False):
+                 do_model_average=False,
+                 need_clip=True):
         super(WeightNormParamAttr, self).__init__(
             name=name,
             initializer=initializer,
             learning_rate=learning_rate,
             regularizer=regularizer,
             trainable=trainable,
-            do_model_average=do_model_average)
+            do_model_average=do_model_average,
+            need_clip=need_clip)
         self.dim = dim
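Because ParamAttr now validates and forwards need_clip, the flag should surface directly on the created parameter. A hedged sketch (assuming the attribute is propagated to the dygraph ParamBase as the framework.py change suggests; the parameter name is illustrative):

import paddle

# need_clip defaults to True; False exempts this parameter from clipping.
w_attr = paddle.ParamAttr(name="fc_w", need_clip=False)
linear = paddle.nn.Linear(10, 10, weight_attr=w_attr)

print(linear.weight.name, linear.weight.need_clip)  # expected: fc_w False
print(linear.bias.name, linear.bias.need_clip)      # expected: <default name> True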
python/paddle/fluid/tests/unittests/test_gradient_clip.py  (+15 −51)

@@ -185,12 +185,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # invoke 'set_gradient_clip' in a wrong order
     def test_wrong_API_order(self):
         def backward_func(cost):
-            # no clip gradient
-            def fileter_func(param):
-                return param.name == "fc.w_0"
-
-            clip = fluid.clip.GradientClipByGlobalNorm(
-                clip_norm=5.0, need_clip=fileter_func)
+            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
             fluid.clip.set_gradient_clip(clip)
             sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01,
                                                 grad_clip=clip)

@@ -205,11 +200,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # if grad is None or not need clip
     def test_none_grad(self):
-        def fileter_func(param):
-            return param.name == "x"
-
-        clip = fluid.clip.GradientClipByGlobalNorm(
-            self.clip_norm, need_clip=fileter_func)
+        clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm)
         x = fluid.default_main_program().global_block().create_parameter(
             name="x", shape=[2, 3], dtype="float32")
         y = fluid.default_main_program().global_block().create_parameter(

@@ -228,11 +219,6 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # raise typeError
     def test_tpyeError(self):
-        # the type of need_clip must be an funciton
-        with self.assertRaises(TypeError):
-            clip = fluid.clip.GradientClipByGlobalNorm(
-                clip_norm=self.clip_norm, need_clip="test")
-
         # the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class
         with self.assertRaises(TypeError):
             sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1,

@@ -264,26 +250,22 @@ class TestGradientClipByNorm(TestGradientClip):
     # if grad is None or not need clip
     def test_none_grad(self):
-        def fileter_func(param):
-            return param.name == "z"
-
-        clip = fluid.clip.GradientClipByNorm(
-            self.clip_norm, need_clip=fileter_func)
+        clip = fluid.clip.GradientClipByNorm(self.clip_norm)
         x = fluid.default_main_program().global_block().create_parameter(
-            name="x", shape=[2, 3], dtype="float32")
+            name="x", shape=[2, 3], dtype="float32", need_clip=False)
         y = fluid.default_main_program().global_block().create_parameter(
-            name="y", shape=[2, 3], dtype="float32")
+            name="y", shape=[2, 3], dtype="float32", need_clip=False)
         # (x, None) should not be returned
         params_grads = [(x, None), (x, y)]
         params_grads = clip(params_grads)
         self.assertTrue(
             len(clip(params_grads)) == 1,
-            "ClipByNorm: when grad is None, it shouldn't be returned by gradient clip!"
+            "ClipGradByNorm: when grad is None, it shouldn't be returned by gradient clip!"
         )
         self.assertTrue(
             params_grads[0][1].name == 'y',
-            "ClipByNorm: grad should not be clipped when filtered out!")
+            "ClipGradByNorm: grad should not be clipped when filtered out!")


 class TestGradientClipByValue(TestGradientClip):

@@ -312,26 +294,22 @@ class TestGradientClipByValue(TestGradientClip):
     # if grad is None or not need clip
     def test_none_grad(self):
-        def fileter_func(param):
-            return param.name == "z"
-
-        clip = fluid.clip.GradientClipByValue(
-            self.max, self.min, need_clip=fileter_func)
+        clip = fluid.clip.GradientClipByValue(self.max, self.min)
         x = fluid.default_main_program().global_block().create_parameter(
-            name="x", shape=[2, 3], dtype="float32")
+            name="x", shape=[2, 3], dtype="float32", need_clip=False)
         y = fluid.default_main_program().global_block().create_parameter(
-            name="y", shape=[2, 3], dtype="float32")
+            name="y", shape=[2, 3], dtype="float32", need_clip=False)
         # (x, None) should not be returned
         params_grads = [(x, None), (x, y)]
         params_grads = clip(params_grads)
         self.assertTrue(
             len(clip(params_grads)) == 1,
-            "ClipByValue: when grad is None, it shouldn't be returned by gradient clip!"
+            "ClipGradByValue: when grad is None, it shouldn't be returned by gradient clip!"
         )
         self.assertTrue(
             params_grads[0][1].name == 'y',
-            "ClipByValue: grad should not be clipped when filtered out!")
+            "ClipGradByValue: grad should not be clipped when filtered out!")


 class TestDygraphGradientClip(unittest.TestCase):

@@ -355,13 +333,9 @@ class TestDygraphGradientClip(unittest.TestCase):
 class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
     def setUp(self):
-        # only clip gradient of x (ParamBase)
-        def fileter_func(param):
-            return param.name == "x"
-
         self.clip_norm = 0.8
         self.clip1 = fluid.clip.GradientClipByGlobalNorm(
-            clip_norm=self.clip_norm, need_clip=fileter_func)
+            clip_norm=self.clip_norm)
         self.clip2 = fluid.clip.GradientClipByGlobalNorm(
             clip_norm=self.clip_norm)

@@ -401,13 +375,8 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
 class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
     def setUp(self):
-        # only clip gradient of linear_0.w_0 (ParamBase)
-        def fileter_func(param):
-            return param.name == "linear_0.w_0"
-
         self.clip_norm = 0.8
-        self.clip = fluid.clip.GradientClipByNorm(
-            clip_norm=self.clip_norm, need_clip=fileter_func)
+        self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)

     def check_clip_result(self, loss, optimizer):
         # if grad is None

@@ -435,14 +404,9 @@ class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
 class TestDygraphGradientClipByValue(TestDygraphGradientClip):
     def setUp(self):
-        # only clip gradient of linear_0.w_0 (ParamBase)
-        def fileter_func(param):
-            return param.name == "linear_0.w_0"
-
         self.max = 0.2
         self.min = 0.1
-        self.clip = fluid.clip.GradientClipByValue(
-            max=self.max, min=self.min, need_clip=fileter_func)
+        self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)

     def check_clip_result(self, loss, optimizer):
         # if grad is None
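The updated tests exercise the same behaviour through the parameter flag instead of a filter function; a condensed version of the pattern they use (illustrative test name, same assertions as the test_none_grad cases above):

import unittest
import paddle.fluid as fluid


class TestNeedClipFlag(unittest.TestCase):
    def test_param_with_need_clip_false_is_not_clipped(self):
        clip = fluid.clip.GradientClipByNorm(clip_norm=0.5)
        block = fluid.default_main_program().global_block()
        # Parameters created with need_clip=False should pass through untouched.
        x = block.create_parameter(
            name="x", shape=[2, 3], dtype="float32", need_clip=False)
        y = block.create_parameter(
            name="y", shape=[2, 3], dtype="float32", need_clip=False)
        params_grads = clip([(x, None), (x, y)])
        self.assertEqual(len(params_grads), 1)          # (x, None) is dropped
        self.assertEqual(params_grads[0][1].name, "y")  # grad "y" left unclipped


if __name__ == "__main__":
    unittest.main()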
python/paddle/nn/__init__.py  (+10 −10)

@@ -31,9 +31,9 @@ __all__ += rnn.__all__
 __all__ += weight_norm_hook.__all__

 # TODO: define alias in nn directory
-from .clip import GradientClipByGlobalNorm  #DEFINE_ALIAS
-from .clip import GradientClipByNorm  #DEFINE_ALIAS
-from .clip import GradientClipByValue  #DEFINE_ALIAS
+from .clip import ClipGradByGlobalNorm  #DEFINE_ALIAS
+from .clip import ClipGradByNorm  #DEFINE_ALIAS
+from .clip import ClipGradByValue  #DEFINE_ALIAS
 # from .clip import set_gradient_clip  #DEFINE_ALIAS
 from .clip import clip  #DEFINE_ALIAS
 from .clip import clip_by_norm  #DEFINE_ALIAS

@@ -51,13 +51,13 @@ from .decode import beam_search_decode  #DEFINE_ALIAS
 # from .decode import dynamic_decode  #DEFINE_ALIAS
 from .decode import gather_tree  #DEFINE_ALIAS
 # from .input import Input  #DEFINE_ALIAS
-from .layer.activation import ELU
-from .layer.activation import GELU
-from .layer.activation import Tanh
-from .layer.activation import Hardshrink
-from .layer.activation import Hardtanh
-from .layer.activation import PReLU
-from .layer.activation import ReLU
+from .layer.activation import ELU  #DEFINE_ALIAS
+from .layer.activation import GELU  #DEFINE_ALIAS
+from .layer.activation import Tanh  #DEFINE_ALIAS
+from .layer.activation import Hardshrink  #DEFINE_ALIAS
+from .layer.activation import Hardtanh  #DEFINE_ALIAS
+from .layer.activation import PReLU  #DEFINE_ALIAS
+from .layer.activation import ReLU  #DEFINE_ALIAS
 from .layer.activation import ReLU6  #DEFINE_ALIAS
 from .layer.activation import SELU  #DEFINE_ALIAS
 from .layer.activation import LeakyReLU  #DEFINE_ALIAS
python/paddle/nn/clip.py  (+6 −6)

@@ -13,18 +13,18 @@
 # limitations under the License.

 # TODO: define the functions to clip gradient of parameter
-from ..fluid.clip import GradientClipByGlobalNorm  #DEFINE_ALIAS
-from ..fluid.clip import GradientClipByNorm  #DEFINE_ALIAS
-from ..fluid.clip import GradientClipByValue  #DEFINE_ALIAS
+from ..fluid.clip import ClipGradByGlobalNorm  #DEFINE_ALIAS
+from ..fluid.clip import ClipGradByNorm  #DEFINE_ALIAS
+from ..fluid.clip import ClipGradByValue  #DEFINE_ALIAS
 from ..fluid.layers import clip  #DEFINE_ALIAS
 from ..fluid.layers import clip_by_norm  #DEFINE_ALIAS

 __all__ = [
     #       'ErrorClipByValue',
-    'GradientClipByGlobalNorm',
-    'GradientClipByNorm',
-    'GradientClipByValue',
+    'ClipGradByGlobalNorm',
+    'ClipGradByNorm',
+    'ClipGradByValue',
     #       'set_gradient_clip',
     'clip',
     'clip_by_norm'
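Since fluid.clip keeps the old GradientClip* names as aliases of the renamed classes, both spellings should resolve to the same objects after this commit; a quick sanity check (a sketch, relying on the aliasing shown at the bottom of the clip.py diff):

import paddle
import paddle.fluid as fluid

# paddle.nn re-exports the fluid classes, and the old names remain as aliases.
assert paddle.nn.ClipGradByValue is fluid.clip.ClipGradByValue
assert fluid.clip.GradientClipByValue is fluid.clip.ClipGradByValue
assert issubclass(paddle.nn.ClipGradByGlobalNorm, fluid.clip.ClipGradBase)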