Commit 994438b1 (unverified)
Authored Oct 09, 2020 by Qi Li; committed via GitHub on Oct 09, 2020

change clip grad api, test=develop (#27767)

Parent: 365c2c9c

Showing 6 changed files with 121 additions and 174 deletions (+121 -174):
python/paddle/fluid/clip.py                                  +62  -96
python/paddle/fluid/framework.py                              +9   -1
python/paddle/fluid/param_attr.py                            +19  -10
python/paddle/fluid/tests/unittests/test_gradient_clip.py    +15  -51
python/paddle/nn/__init__.py                                 +10  -10
python/paddle/nn/clip.py                                      +6   -6
python/paddle/fluid/clip.py (view file @ 994438b1)

@@ -26,8 +26,8 @@ from . import name_scope
 from .dygraph import base as imperative_base
 
 __all__ = [
-    'set_gradient_clip', 'ErrorClipByValue', 'GradientClipByValue',
-    'GradientClipByNorm', 'GradientClipByGlobalNorm'
+    'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue',
+    'ClipGradByNorm', 'ClipGradByGlobalNorm'
 ]

@@ -115,16 +115,9 @@ def error_clip_callback(block, context):
             error_clip._append_clip_op(block, grad_n)
 
 
-class GradientClipBase(object):
-    def __init__(self, need_clip=None):
-        if need_clip is not None and not callable(need_clip):
-            raise TypeError(
-                "The type of need_clip must be funciton, and it can filter out "
-                "parameter that does't need gradient clip. This function must return "
-                "True or False, and True means that clipping is required. Please refer to "
-                "API documention of GradientClipByGlobalNorm / GradientClipByNorm "
-                "/GradientClipByValue.")
-        self._need_clip_func = need_clip
+class ClipGradBase(object):
+    def __init__(self):
+        super(ClipGradBase, self).__init__()
 
     def __str__(self):
         raise NotImplementedError()

@@ -144,7 +137,7 @@ class GradientClipBase(object):
             if getattr(p, 'gradient_clip_attr', None) is not None:
                 warnings.warn(
                     "'set_gradient_clip' will be ineffective, because you have "
-                    "set 'grad_clip' in 'optimizer'. So, 'set_gradient_clip' "
+                    "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' "
                     "is redundant and you can remove it.")
                 break
         return self._static_clip(params_grads)

@@ -156,7 +149,7 @@ class GradientClipBase(object):
         raise NotImplementedError()
 
 
-class GradientClipByValue(GradientClipBase):
+class ClipGradByValue(ClipGradBase):
     """
     Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].

@@ -164,19 +157,20 @@ class GradientClipByValue(GradientClipBase):
     - Any values greater than max are set to ``max``.
 
-    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
-    is not None, then only part of gradients can be selected for gradient clipping.
+    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
+    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
 
     Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
     (for example: :ref:`api_paddle_optimizer_SGD`).
 
+    Note:
+        ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+
     Args:
         max (float): The maximum value to clip by.
         min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max``
             automatically. In this case, ``max`` must be greater than 0.
-        need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool``
-            (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None,
-            and gradients of all parameters in the network will be clipped.
 
     Examples:
         .. code-block:: python

@@ -184,29 +178,20 @@ class GradientClipByValue(GradientClipBase):
             import paddle
 
             x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-            linear = paddle.nn.Linear(10, 10)
+            linear = paddle.nn.Linear(in_features=10, out_features=10,
+                                      weight_attr=paddle.ParamAttr(need_clip=True),
+                                      bias_attr=paddle.ParamAttr(need_clip=False))
             out = linear(x)
             loss = paddle.mean(out)
             loss.backward()
 
-            # clip all parameters in network:
-            clip = paddle.nn.GradientClipByValue(min=-1, max=1)
-
-            # clip a part of parameters in network: (e.g. linear_0.w_0)
-            # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
-            # def fileter_func(ParamBase):
-            # # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
-            # return ParamBase.name == "linear_0.w_0"
-            # # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
-            # return ParamBase.name == linear.weight.name
-            # clip = paddle.nn.GradientClipByValue(min=-1, max=1, need_clip=fileter_func)
-
+            clip = paddle.nn.ClipGradByValue(min=-1, max=1)
             sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
             sdg.step()
     """
 
-    def __init__(self, max, min=None, need_clip=None):
-        super(GradientClipByValue, self).__init__(need_clip)
+    def __init__(self, max, min=None):
+        super(ClipGradByValue, self).__init__()
         if min is None:
             assert (max > 0.0)
             min = -max

@@ -214,7 +199,7 @@ class GradientClipByValue(GradientClipBase):
         self.min = float(min)
 
     def __str__(self):
-        return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max)
+        return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max)
 
     @imperative_base.no_grad
     def _dygraph_clip(self, params_grads):

@@ -222,7 +207,7 @@ class GradientClipByValue(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue
             new_grad = layers.clip(x=g, min=self.min, max=self.max)

@@ -236,8 +221,7 @@ class GradientClipByValue(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(
-                    p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue

@@ -256,7 +240,7 @@ class GradientClipByValue(GradientClipBase):
         return param, new_grad
 
 
-class GradientClipByNorm(GradientClipBase):
+class ClipGradByNorm(ClipGradBase):
     """
     Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .

@@ -264,8 +248,8 @@ class GradientClipByNorm(GradientClipBase):
     - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.
 
-    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
-    is not None, then only part of gradients can be selected for gradient clipping.
+    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
+    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
 
     Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
     (for example: :ref:`api_paddle_optimizer_SGD`).

@@ -287,11 +271,12 @@ class GradientClipByNorm(GradientClipBase):
     .. math::
 
         norm(X) = ( \\sum_{i=1}^{n}|x\_i|^2)^{ \\frac{1}{2}}
 
+    Note:
+        ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+
     Args:
         clip_norm(float): The maximum norm value.
-        need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool``
-            (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None,
-            and gradients of all parameters in the network will be clipped.
 
     Examples:
         .. code-block:: python

@@ -299,29 +284,20 @@ class GradientClipByNorm(GradientClipBase):
            import paddle
 
            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-           linear = paddle.nn.Linear(10, 10)
+           linear = paddle.nn.Linear(in_features=10, out_features=10,
+                                     weight_attr=paddle.ParamAttr(need_clip=True),
+                                     bias_attr=paddle.ParamAttr(need_clip=False))
           out = linear(x)
           loss = paddle.mean(out)
           loss.backward()
 
-          # clip all parameters in network:
-          clip = paddle.nn.GradientClipByNorm(clip_norm=1.0)
-
-          # clip a part of parameters in network: (e.g. linear_0.w_0)
-          # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
-          # def fileter_func(ParamBase):
-          # # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
-          # return ParamBase.name == "linear_0.w_0"
-          # # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
-          # return ParamBase.name == linear.weight.name
-          # clip = paddle.nn.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func)
-
+          clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
           sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
           sdg.step()
     """
 
-    def __init__(self, clip_norm, need_clip=None):
-        super(GradientClipByNorm, self).__init__(need_clip)
+    def __init__(self, clip_norm):
+        super(ClipGradByNorm, self).__init__()
         self.clip_norm = float(clip_norm)
 
     def __str__(self):

@@ -333,7 +309,7 @@ class GradientClipByNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue
             new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)

@@ -347,8 +323,7 @@ class GradientClipByNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(
-                    p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue

@@ -367,7 +342,7 @@ class GradientClipByNorm(GradientClipBase):
         return param, new_grad
 
 
-class GradientClipByGlobalNorm(GradientClipBase):
+class ClipGradByGlobalNorm(ClipGradBase):
     """
     Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
    :math:`t\_list` , and limit it to ``clip_norm`` .

@@ -376,8 +351,8 @@ class GradientClipByGlobalNorm(GradientClipBase):
     - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.
 
-    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
-    is not None, then only part of gradients can be selected for gradient clipping.
+    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
+    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
 
     Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
     (for example: :ref:`api_paddle_optimizer_SGD`).

@@ -394,12 +369,13 @@ class GradientClipByGlobalNorm(GradientClipBase):
         global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
 
+    Note:
+        ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+
     Args:
         clip_norm (float): The maximum norm value.
-        group_name (str, optional): The group name for this clip. Default value is ``default_group``
-        need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool``
-            (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None,
-            and gradients of all parameters in the network will be clipped.
+        group_name (str, optional): The group name for this clip. Default value is ``default_group``.
 
     Examples:
         .. code-block:: python

@@ -407,29 +383,20 @@ class GradientClipByGlobalNorm(GradientClipBase):
            import paddle
 
            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-           linear = paddle.nn.Linear(10, 10)
+           linear = paddle.nn.Linear(in_features=10, out_features=10,
+                                     weight_attr=paddle.ParamAttr(need_clip=True),
+                                     bias_attr=paddle.ParamAttr(need_clip=False))
           out = linear(x)
           loss = paddle.mean(out)
           loss.backward()
 
-          # clip all parameters in network:
-          clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0)
-
-          # clip a part of parameters in network: (e.g. linear_0.w_0)
-          # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
-          # def fileter_func(ParamBase):
-          # # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
-          # return ParamBase.name == "linear_0.w_0"
-          # # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
-          # return ParamBase.name == linear.weight.name
-          # clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func)
-
+          clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
           sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
           sdg.step()
     """
 
-    def __init__(self, clip_norm, group_name="default_group", need_clip=None):
-        super(GradientClipByGlobalNorm, self).__init__(need_clip)
+    def __init__(self, clip_norm, group_name="default_group"):
+        super(ClipGradByGlobalNorm, self).__init__()
         self.clip_norm = float(clip_norm)
         self.group_name = group_name

@@ -443,7 +410,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 continue
             merge_grad = g
             if g.type == core.VarDesc.VarType.SELECTED_ROWS:

@@ -469,7 +436,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue
             new_grad = layers.elementwise_mul(x=g, y=clip_var)

@@ -484,8 +451,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(
-                    p):
+            if getattr(p, 'need_clip', True) is False:
                 continue
             merge_grad = g
             with p.block.program._optimized_guard([p, g]):

@@ -518,8 +484,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(
-                    p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue

@@ -670,9 +635,9 @@ def set_gradient_clip(clip, param_list=None, program=None):
         "This method can reduce the mistakes, please "
         "refer to documention of 'optimizer'.")
 
-    if not isinstance(clip, GradientClipBase):
+    if not isinstance(clip, ClipGradBase):
         raise TypeError(
-            "'clip' should be an instance of GradientClipBase's derived class")
+            "'clip' should be an instance of ClipGradBase's derived class")
     if program is None:
         program = framework.default_main_program()

@@ -708,7 +673,7 @@ def append_gradient_clip_ops(param_grads):
         clip_attr = getattr(p, 'gradient_clip_attr', None)
         if clip_attr is None:
             return param_grads
-        if not isinstance(clip_attr, GradientClipBase):
+        if not isinstance(clip_attr, ClipGradBase):
             raise TypeError(
                 "clip attribute should be an instance of GradientClipBase")

@@ -754,6 +719,7 @@ def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
                 op._set_attr('op_role_var', correct_p_g)
 
 
-ClipByValue = GradientClipByValue
-ClipByNorm = GradientClipByNorm
-ClipByGlobalNorm = GradientClipByGlobalNorm
+GradientClipBase = ClipGradBase
+GradientClipByValue = ClipGradByValue
+GradientClipByNorm = ClipGradByNorm
+GradientClipByGlobalNorm = ClipGradByGlobalNorm
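To summarize the new surface introduced above: the clip classes lose their need_clip callable, and the per-parameter flag in ParamAttr decides the clip scope instead. A minimal usage sketch, assuming a Paddle build that contains this commit (layer sizes and hyperparameters here are arbitrary, taken from the docstring examples):

    import paddle

    # Bias is flagged "do not clip" via ParamAttr; the weight keeps the default need_clip=True.
    linear = paddle.nn.Linear(in_features=10, out_features=10,
                              weight_attr=paddle.ParamAttr(need_clip=True),
                              bias_attr=paddle.ParamAttr(need_clip=False))

    x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
    loss = paddle.mean(linear(x))
    loss.backward()

    # The clip object itself no longer takes need_clip; it clips every
    # gradient whose parameter was not marked need_clip=False.
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
    sgd = paddle.optimizer.SGD(learning_rate=0.1,
                               parameters=linear.parameters(),
                               grad_clip=clip)
    sgd.step()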
python/paddle/fluid/framework.py (view file @ 994438b1)

@@ -5123,6 +5123,8 @@ class Parameter(Variable):
             be applied on the parameter. Default: None
         do_model_average(bool): True if the model average strategy will
             be applied on this parameter.
+        need_clip (bool): Whether the parameter gradient need to be cliped
+            in optimizer. Default is True.
     """
 
     def __init__(self,

@@ -5162,6 +5164,8 @@ class Parameter(Variable):
         self.do_model_average = kwargs.get('do_model_average', None)
 
+        self.need_clip = kwargs.get('need_clip', True)
+
         self.is_distributed = False
 
     def __str__(self):

@@ -5194,7 +5198,7 @@ class Parameter(Variable):
         if with_details:
             res_str = Variable.to_string(self, throw_on_error, True)
             additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "do_model_average")
+                               "do_model_average", "need_clip")
             for attr_name in additional_attr:
                 res_str += "%s: %s\n" % (attr_name,
                                          cpt.to_text(getattr(self, attr_name)))

@@ -5226,6 +5230,8 @@ class ParamBase(core.VarBase):
             be applied on the ParamBase. Default: None
         do_model_average(bool): True if the model average strategy will
             be applied on this ParamBase.
+        need_clip (bool): Whether the parameter gradient need to be cliped
+            in optimizer. Default is True.
     """
 
     @dygraph_only

@@ -5265,6 +5271,8 @@ class ParamBase(core.VarBase):
         self.do_model_average = kwargs.get('do_model_average', None)
 
+        self.need_clip = kwargs.get('need_clip', True)
+
         self.is_distributed = False
 
         # self.block = default_main_program().global_block()
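The need_clip attribute stored on Parameter and ParamBase above is consumed by the clip classes through getattr(p, 'need_clip', True), so parameters created before this change (or by custom code that never sets the attribute) default to being clipped. A minimal standalone sketch of that filtering pattern, using a stand-in clip function instead of Paddle ops:

    def clip_params_grads(params_grads, clip_one_grad):
        # Mirrors the loop used in _dygraph_clip/_static_clip:
        # None gradients are dropped, and gradients of parameters flagged
        # need_clip=False are passed through untouched.
        params_and_grads = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            params_and_grads.append((p, clip_one_grad(g)))
        return params_and_grads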
python/paddle/fluid/param_attr.py (view file @ 994438b1)

@@ -36,8 +36,8 @@ class ParamAttr(object):
     Note:
         ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
-        It is recommended to set ``grad_clip`` in ``optimizer`` to clip gradient.
-        There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+        There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
         :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
 
     Parameters:

@@ -57,6 +57,7 @@ class ParamAttr(object):
         trainable (bool): Whether this parameter is trainable. Default True.
         do_model_average (bool): Whether this parameter should do model average
             when model average is enabled. Default False.
+        need_clip (bool): Whether the parameter gradient need to be cliped in optimizer. Default is True.
 
     Examples:
         .. code-block:: python

@@ -78,7 +79,8 @@ class ParamAttr(object):
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 do_model_average=True):
+                 do_model_average=True,
+                 need_clip=True):
         if sys.version_info.major == 2:
             check_type(name, "name", (str, type(None), unicode), "ParamAttr")

@@ -87,6 +89,7 @@ class ParamAttr(object):
         check_type(learning_rate, "learning_rate", (float, int), "ParamAttr")
         check_type(trainable, "trainable", (bool), "ParamAttr")
         check_type(do_model_average, "do_model_average", (bool), "ParamAttr")
+        check_type(need_clip, "need_clip", (bool), "ParamAttr")
         check_type(initializer, "initializer", (Initializer, type(None)),
                    "ParamAttr")
         check_type(regularizer, "regularizer",

@@ -101,6 +104,7 @@ class ParamAttr(object):
         self.regularizer = regularizer
         self.trainable = trainable
         self.do_model_average = do_model_average
+        self.need_clip = need_clip
 
     def _set_default_initializer(self, initializer):
         """

@@ -197,7 +201,8 @@ class ParamAttr(object):
             },
             'regularizer': self.regularizer,
             'trainable': self.trainable,
-            'do_model_average': self.do_model_average
+            'do_model_average': self.do_model_average,
+            'need_clip': self.need_clip
         }
         if with_initializer:
             kwargs['initializer'] = self.initializer

@@ -219,9 +224,9 @@ class WeightNormParamAttr(ParamAttr):
    <https://arxiv.org/pdf/1602.07868.pdf>`_.
 
     Note:
-        ``gradient_clip`` of ``WeightNormParamAttr`` HAS BEEN DEPRECATED since 2.0.
-        It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient.
-        There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
+        ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+        There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
         :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .

@@ -248,6 +253,7 @@ class WeightNormParamAttr(ParamAttr):
         trainable(bool, optional): Whether this parameter is trainable. Default True.
         do_model_average(bool, optional): Whether this parameter should do model average.
             Default False.
+        need_clip (bool, optional): Whether the parameter gradient need to be cliped in optimizer. Default is True.
 
     Examples:
         .. code-block:: python

@@ -267,7 +273,8 @@ class WeightNormParamAttr(ParamAttr):
                                                   learning_rate=1.0,
                                                   regularizer=paddle.regularizer.L2Decay(0.1),
                                                   trainable=True,
-                                                  do_model_average=False))
+                                                  do_model_average=False,
+                                                  need_clip=True))
 
     """
     # List to record the parameters reparameterized by weight normalization.

@@ -283,12 +290,14 @@ class WeightNormParamAttr(ParamAttr):
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 do_model_average=False):
+                 do_model_average=False,
+                 need_clip=True):
         super(WeightNormParamAttr, self).__init__(
             name=name,
             initializer=initializer,
             learning_rate=learning_rate,
             regularizer=regularizer,
             trainable=trainable,
-            do_model_average=do_model_average)
+            do_model_average=do_model_average,
+            need_clip=need_clip)
         self.dim = dim
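A short sketch of the extended ParamAttr surface shown above (argument names come from the diff; the attribute names and values used here are illustrative placeholders):

    import paddle

    weight_attr = paddle.ParamAttr(
        name="fc_weight",
        learning_rate=1.0,
        trainable=True,
        need_clip=True)     # this parameter's gradient will be clipped

    bias_attr = paddle.ParamAttr(
        name="fc_bias",
        need_clip=False)    # this gradient is left unclipped

    linear = paddle.nn.Linear(10, 10, weight_attr=weight_attr, bias_attr=bias_attr)

WeightNormParamAttr accepts the same need_clip keyword and simply forwards it to ParamAttr.__init__, as the last hunk shows.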
python/paddle/fluid/tests/unittests/test_gradient_clip.py (view file @ 994438b1)

@@ -185,12 +185,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # invoke 'set_gradient_clip' in a wrong order
     def test_wrong_API_order(self):
         def backward_func(cost):
-            # no clip gradient
-            def fileter_func(param):
-                return param.name == "fc.w_0"
-
-            clip = fluid.clip.GradientClipByGlobalNorm(
-                clip_norm=5.0, need_clip=fileter_func)
+            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
             fluid.clip.set_gradient_clip(clip)
             sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01,
                                                 grad_clip=clip)

@@ -205,11 +200,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # if grad is None or not need clip
     def test_none_grad(self):
-        def fileter_func(param):
-            return param.name == "x"
-
-        clip = fluid.clip.GradientClipByGlobalNorm(
-            self.clip_norm, need_clip=fileter_func)
+        clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm)
         x = fluid.default_main_program().global_block().create_parameter(
             name="x", shape=[2, 3], dtype="float32")
         y = fluid.default_main_program().global_block().create_parameter(

@@ -228,11 +219,6 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # raise typeError
     def test_tpyeError(self):
-        # the type of need_clip must be an funciton
-        with self.assertRaises(TypeError):
-            clip = fluid.clip.GradientClipByGlobalNorm(
-                clip_norm=self.clip_norm, need_clip="test")
-
         # the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class
         with self.assertRaises(TypeError):
             sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1,

@@ -264,26 +250,22 @@ class TestGradientClipByNorm(TestGradientClip):
     # if grad is None or not need clip
     def test_none_grad(self):
-        def fileter_func(param):
-            return param.name == "z"
-
-        clip = fluid.clip.GradientClipByNorm(
-            self.clip_norm, need_clip=fileter_func)
+        clip = fluid.clip.GradientClipByNorm(self.clip_norm)
         x = fluid.default_main_program().global_block().create_parameter(
-            name="x", shape=[2, 3], dtype="float32")
+            name="x", shape=[2, 3], dtype="float32", need_clip=False)
         y = fluid.default_main_program().global_block().create_parameter(
-            name="y", shape=[2, 3], dtype="float32")
+            name="y", shape=[2, 3], dtype="float32", need_clip=False)
 
         # (x, None) should not be returned
         params_grads = [(x, None), (x, y)]
         params_grads = clip(params_grads)
         self.assertTrue(
             len(clip(params_grads)) == 1,
-            "ClipByNorm: when grad is None, it shouldn't be returned by gradient clip!"
+            "ClipGradByNorm: when grad is None, it shouldn't be returned by gradient clip!"
         )
         self.assertTrue(
             params_grads[0][1].name == 'y',
-            "ClipByNorm: grad should not be clipped when filtered out!")
+            "ClipGradByNorm: grad should not be clipped when filtered out!")
 
 
 class TestGradientClipByValue(TestGradientClip):

@@ -312,26 +294,22 @@ class TestGradientClipByValue(TestGradientClip):
     # if grad is None or not need clip
     def test_none_grad(self):
-        def fileter_func(param):
-            return param.name == "z"
-
-        clip = fluid.clip.GradientClipByValue(
-            self.max, self.min, need_clip=fileter_func)
+        clip = fluid.clip.GradientClipByValue(self.max, self.min)
         x = fluid.default_main_program().global_block().create_parameter(
-            name="x", shape=[2, 3], dtype="float32")
+            name="x", shape=[2, 3], dtype="float32", need_clip=False)
         y = fluid.default_main_program().global_block().create_parameter(
-            name="y", shape=[2, 3], dtype="float32")
+            name="y", shape=[2, 3], dtype="float32", need_clip=False)
 
         # (x, None) should not be returned
         params_grads = [(x, None), (x, y)]
         params_grads = clip(params_grads)
         self.assertTrue(
             len(clip(params_grads)) == 1,
-            "ClipByValue: when grad is None, it shouldn't be returned by gradient clip!"
+            "ClipGradByValue: when grad is None, it shouldn't be returned by gradient clip!"
        )
         self.assertTrue(
             params_grads[0][1].name == 'y',
-            "ClipByValue: grad should not be clipped when filtered out!")
+            "ClipGradByValue: grad should not be clipped when filtered out!")
 
 
 class TestDygraphGradientClip(unittest.TestCase):

@@ -355,13 +333,9 @@ class TestDygraphGradientClip(unittest.TestCase):
 
 class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
     def setUp(self):
-        # only clip gradient of x (ParamBase)
-        def fileter_func(param):
-            return param.name == "x"
-
         self.clip_norm = 0.8
         self.clip1 = fluid.clip.GradientClipByGlobalNorm(
-            clip_norm=self.clip_norm, need_clip=fileter_func)
+            clip_norm=self.clip_norm)
         self.clip2 = fluid.clip.GradientClipByGlobalNorm(
             clip_norm=self.clip_norm)

@@ -401,13 +375,8 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
 
 class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
     def setUp(self):
-        # only clip gradient of linear_0.w_0 (ParamBase)
-        def fileter_func(param):
-            return param.name == "linear_0.w_0"
-
         self.clip_norm = 0.8
-        self.clip = fluid.clip.GradientClipByNorm(
-            clip_norm=self.clip_norm, need_clip=fileter_func)
+        self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
 
     def check_clip_result(self, loss, optimizer):
         # if grad is None

@@ -435,14 +404,9 @@ class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
 
 class TestDygraphGradientClipByValue(TestDygraphGradientClip):
     def setUp(self):
-        # only clip gradient of linear_0.w_0 (ParamBase)
-        def fileter_func(param):
-            return param.name == "linear_0.w_0"
-
         self.max = 0.2
         self.min = 0.1
-        self.clip = fluid.clip.GradientClipByValue(
-            max=self.max, min=self.min, need_clip=fileter_func)
+        self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
 
     def check_clip_result(self, loss, optimizer):
         # if grad is None
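The updated tests exercise the new flag through static-graph parameters created with need_clip=False. A condensed sketch of that pattern, assuming a build with this commit (the old GradientClip* names still resolve through the aliases added in clip.py, the clip_norm value is arbitrary, and the enable_static() call is only needed when dygraph mode is the default):

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()  # assumption: run the static-graph clip path, as the unit test does

    clip = fluid.clip.GradientClipByNorm(clip_norm=1.0)
    x = fluid.default_main_program().global_block().create_parameter(
        name="x", shape=[2, 3], dtype="float32", need_clip=False)
    y = fluid.default_main_program().global_block().create_parameter(
        name="y", shape=[2, 3], dtype="float32", need_clip=False)

    # (x, None) is dropped; (x, y) passes through unclipped because need_clip=False.
    params_grads = clip([(x, None), (x, y)])
    assert len(params_grads) == 1 and params_grads[0][1].name == "y"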
python/paddle/nn/__init__.py (view file @ 994438b1)

@@ -31,9 +31,9 @@ __all__ += rnn.__all__
 __all__ += weight_norm_hook.__all__
 
 # TODO: define alias in nn directory
-from .clip import GradientClipByGlobalNorm  #DEFINE_ALIAS
-from .clip import GradientClipByNorm  #DEFINE_ALIAS
-from .clip import GradientClipByValue  #DEFINE_ALIAS
+from .clip import ClipGradByGlobalNorm  #DEFINE_ALIAS
+from .clip import ClipGradByNorm  #DEFINE_ALIAS
+from .clip import ClipGradByValue  #DEFINE_ALIAS
 # from .clip import set_gradient_clip  #DEFINE_ALIAS
 from .clip import clip  #DEFINE_ALIAS
 from .clip import clip_by_norm  #DEFINE_ALIAS

@@ -51,13 +51,13 @@ from .decode import beam_search_decode #DEFINE_ALIAS
 # from .decode import dynamic_decode  #DEFINE_ALIAS
 from .decode import gather_tree  #DEFINE_ALIAS
 # from .input import Input  #DEFINE_ALIAS
-from .layer.activation import ELU
-from .layer.activation import GELU
-from .layer.activation import Tanh
-from .layer.activation import Hardshrink
-from .layer.activation import Hardtanh
-from .layer.activation import PReLU
-from .layer.activation import ReLU
+from .layer.activation import ELU  #DEFINE_ALIAS
+from .layer.activation import GELU  #DEFINE_ALIAS
+from .layer.activation import Tanh  #DEFINE_ALIAS
+from .layer.activation import Hardshrink  #DEFINE_ALIAS
+from .layer.activation import Hardtanh  #DEFINE_ALIAS
+from .layer.activation import PReLU  #DEFINE_ALIAS
+from .layer.activation import ReLU  #DEFINE_ALIAS
 from .layer.activation import ReLU6  #DEFINE_ALIAS
 from .layer.activation import SELU  #DEFINE_ALIAS
 from .layer.activation import LeakyReLU  #DEFINE_ALIAS
python/paddle/nn/clip.py (view file @ 994438b1)

@@ -13,18 +13,18 @@
 # limitations under the License.
 
 # TODO: define the functions to clip gradient of parameter
-from ..fluid.clip import GradientClipByGlobalNorm  #DEFINE_ALIAS
-from ..fluid.clip import GradientClipByNorm  #DEFINE_ALIAS
-from ..fluid.clip import GradientClipByValue  #DEFINE_ALIAS
+from ..fluid.clip import ClipGradByGlobalNorm  #DEFINE_ALIAS
+from ..fluid.clip import ClipGradByNorm  #DEFINE_ALIAS
+from ..fluid.clip import ClipGradByValue  #DEFINE_ALIAS
 from ..fluid.layers import clip  #DEFINE_ALIAS
 from ..fluid.layers import clip_by_norm  #DEFINE_ALIAS
 
 __all__ = [
     #       'ErrorClipByValue',
-    'GradientClipByGlobalNorm',
-    'GradientClipByNorm',
-    'GradientClipByValue',
+    'ClipGradByGlobalNorm',
+    'ClipGradByNorm',
+    'ClipGradByValue',
     #       'set_gradient_clip',
     'clip',
     'clip_by_norm'
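Because fluid/clip.py keeps module-level aliases (GradientClipByValue = ClipGradByValue, and so on) while paddle.nn re-exports the new names, both spellings resolve to the same classes after this change. A quick sanity check, assuming a Paddle build that contains this commit:

    import paddle
    import paddle.fluid as fluid

    # New public names are exported from paddle.nn and come from fluid.clip.
    assert paddle.nn.ClipGradByValue is fluid.clip.ClipGradByValue

    # Old names remain importable as aliases of the renamed classes.
    assert fluid.clip.GradientClipByValue is fluid.clip.ClipGradByValue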