Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
2bca3295
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
2bca3295
编写于
4月 16, 2020
作者:
Z
Zhou Wei
提交者:
GitHub
4月 16, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
move the initialize position of grad_clip to optimizer(__init__),and speed up clip (#23839)
上级
21fc789e
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
198 addition
and
127 deletion
+198
-127
python/paddle/fluid/clip.py
python/paddle/fluid/clip.py
+52
-41
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+97
-39
python/paddle/fluid/param_attr.py
python/paddle/fluid/param_attr.py
+1
-1
python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
+13
-2
python/paddle/fluid/tests/unittests/test_gradient_clip.py
python/paddle/fluid/tests/unittests/test_gradient_clip.py
+23
-34
python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
...addle/fluid/tests/unittests/test_imperative_auto_prune.py
+6
-6
python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
...le/fluid/tests/unittests/test_imperative_selected_rows.py
+6
-4
未找到文件。
python/paddle/fluid/clip.py
浏览文件 @
2bca3295
...
...
@@ -137,9 +137,6 @@ class GradientClipBase(object):
raise
NotImplementedError
def
__call__
(
self
,
params_grads
):
assert
len
(
params_grads
)
>
0
,
"The number of trainable parameters should be greater than 0."
if
framework
.
in_dygraph_mode
():
return
self
.
_dygraph_clip
(
params_grads
)
else
:
...
...
@@ -147,7 +144,7 @@ class GradientClipBase(object):
if
getattr
(
p
,
'gradient_clip_attr'
,
None
)
is
not
None
:
warnings
.
warn
(
"'set_gradient_clip' will be ineffective, because you have "
"
pass 'grad_clip' into 'minimize
'. So, 'set_gradient_clip' "
"
set 'grad_clip' in 'optimizer
'. So, 'set_gradient_clip' "
"is redundant and you can remove it."
)
break
return
self
.
_static_clip
(
params_grads
)
...
...
@@ -170,7 +167,7 @@ class GradientClipByValue(GradientClipBase):
The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
is not None, then only part of gradients can be selected for gradient clipping.
Gradient clip will takes effect after being set in ``optimizer
.minimize(grad_clip)
`` , see the document ``optimizer``
Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
(for example: :ref:`api_fluid_optimizer_SGDOptimizer`).
Args:
...
...
@@ -208,8 +205,8 @@ class GradientClipByValue(GradientClipBase):
# return Parameter.name=="fc_0.w_0"
# clip = fluid.clip.GradientClipByValue(min=-1, max=1, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1)
sgd_optimizer.minimize(loss
, grad_clip=clip
)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1
, grad_clip=clip
)
sgd_optimizer.minimize(loss)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
...
...
@@ -242,8 +239,8 @@ class GradientClipByValue(GradientClipBase):
# clip = fluid.clip.GradientClipByValue(min=-1, max=1, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=0.1, parameter_list=linear.parameters())
sgd_optimizer.minimize(loss
, grad_clip=clip
)
learning_rate=0.1, parameter_list=linear.parameters()
, grad_clip=clip
)
sgd_optimizer.minimize(loss)
"""
def
__init__
(
self
,
max
,
min
=
None
,
need_clip
=
None
):
...
...
@@ -272,6 +269,7 @@ class GradientClipByValue(GradientClipBase):
def
_static_clip
(
self
,
params_grads
):
params_and_grads
=
[]
param_new_grad_name_dict
=
dict
()
with
framework
.
name_scope
(
'gradient_clip'
):
for
p
,
g
in
params_grads
:
if
g
is
None
:
...
...
@@ -284,7 +282,8 @@ class GradientClipByValue(GradientClipBase):
with
p
.
block
.
program
.
_optimized_guard
([
p
,
g
]):
new_grad
=
layers
.
clip
(
x
=
g
,
min
=
self
.
min
,
max
=
self
.
max
)
params_and_grads
.
append
((
p
,
new_grad
))
_correct_clip_op_role_var
(
params_and_grads
)
param_new_grad_name_dict
[
p
.
name
]
=
new_grad
.
name
_correct_clip_op_role_var
(
params_and_grads
,
param_new_grad_name_dict
)
return
params_and_grads
def
_process_context
(
self
,
context
,
param
,
grad
):
...
...
@@ -306,7 +305,7 @@ class GradientClipByNorm(GradientClipBase):
The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
is not None, then only part of gradients can be selected for gradient clipping.
Gradient clip will takes effect after being set in ``optimizer
.minimize(grad_clip)
`` , see the document ``optimizer``
Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
(for example: :ref:`api_fluid_optimizer_SGDOptimizer`).
The clipping formula is:
...
...
@@ -359,8 +358,8 @@ class GradientClipByNorm(GradientClipBase):
# return Parameter.name=="fc_0.w_0"
# clip = fluid.clip.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1)
sgd_optimizer.minimize(loss
, grad_clip=clip
)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1
, grad_clip=clip
)
sgd_optimizer.minimize(loss)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
...
...
@@ -394,8 +393,8 @@ class GradientClipByNorm(GradientClipBase):
# clip = fluid.clip.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=0.1, parameter_list=linear.parameters())
sgd_optimizer.minimize(loss
, grad_clip=clip
)
learning_rate=0.1, parameter_list=linear.parameters()
, grad_clip=clip
)
sgd_optimizer.minimize(loss)
"""
...
...
@@ -422,6 +421,7 @@ class GradientClipByNorm(GradientClipBase):
def
_static_clip
(
self
,
params_grads
):
params_and_grads
=
[]
with
framework
.
name_scope
(
'gradient_clip'
):
param_new_grad_name_dict
=
dict
()
for
p
,
g
in
params_grads
:
if
g
is
None
:
continue
...
...
@@ -432,8 +432,9 @@ class GradientClipByNorm(GradientClipBase):
with
p
.
block
.
program
.
_optimized_guard
([
p
,
g
]):
new_grad
=
layers
.
clip_by_norm
(
x
=
g
,
max_norm
=
self
.
clip_norm
)
param_new_grad_name_dict
[
p
.
name
]
=
new_grad
.
name
params_and_grads
.
append
((
p
,
new_grad
))
_correct_clip_op_role_var
(
params_and_grads
)
_correct_clip_op_role_var
(
params_and_grads
,
param_new_grad_name_dict
)
return
params_and_grads
def
_process_context
(
self
,
context
,
param
,
grad
):
...
...
@@ -456,7 +457,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
is not None, then only part of gradients can be selected for gradient clipping.
Gradient clip will takes effect after being set in ``optimizer
.minimize(grad_clip)
`` , see the document ``optimizer``
Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
(for example: :ref:`api_fluid_optimizer_SGDOptimizer`).
The clipping formula is:
...
...
@@ -505,8 +506,8 @@ class GradientClipByGlobalNorm(GradientClipBase):
# return Parameter.name=="fc_0.w_0"
# clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1)
sgd_optimizer.minimize(loss
, grad_clip=clip
)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1
, grad_clip=clip
)
sgd_optimizer.minimize(loss)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
...
...
@@ -539,8 +540,8 @@ class GradientClipByGlobalNorm(GradientClipBase):
# clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=0.1, parameter_list=linear.parameters())
sgd_optimizer.minimize(loss
, grad_clip=clip
)
learning_rate=0.1, parameter_list=linear.parameters()
, grad_clip=clip
)
sgd_optimizer.minimize(loss)
"""
...
...
@@ -628,6 +629,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
y
=
layers
.
elementwise_max
(
x
=
max_global_norm
,
y
=
global_norm_var
))
param_new_grad_name_dict
=
dict
()
for
p
,
g
in
params_grads
:
if
g
is
None
:
continue
...
...
@@ -638,9 +640,10 @@ class GradientClipByGlobalNorm(GradientClipBase):
with
p
.
block
.
program
.
_optimized_guard
([
p
,
g
]):
new_grad
=
layers
.
elementwise_mul
(
x
=
g
,
y
=
scale_var
)
param_new_grad_name_dict
[
p
.
name
]
=
new_grad
.
name
params_and_grads
.
append
((
p
,
new_grad
))
_correct_clip_op_role_var
(
params_and_grads
)
_correct_clip_op_role_var
(
params_and_grads
,
param_new_grad_name_dict
)
return
params_and_grads
def
_process_context
(
self
,
context
,
param
,
grad
):
...
...
@@ -692,9 +695,10 @@ def set_gradient_clip(clip, param_list=None, program=None):
This API must be used after building network, and before ``minimize`` ,
and it may be removed in future releases, so it is not recommended.
It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient.
There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
:ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
this is a better method to clip gradient. There are three clipping strategies:
:ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` .
To specify parameters that require gradient clip.
...
...
@@ -757,7 +761,7 @@ def set_gradient_clip(clip, param_list=None, program=None):
sgd = fluid.optimizer.SGD(learning_rate=1e-3)
sgd.minimize(loss)
# network 4: use 'set_gradient_clip' and '
min
imize(grad_clip=clip)' together
# network 4: use 'set_gradient_clip' and '
opt
imize(grad_clip=clip)' together
with fluid.program_guard(fluid.Program(), fluid.Program()):
loss = network()
clip1 = fluid.clip.GradientClipByValue(min=-1.0, max=1.0)
...
...
@@ -765,8 +769,8 @@ def set_gradient_clip(clip, param_list=None, program=None):
# Set the gradient clipping strategy: clip1
fluid.clip.set_gradient_clip(clip1)
# Set the gradient clipping strategy: clip2
sgd = fluid.optimizer.SGD(learning_rate=1e-3)
sgd.minimize(loss
, grad_clip=clip2
)
sgd = fluid.optimizer.SGD(learning_rate=1e-3
, grad_clip=clip2
)
sgd.minimize(loss)
# 'set_gradient_clip' will not take effect when setting has a conflict,
# and the gradient clipping strategy will be 'clip2'
...
...
@@ -774,10 +778,10 @@ def set_gradient_clip(clip, param_list=None, program=None):
"""
warnings
.
warn
(
"Caution! 'set_gradient_clip' is not recommended "
"and may be deprecated in future! "
"We recommend a new strategy:
clip gradient by
"
"
'optimizer.minimize(loss, grad_clip=clip)
'. "
"We recommend a new strategy:
set 'grad_clip'
"
"
when initializing the 'optimizer
'. "
"This method can reduce the mistakes, please "
"
see documention of 'optimzier.minimize
'."
)
"
refer to documention of 'optimizer
'."
)
if
not
isinstance
(
clip
,
GradientClipBase
):
raise
TypeError
(
...
...
@@ -824,33 +828,40 @@ def append_gradient_clip_ops(param_grads):
clip_attr
.
_process_context
(
context
=
context
,
param
=
p
,
grad
=
g
)
res
=
[]
param_new_grad_name_dict
=
dict
()
for
p
,
g
in
param_grads
:
if
g
is
None
:
continue
with
p
.
block
.
program
.
_optimized_guard
(
[
p
,
g
]),
framework
.
name_scope
(
'graident_clip_@CLIP'
):
param
,
new_grad
=
clip_attr
.
_create_operators
(
param
=
p
,
grad
=
g
)
param_new_grad_name_dict
[
param
.
name
]
=
new_grad
.
name
res
.
append
([
param
,
new_grad
])
_correct_clip_op_role_var
(
res
)
_correct_clip_op_role_var
(
res
,
param_new_grad_name_dict
)
return
res
# change wrong mapping relation between param & grad in clip op
def
_correct_clip_op_role_var
(
params_grads
):
def
_correct_clip_op_role_var
(
params_grads
,
param_new_grad_name_dict
):
block_id_list
=
[]
if
len
(
param_new_grad_name_dict
)
==
0
:
return
for
param
,
grad
in
params_grads
:
if
grad
is
None
:
continue
block_id
=
param
.
block
.
idx
if
block_id
in
block_id_list
:
continue
block_id_list
.
append
(
block_id
)
for
op
in
param
.
block
.
program
.
global_block
().
ops
:
if
'op_namescope'
in
op
.
all_attrs
()
and
"gradient_clip"
in
op
.
attr
(
"op_namescope"
):
if
op
.
attr
(
'op_role_var'
):
param_name
=
op
.
attr
(
'op_role_var'
)[
0
]
index
=
0
for
i
in
range
(
len
(
params_grads
)):
if
params_grads
[
i
][
0
].
name
==
param_name
:
index
=
i
correct_p_g
=
[
param_name
,
params_grads
[
index
][
1
].
name
]
"op_namescope"
)
and
op
.
attr
(
'op_role_var'
):
param_name
=
op
.
attr
(
'op_role_var'
)[
0
]
if
param_name
in
param_new_grad_name_dict
:
correct_p_g
=
[
param_name
,
param_new_grad_name_dict
[
param_name
]
]
op
.
_set_attr
(
'op_role_var'
,
correct_p_g
)
...
...
python/paddle/fluid/optimizer.py
浏览文件 @
2bca3295
...
...
@@ -24,7 +24,7 @@ from . import framework
from
.
import
layers
from
.
import
unique_name
from
.backward
import
append_backward
,
_some_in_set_
,
_append_grad_suffix_
,
_get_no_grad_set_name
from
.clip
import
GradientClipBase
,
error_clip_callback
,
append_gradient_clip_ops
from
.clip
import
GradientClipBase
,
GradientClipByNorm
,
error_clip_callback
,
append_gradient_clip_ops
from
.framework
import
program_guard
from
.initializer
import
Constant
from
.layer_helper
import
LayerHelper
...
...
@@ -64,6 +64,7 @@ class Optimizer(object):
learning_rate
,
parameter_list
=
None
,
regularization
=
None
,
grad_clip
=
None
,
name
=
None
):
self
.
_parameter_list
=
parameter_list
if
framework
.
in_dygraph_mode
():
...
...
@@ -88,7 +89,13 @@ class Optimizer(object):
type
(
learning_rate
))
self
.
_name
=
name
if
grad_clip
is
not
None
:
if
not
isinstance
(
grad_clip
,
GradientClipBase
):
raise
TypeError
(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
self
.
regularization
=
regularization
self
.
_grad_clip
=
grad_clip
self
.
_learning_rate
=
learning_rate
# the learning rate type should be inferenced from loss
self
.
_dtype
=
None
...
...
@@ -107,8 +114,6 @@ class Optimizer(object):
self
.
_opti_name_list
=
[]
self
.
_accumulators_holder
=
{}
self
.
_param_device_map
=
dict
()
# if pass grad_clip into minimize, it will not be None
self
.
_grad_clip
=
None
@
framework
.
dygraph_only
def
state_dict
(
self
):
...
...
@@ -787,8 +792,7 @@ class Optimizer(object):
loss
,
startup_program
=
None
,
parameter_list
=
None
,
no_grad_set
=
None
,
grad_clip
=
None
):
no_grad_set
=
None
):
"""
Add operations to minimize ``loss`` by updating ``parameter_list``.
...
...
@@ -801,12 +805,7 @@ class Optimizer(object):
to minimize ``loss``. The default value is None, at this time all parameters
will be updated.
no_grad_set (set, optional): Set of ``Variable`` or ``Variable.name`` that don't need
to be updated. The default value is None.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no
gradient clipping.
to be updated. The default value is None.
Returns:
tuple: tuple (optimize_ops, params_grads), A list of operators appended
...
...
@@ -820,12 +819,7 @@ class Optimizer(object):
Please refer to the example of current Optimizer.
"""
assert
isinstance
(
loss
,
Variable
),
"The loss should be an Variable."
if
grad_clip
is
not
None
:
if
not
isinstance
(
grad_clip
,
GradientClipBase
):
raise
TypeError
(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
self
.
_grad_clip
=
grad_clip
parameter_list
=
parameter_list
if
parameter_list
\
else
self
.
_parameter_list
params_grads
=
self
.
backward
(
...
...
@@ -859,6 +853,10 @@ class SGDOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be
\
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.
\
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information.
\
For details, please refer to :ref:`api_guide_Name`. Default is None.
...
...
@@ -896,12 +894,14 @@ class SGDOptimizer(Optimizer):
learning_rate
,
parameter_list
=
None
,
regularization
=
None
,
grad_clip
=
None
,
name
=
None
):
assert
learning_rate
is
not
None
super
(
SGDOptimizer
,
self
).
__init__
(
learning_rate
=
learning_rate
,
parameter_list
=
parameter_list
,
regularization
=
regularization
,
grad_clip
=
grad_clip
,
name
=
name
)
self
.
type
=
"sgd"
...
...
@@ -962,6 +962,10 @@ class MomentumOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be
\
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.
\
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information.
\
For details, please refer to :ref:`api_guide_Name`. Default is None.
...
...
@@ -1002,6 +1006,7 @@ class MomentumOptimizer(Optimizer):
parameter_list
=
None
,
use_nesterov
=
False
,
regularization
=
None
,
grad_clip
=
None
,
name
=
None
):
assert
learning_rate
is
not
None
assert
momentum
is
not
None
...
...
@@ -1009,6 +1014,7 @@ class MomentumOptimizer(Optimizer):
learning_rate
=
learning_rate
,
parameter_list
=
parameter_list
,
regularization
=
regularization
,
grad_clip
=
grad_clip
,
name
=
name
)
self
.
type
=
"momentum"
self
.
_momentum
=
momentum
...
...
@@ -1097,13 +1103,14 @@ class DGCMomentumOptimizer(Optimizer):
This parameter is required in dygraph mode.
\
The default value is None in static mode, at this time all parameters will be updated.
use_nesterov (bool): Enables Nesterov momentum. True means use Nesterov. Default is False.
local_grad_clip_norm (float, optional): Local gradient clip norm value. Optional, default is None, represent no need clip.
num_trainers (int, optional): The number of training nodes. Optional, default is None.
regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method:
\
:ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set
\
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be
\
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.
\
Default None, meaning there is no regularization.
grad_clip (GradientClipByNorm, optional): Gradient cliping strategy. ``DGCMomentumOptimizer`` only support
:ref:`api_fluid_clip_GradientClipByNorm` , and if not, it will raise TypeError. Default None,
meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information.
\
For details, please refer to :ref:`api_guide_Name`. Default is None.
...
...
@@ -1130,9 +1137,9 @@ class DGCMomentumOptimizer(Optimizer):
sparsity
=
[
0.999
],
parameter_list
=
None
,
use_nesterov
=
False
,
local_grad_clip_norm
=
None
,
num_trainers
=
None
,
regularization
=
None
,
grad_clip
=
None
,
name
=
None
):
if
framework
.
in_dygraph_mode
():
raise
Exception
(
"In dygraph, don't support DGCMomentumOptimizer."
)
...
...
@@ -1146,6 +1153,7 @@ class DGCMomentumOptimizer(Optimizer):
learning_rate
=
learning_rate
,
parameter_list
=
parameter_list
,
regularization
=
regularization
,
grad_clip
=
grad_clip
,
name
=
name
)
self
.
type
=
"dgc_momentum"
self
.
_momentum
=
momentum
...
...
@@ -1159,20 +1167,23 @@ class DGCMomentumOptimizer(Optimizer):
self
.
_rampup_begin_step_var
=
None
self
.
_global_step_var
=
None
self
.
_local_grad_clip_norm
=
None
self
.
_clip_norm
=
None
if
local_grad_clip_norm
is
not
None
:
assert
isinstance
(
num_trainers
,
int
)
assert
isinstance
(
local_grad_clip_norm
,
float
)
assert
num_trainers
>
0
self
.
_dgc_clip_norm
=
None
if
grad_clip
is
not
None
:
if
not
isinstance
(
grad_clip
,
GradientClipByNorm
):
raise
TypeError
(
"The type of grad_clip should be 'GradientClipByNorm', because DGCMomentumOptimizer only support GradientClipByNorm"
)
assert
isinstance
(
num_trainers
,
int
),
"The type of num_trainers should be 'int', but received %s"
%
type
(
value
)
assert
num_trainers
>
0
,
"The value of num_trainers should be greater than 0!"
self
.
_local_grad_clip_norm
=
local_grad_clip_norm
self
.
_num_trainers
=
num_trainers
self
.
_
clip_norm
=
local_grad_
clip_norm
*
(
num_trainers
**-
0.5
)
self
.
_
dgc_clip_norm
=
grad_clip
.
clip_norm
*
(
num_trainers
**-
0.5
)
self
.
regular_type
,
self
.
regular_coeff
=
self
.
_get_regularization_param
(
self
.
regularization
)
self
.
_grad_clip
=
None
def
_get_regularization_param
(
self
,
regularization
):
regular_type
=
0
...
...
@@ -1342,8 +1353,8 @@ class DGCMomentumOptimizer(Optimizer):
op
.
_remove_attr
(
op_maker
.
kOpRoleVarAttrName
())
clip_var
=
grad_var
if
self
.
_
local_grad
_clip_norm
is
not
None
:
clip_var
=
self
.
_append_clip_norm
(
grad_var
,
self
.
_clip_norm
)
if
self
.
_
dgc
_clip_norm
is
not
None
:
clip_var
=
self
.
_append_clip_norm
(
grad_var
,
self
.
_
dgc_
clip_norm
)
self
.
_dgc_op
(
param_var
,
clip_var
,
grad_var
,
u_var
,
v_var
,
k_var
,
encoded_var
,
gather_var
)
...
...
@@ -1494,6 +1505,10 @@ class LarsMomentumOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be
\
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.
\
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information.
\
For details, please refer to :ref:`api_guide_Name`. Default is None.
...
...
@@ -1526,6 +1541,7 @@ class LarsMomentumOptimizer(Optimizer):
lars_weight_decay
=
0.0005
,
parameter_list
=
None
,
regularization
=
None
,
grad_clip
=
None
,
name
=
None
):
assert
learning_rate
is
not
None
assert
momentum
is
not
None
...
...
@@ -1533,6 +1549,7 @@ class LarsMomentumOptimizer(Optimizer):
learning_rate
=
learning_rate
,
parameter_list
=
parameter_list
,
regularization
=
regularization
,
grad_clip
=
grad_clip
,
name
=
name
)
self
.
type
=
"lars_momentum"
self
.
_momentum
=
momentum
...
...
@@ -1607,6 +1624,10 @@ class AdagradOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be
\
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.
\
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
...
...
@@ -1639,6 +1660,7 @@ class AdagradOptimizer(Optimizer):
epsilon
=
1.0e-6
,
parameter_list
=
None
,
regularization
=
None
,
grad_clip
=
None
,
name
=
None
,
initial_accumulator_value
=
0.0
):
assert
learning_rate
is
not
None
...
...
@@ -1647,6 +1669,7 @@ class AdagradOptimizer(Optimizer):
learning_rate
=
learning_rate
,
parameter_list
=
parameter_list
,
regularization
=
regularization
,
grad_clip
=
grad_clip
,
name
=
name
)
self
.
type
=
"adagrad"
self
.
_epsilon
=
epsilon
...
...
@@ -1726,6 +1749,10 @@ class AdamOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be
\
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.
\
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
...
...
@@ -1835,6 +1862,7 @@ class AdamOptimizer(Optimizer):
epsilon
=
1e-8
,
parameter_list
=
None
,
regularization
=
None
,
grad_clip
=
None
,
name
=
None
,
lazy_mode
=
False
):
assert
learning_rate
is
not
None
...
...
@@ -1845,6 +1873,7 @@ class AdamOptimizer(Optimizer):
learning_rate
=
learning_rate
,
parameter_list
=
parameter_list
,
regularization
=
regularization
,
grad_clip
=
grad_clip
,
name
=
name
)
self
.
type
=
"adam"
self
.
_beta1
=
beta1
...
...
@@ -1986,6 +2015,10 @@ class AdamaxOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be
\
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.
\
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
...
...
@@ -2031,6 +2064,7 @@ class AdamaxOptimizer(Optimizer):
epsilon
=
1e-8
,
parameter_list
=
None
,
regularization
=
None
,
grad_clip
=
None
,
name
=
None
):
assert
learning_rate
is
not
None
assert
beta1
is
not
None
...
...
@@ -2040,6 +2074,7 @@ class AdamaxOptimizer(Optimizer):
learning_rate
=
learning_rate
,
parameter_list
=
parameter_list
,
regularization
=
regularization
,
grad_clip
=
grad_clip
,
name
=
name
)
self
.
type
=
"adamax"
self
.
_beta1
=
beta1
...
...
@@ -2238,6 +2273,10 @@ class DecayedAdagradOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be
\
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.
\
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
...
...
@@ -2264,6 +2303,7 @@ class DecayedAdagradOptimizer(Optimizer):
epsilon
=
1.0e-6
,
parameter_list
=
None
,
regularization
=
None
,
grad_clip
=
None
,
name
=
None
):
assert
learning_rate
is
not
None
assert
decay
is
not
None
...
...
@@ -2273,6 +2313,7 @@ class DecayedAdagradOptimizer(Optimizer):
learning_rate
=
learning_rate
,
parameter_list
=
parameter_list
,
regularization
=
regularization
,
grad_clip
=
grad_clip
,
name
=
name
)
self
.
type
=
"decayed_adagrad"
self
.
_decay
=
decay
...
...
@@ -2337,6 +2378,10 @@ class AdadeltaOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be
\
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.
\
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): The default value is None. Normally there is no need for user
to set this property. For more information, please refer to
:ref:`api_guide_Name` .
...
...
@@ -2367,6 +2412,7 @@ class AdadeltaOptimizer(Optimizer):
rho
=
0.95
,
parameter_list
=
None
,
regularization
=
None
,
grad_clip
=
None
,
name
=
None
):
if
learning_rate
is
None
:
raise
ValueError
(
"learning_rate is not set."
)
...
...
@@ -2378,6 +2424,7 @@ class AdadeltaOptimizer(Optimizer):
learning_rate
=
learning_rate
,
parameter_list
=
parameter_list
,
regularization
=
regularization
,
grad_clip
=
grad_clip
,
name
=
name
)
self
.
type
=
"adadelta"
self
.
_epsilon
=
epsilon
...
...
@@ -2488,6 +2535,10 @@ class RMSPropOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be
\
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.
\
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information.
\
For details, please refer to :ref:`api_guide_Name`. Default is None.
...
...
@@ -2536,11 +2587,13 @@ class RMSPropOptimizer(Optimizer):
centered
=
False
,
parameter_list
=
None
,
regularization
=
None
,
grad_clip
=
None
,
name
=
None
):
super
(
RMSPropOptimizer
,
self
).
__init__
(
learning_rate
=
learning_rate
,
parameter_list
=
parameter_list
,
regularization
=
regularization
,
grad_clip
=
grad_clip
,
name
=
name
)
if
learning_rate
is
None
:
raise
ValueError
(
"learning_rate is not set."
)
...
...
@@ -2656,6 +2709,10 @@ class FtrlOptimizer(Optimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be
\
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.
\
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information.
\
For details, please refer to :ref:`api_guide_Name`. Default is None.
...
...
@@ -2704,11 +2761,13 @@ class FtrlOptimizer(Optimizer):
lr_power
=-
0.5
,
parameter_list
=
None
,
regularization
=
None
,
grad_clip
=
None
,
name
=
None
):
super
(
FtrlOptimizer
,
self
).
__init__
(
learning_rate
=
learning_rate
,
parameter_list
=
parameter_list
,
regularization
=
regularization
,
grad_clip
=
grad_clip
,
name
=
name
)
if
learning_rate
is
None
:
raise
ValueError
(
"learning_rate is not set."
)
...
...
@@ -2798,6 +2857,10 @@ class LambOptimizer(AdamOptimizer):
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be
\
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.
\
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
exclude_from_weight_decay_fn (function|None): Exclude a parameter from weight
decay when **exclude_from_weight_decay_fn(parameter)** returns true.
Default None.
...
...
@@ -2834,6 +2897,7 @@ class LambOptimizer(AdamOptimizer):
epsilon
=
1e-6
,
parameter_list
=
None
,
regularization
=
None
,
grad_clip
=
None
,
exclude_from_weight_decay_fn
=
None
,
name
=
None
):
assert
learning_rate
is
not
None
...
...
@@ -2845,6 +2909,7 @@ class LambOptimizer(AdamOptimizer):
learning_rate
=
learning_rate
,
parameter_list
=
parameter_list
,
regularization
=
regularization
,
grad_clip
=
grad_clip
,
beta1
=
beta1
,
beta2
=
beta2
,
epsilon
=
epsilon
,
...
...
@@ -4043,20 +4108,13 @@ class RecomputeOptimizer(Optimizer):
loss
,
startup_program
=
None
,
parameter_list
=
None
,
no_grad_set
=
None
,
grad_clip
=
None
):
no_grad_set
=
None
):
assert
isinstance
(
loss
,
Variable
),
"The loss should be an Variable."
assert
(
self
.
_checkpoints
is
not
None
),
"You should call _set_checkpoints first"
if
framework
.
in_dygraph_mode
():
raise
NotImplementedError
(
"DyGraph current does not support recompute"
)
if
grad_clip
is
not
None
:
if
not
isinstance
(
grad_clip
,
GradientClipBase
):
raise
TypeError
(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
self
.
_optimizer
.
_grad_clip
=
grad_clip
params_grads
=
self
.
backward
(
loss
,
startup_program
=
startup_program
,
...
...
python/paddle/fluid/param_attr.py
浏览文件 @
2bca3295
...
...
@@ -34,7 +34,7 @@ class ParamAttr(object):
Note:
``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
It is recommended to
use ``minimize(loss, grad_clip=clip)
`` to clip gradient.
It is recommended to
set ``grad_clip`` in ``optimizer
`` to clip gradient.
There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
:ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
...
...
python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
浏览文件 @
2bca3295
...
...
@@ -19,6 +19,7 @@ import unittest
import
paddle.fluid.framework
as
framework
import
paddle.fluid.optimizer
as
optimizer
import
paddle.fluid.regularizer
as
regularizer
import
paddle.fluid.clip
as
clip
import
paddle.compat
as
cpt
from
paddle.fluid.backward
import
append_backward
from
paddle.fluid.transpiler.details
import
program_to_code
...
...
@@ -70,9 +71,9 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
learning_rate
=
learning_rate
,
momentum
=
0.2
,
rampup_begin_step
=
0
,
local_grad_clip_norm
=
1.0
,
num_trainers
=
2
,
regularization
=
regularization
)
regularization
=
regularization
,
grad_clip
=
clip
.
GradientClipByNorm
(
1.0
))
if
use_recompute
:
dgc_momentum_optimizer
=
optimizer
.
RecomputeOptimizer
(
...
...
@@ -124,6 +125,16 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
#with open("test_dgc_optimizer_" + name + str(use_recompute) + ".log", "w") as f:
# program_to_code(program, fout=f)
def
test_tpyeError
(
self
):
# the type of DGCMomentumOptimizer(grad_clip=) must be 'GradientClipByNorm'
with
self
.
assertRaises
(
TypeError
):
dgc_momentum_optimizer
=
self
.
MockDGCMomentum
(
learning_rate
=
0.01
,
momentum
=
0.2
,
rampup_begin_step
=
0
,
num_trainers
=
2
,
grad_clip
=
clip
.
GradientClipByGlobalNorm
(
1.0
))
def
test_momentum_without_dgc
(
self
):
self
.
check_dgc_momentum_optimizer
(
regularization
=
regularizer
.
L1Decay
(
1e-4
))
...
...
python/paddle/fluid/tests/unittests/test_gradient_clip.py
浏览文件 @
2bca3295
...
...
@@ -76,8 +76,8 @@ class TestGradientClip(unittest.TestCase):
startup_program
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main_program
=
prog
,
startup_program
=
startup_program
):
image
=
fluid
.
data
(
name
=
'x'
,
shape
=
[
-
1
,
784
],
dtype
=
'float32'
)
label
=
fluid
.
data
(
name
=
'y'
,
shape
=
[
-
1
,
1
],
dtype
=
'int64'
)
image
=
fluid
.
data
(
name
=
"a"
,
shape
=
[
-
1
,
784
],
dtype
=
'float32'
)
label
=
fluid
.
data
(
name
=
"b"
,
shape
=
[
-
1
,
1
],
dtype
=
'int64'
)
hidden
=
fluid
.
layers
.
fc
(
input
=
image
,
size
=
32
,
act
=
'relu'
)
predict
=
fluid
.
layers
.
fc
(
input
=
hidden
,
size
=
10
,
act
=
'softmax'
)
...
...
@@ -112,13 +112,13 @@ class TestGradientClip(unittest.TestCase):
self
.
check_clip_result
(
out
,
out_clip
)
def
check_sparse_gradient_clip
(
self
,
place
):
prog
=
fluid
.
framework
.
Program
()
startup_program
=
fluid
.
framework
.
Program
()
prog
=
fluid
.
Program
()
startup_program
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main_program
=
prog
,
startup_program
=
startup_program
):
data
=
fluid
.
layers
.
data
(
name
=
"words"
,
shape
=
[
1
],
dtype
=
"int64"
,
lod_level
=
1
)
label
=
fluid
.
layers
.
data
(
name
=
"label"
,
shape
=
[
1
],
dtype
=
"int64"
)
data
=
fluid
.
data
(
name
=
"words"
,
shape
=
[
-
1
,
1
],
dtype
=
"int64"
,
lod_level
=
1
)
label
=
fluid
.
data
(
name
=
"label"
,
shape
=
[
-
1
,
1
],
dtype
=
"int64"
)
cost
=
bow_net
(
data
,
label
,
self
.
word_dict_len
)
self
.
backward_and_optimize
(
cost
)
...
...
@@ -172,7 +172,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
self
.
clip_gradient
=
func
self
.
check_gradient_clip
(
fluid
.
CPUPlace
())
# test whether the ouput is right when use
'minimize(grad_clip)'
# test whether the ouput is right when use
grad_clip
def
test_new_gradient_clip
(
self
):
def
func
(
params_grads
):
clip
=
fluid
.
clip
.
GradientClipByGlobalNorm
(
clip_norm
=
self
.
clip_norm
)
...
...
@@ -192,9 +192,10 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
clip
=
fluid
.
clip
.
GradientClipByGlobalNorm
(
clip_norm
=
5.0
,
need_clip
=
fileter_func
)
fluid
.
clip
.
set_gradient_clip
(
clip
)
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.01
)
# if 'set_gradient_clip' and 'minimize(grad_clip)' together, 'set_gradient_clip' will be ineffective
sgd_optimizer
.
minimize
(
cost
,
grad_clip
=
clip
)
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.01
,
grad_clip
=
clip
)
# if 'set_gradient_clip' and 'optimize(grad_clip)' together, 'set_gradient_clip' will be ineffective
sgd_optimizer
.
minimize
(
cost
)
# 'set_gradient_clip' must before 'minimize', otherwise, 'set_gradient_clip' will be ineffective
fluid
.
clip
.
set_gradient_clip
(
clip
)
...
...
@@ -232,24 +233,10 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
clip
=
fluid
.
clip
.
GradientClipByGlobalNorm
(
clip_norm
=
self
.
clip_norm
,
need_clip
=
"test"
)
# the type of minimize(grad_clip=) must be an instance of GradientClipBase's derived class
with
self
.
assertRaises
(
TypeError
):
x
=
fluid
.
default_main_program
().
global_block
().
create_parameter
(
name
=
"x"
,
shape
=
[
2
,
3
],
dtype
=
"float32"
)
loss
=
fluid
.
layers
.
reduce_mean
(
x
)
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.1
)
sgd_optimizer
.
minimize
(
loss
,
grad_clip
=
"test"
)
# the type of RecomputeOptimizer.minimize(grad_clip=) must be an instance of GradientClipBase's derived class
# the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class
with
self
.
assertRaises
(
TypeError
):
x
=
fluid
.
default_main_program
().
global_block
().
create_parameter
(
name
=
"x"
,
shape
=
[
2
,
3
],
dtype
=
"float32"
)
loss
=
fluid
.
layers
.
reduce_mean
(
x
)
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.1
)
recompute_optimizer
=
fluid
.
optimizer
.
RecomputeOptimizer
(
sgd_optimizer
)
recompute_optimizer
.
_set_checkpoints
([
x
])
recompute_optimizer
.
minimize
(
loss
,
grad_clip
=
"test"
)
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.1
,
grad_clip
=
"test"
)
class
TestGradientClipByNorm
(
TestGradientClip
):
...
...
@@ -271,7 +258,7 @@ class TestGradientClipByNorm(TestGradientClip):
a
=
u
,
b
=
v
,
rtol
=
1e-5
,
atol
=
1e-8
),
"gradient clip by norm has wrong results!"
)
# test whether the ouput is right when use
'minimize(grad_clip)'
# test whether the ouput is right when use
grad_clip
def
test_gradient_clip
(
self
):
self
.
check_gradient_clip
(
fluid
.
CPUPlace
())
...
...
@@ -319,7 +306,7 @@ class TestGradientClipByValue(TestGradientClip):
a
=
u
,
b
=
v
,
rtol
=
1e-6
,
atol
=
1e-8
),
"gradient clip by value has wrong results!"
)
# test whether the ouput is right when use
'minimize(grad_clip)'
# test whether the ouput is right when use
grad_clip
def
test_gradient_clip
(
self
):
self
.
check_gradient_clip
(
fluid
.
CPUPlace
())
...
...
@@ -357,7 +344,9 @@ class TestDygraphGradientClip(unittest.TestCase):
loss
=
fluid
.
layers
.
reduce_mean
(
out
)
loss
.
backward
()
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.0
,
parameter_list
=
linear
.
parameters
())
learning_rate
=
0.0
,
parameter_list
=
linear
.
parameters
(),
grad_clip
=
fluid
.
clip
.
GradientClipByGlobalNorm
(
0.1
))
self
.
check_clip_result
(
loss
,
sgd_optimizer
)
def
check_clip_result
(
self
,
loss
,
optimizer
):
...
...
@@ -384,7 +373,7 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
np
.
array
([
3
,
4
]).
astype
(
"float32"
),
name
=
"y"
)
assert
len
(
self
.
clip1
([(
x
,
x
),
(
x
,
y
),
(
x
,
None
)]))
==
2
# get params and grads from network
opt
,
params_grads
=
optimizer
.
minimize
(
loss
,
grad_clip
=
self
.
clip2
)
opt
,
params_grads
=
optimizer
.
minimize
(
loss
)
_
,
grads
=
zip
(
*
params_grads
)
params_grads
=
self
.
clip2
(
params_grads
)
_
,
grads_clip
=
zip
(
*
params_grads
)
...
...
@@ -426,7 +415,7 @@ class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
assert
len
(
self
.
clip
([(
x
,
None
)]))
==
0
# get params and grads from network
self
.
clip
([(
fluid
.
dygraph
.
to_variable
(
np
.
array
([
2
,
3
])),
None
)])
params_grads
=
optimizer
.
backward
(
loss
)
opt
,
params_grads
=
optimizer
.
minimize
(
loss
)
_
,
grads
=
zip
(
*
params_grads
)
params_grads
=
self
.
clip
(
params_grads
)
_
,
grads_clip
=
zip
(
*
params_grads
)
...
...
@@ -460,7 +449,7 @@ class TestDygraphGradientClipByValue(TestDygraphGradientClip):
x
=
fluid
.
dygraph
.
to_variable
(
np
.
array
([
2
,
3
]).
astype
(
"float32"
))
assert
len
(
self
.
clip
([(
x
,
None
)]))
==
0
# get params and grads from network
params_grads
=
optimizer
.
backward
(
loss
)
opt
,
params_grads
=
optimizer
.
minimize
(
loss
)
_
,
grads
=
zip
(
*
params_grads
)
params_grads
=
self
.
clip
(
params_grads
)
_
,
grads_clip
=
zip
(
*
params_grads
)
...
...
python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
浏览文件 @
2bca3295
...
...
@@ -329,9 +329,9 @@ class TestImperativeAutoPrune(unittest.TestCase):
place
=
fluid
.
CPUPlace
()
with
fluid
.
dygraph
.
guard
(
place
):
model
=
MyLayer
(
size
,
vocab_size
,
size
)
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
0.001
,
parameter_list
=
model
.
parameters
())
grad_clip
=
fluid
.
clip
.
GradientClipByGlobalNorm
(
0.001
)
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
0.001
,
parameter_list
=
model
.
parameters
(),
grad_clip
=
grad_clip
)
indices
=
fluid
.
dygraph
.
to_variable
(
indices
)
embed
=
fluid
.
dygraph
.
to_variable
(
embed
)
...
...
@@ -339,7 +339,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
loss
=
model
.
embed_linear0
(
indices
)
loss
.
backward
()
_
,
params_grads
=
optimizer
.
minimize
(
loss
,
grad_clip
=
grad_clip
)
_
,
params_grads
=
optimizer
.
minimize
(
loss
)
for
items
in
params_grads
:
assert
items
[
0
].
name
is
not
model
.
embed1
.
weight
.
name
assert
items
[
0
].
name
is
not
model
.
linear_1
.
weight
.
name
...
...
@@ -348,9 +348,9 @@ class TestImperativeAutoPrune(unittest.TestCase):
with
fluid
.
dygraph
.
guard
(
place
):
model
=
MyLayer2
(
size
,
vocab_size
,
size
)
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
0.001
,
parameter_list
=
model
.
parameters
())
grad_clip
=
fluid
.
clip
.
GradientClipByGlobalNorm
(
0.001
)
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
0.001
,
parameter_list
=
model
.
parameters
(),
grad_clip
=
grad_clip
)
indices
=
fluid
.
dygraph
.
to_variable
(
indices
)
emebd
=
fluid
.
dygraph
.
to_variable
(
embed
)
...
...
@@ -358,7 +358,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
loss
=
model
.
embed_linear0
(
indices
)
loss
.
backward
()
optimizer
.
minimize
(
loss
,
grad_clip
=
grad_clip
)
optimizer
.
minimize
(
loss
)
for
items
in
params_grads
:
assert
items
[
0
].
name
is
not
model
.
embed1
.
weight
.
name
assert
items
[
0
].
name
is
not
model
.
linear_1
.
weight
.
name
...
...
python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
浏览文件 @
2bca3295
...
...
@@ -58,14 +58,15 @@ class TestSimpleNet(unittest.TestCase):
simplenet
=
SimpleNet
(
20
,
32
,
dtype
)
adam
=
SGDOptimizer
(
learning_rate
=
0.001
,
parameter_list
=
simplenet
.
parameters
())
parameter_list
=
simplenet
.
parameters
(
))
# grad_clip=grad_clip
input_emb
,
emb
=
simplenet
(
input
)
self
.
assertTrue
(
emb
.
weight
.
gradient
()
is
None
)
self
.
assertTrue
(
input_emb
.
gradient
()
is
None
)
input_emb
.
backward
(
backward_strategy
)
adam
.
minimize
(
input_emb
)
# grad_clip=grad_clip
adam
.
minimize
(
input_emb
)
self
.
assertTrue
(
emb
.
weight
.
gradient
()
is
not
None
)
emb
.
clear_gradients
()
...
...
@@ -92,14 +93,15 @@ class TestSimpleNet(unittest.TestCase):
simplenet
=
SimpleNet
(
20
,
32
,
"float32"
)
adam
=
SGDOptimizer
(
learning_rate
=
0.001
,
parameter_list
=
simplenet
.
parameters
())
parameter_list
=
simplenet
.
parameters
(),
grad_clip
=
grad_clip
)
input_emb
,
emb
=
simplenet
(
input
)
self
.
assertTrue
(
emb
.
weight
.
gradient
()
is
None
)
self
.
assertTrue
(
input_emb
.
gradient
()
is
None
)
input_emb
.
backward
(
backward_strategy
)
adam
.
minimize
(
input_emb
,
grad_clip
=
grad_clip
)
adam
.
minimize
(
input_emb
)
self
.
assertTrue
(
emb
.
weight
.
gradient
()
is
not
None
)
emb
.
clear_gradients
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录