Commit 2bca3295 (unverified)
Authored by Zhou Wei on Apr 16, 2020; committed via GitHub on Apr 16, 2020.

    move the initialize position of grad_clip to optimizer(__init__),and speed up clip (#23839)

Parent commit: 21fc789e

7 changed files, with 198 additions and 127 deletions (+198 -127):
    python/paddle/fluid/clip.py                                            +52  -41
    python/paddle/fluid/optimizer.py                                       +97  -39
    python/paddle/fluid/param_attr.py                                       +1   -1
    python/paddle/fluid/tests/unittests/test_dgc_optimizer.py              +13   -2
    python/paddle/fluid/tests/unittests/test_gradient_clip.py              +23  -34
    python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py       +6   -6
    python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py    +6   -4
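Before the per-file diff, here is a minimal before/after sketch of the API change named in the commit title. It is not part of the commit itself; it is assembled from the docstring examples updated in clip.py and optimizer.py below, and the tiny network used only to produce a loss is illustrative.

    import paddle.fluid as fluid

    # A small network so that there is a loss to minimize (illustrative only).
    x = fluid.data(name='x', shape=[-1, 13], dtype='float32')
    y = fluid.data(name='y', shape=[-1, 1], dtype='float32')
    predict = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(input=predict, label=y))

    clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)

    # Old style, removed by this commit: grad_clip was passed to minimize().
    #   sgd = fluid.optimizer.SGDOptimizer(learning_rate=0.1)
    #   sgd.minimize(loss, grad_clip=clip)

    # New style: grad_clip is passed when the optimizer is constructed.
    sgd = fluid.optimizer.SGDOptimizer(learning_rate=0.1, grad_clip=clip)
    sgd.minimize(loss)

The same pattern applies in dygraph mode, where grad_clip is passed to the optimizer constructor alongside parameter_list.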
python/paddle/fluid/clip.py

@@ -137,9 +137,6 @@ class GradientClipBase(object):
         raise NotImplementedError

     def __call__(self, params_grads):
-        assert len(params_grads) > 0, "The number of trainable parameters should be greater than 0."
         if framework.in_dygraph_mode():
             return self._dygraph_clip(params_grads)
         else:
@@ -147,7 +144,7 @@ class GradientClipBase(object):
                 if getattr(p, 'gradient_clip_attr', None) is not None:
                     warnings.warn(
                         "'set_gradient_clip' will be ineffective, because you have "
-                        "pass 'grad_clip' into 'minimize'. So, 'set_gradient_clip' "
+                        "set 'grad_clip' in 'optimizer'. So, 'set_gradient_clip' "
                         "is redundant and you can remove it.")
                     break
             return self._static_clip(params_grads)
@@ -170,7 +167,7 @@ class GradientClipByValue(GradientClipBase):
    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
    is not None, then only part of gradients can be selected for gradient clipping.
-    Gradient clip will takes effect after being set in ``optimizer.minimize(grad_clip)`` , see the document ``optimizer``
+    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_fluid_optimizer_SGDOptimizer`).

    Args:
@@ -208,8 +205,8 @@ class GradientClipByValue(GradientClipBase):
            #     return Parameter.name=="fc_0.w_0"
            # clip = fluid.clip.GradientClipByValue(min=-1, max=1, need_clip=fileter_func)
-           sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1)
-           sgd_optimizer.minimize(loss, grad_clip=clip)
+           sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1, grad_clip=clip)
+           sgd_optimizer.minimize(loss)

            place = fluid.CPUPlace()
            exe = fluid.Executor(place)
@@ -242,8 +239,8 @@ class GradientClipByValue(GradientClipBase):
            # clip = fluid.clip.GradientClipByValue(min=-1, max=1, need_clip=fileter_func)
            sgd_optimizer = fluid.optimizer.SGD(
-               learning_rate=0.1, parameter_list=linear.parameters())
-           sgd_optimizer.minimize(loss, grad_clip=clip)
+               learning_rate=0.1, parameter_list=linear.parameters(), grad_clip=clip)
+           sgd_optimizer.minimize(loss)
    """

    def __init__(self, max, min=None, need_clip=None):
@@ -272,6 +269,7 @@ class GradientClipByValue(GradientClipBase):
    def _static_clip(self, params_grads):
        params_and_grads = []
+       param_new_grad_name_dict = dict()
        with framework.name_scope('gradient_clip'):
            for p, g in params_grads:
                if g is None:
@@ -284,7 +282,8 @@ class GradientClipByValue(GradientClipBase):
                with p.block.program._optimized_guard([p, g]):
                    new_grad = layers.clip(x=g, min=self.min, max=self.max)
                params_and_grads.append((p, new_grad))
-       _correct_clip_op_role_var(params_and_grads)
+               param_new_grad_name_dict[p.name] = new_grad.name
+       _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
        return params_and_grads

    def _process_context(self, context, param, grad):
@@ -306,7 +305,7 @@ class GradientClipByNorm(GradientClipBase):
    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
    is not None, then only part of gradients can be selected for gradient clipping.
-    Gradient clip will takes effect after being set in ``optimizer.minimize(grad_clip)`` , see the document ``optimizer``
+    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_fluid_optimizer_SGDOptimizer`).

    The clipping formula is:
@@ -359,8 +358,8 @@ class GradientClipByNorm(GradientClipBase):
            #     return Parameter.name=="fc_0.w_0"
            # clip = fluid.clip.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func)
-           sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1)
-           sgd_optimizer.minimize(loss, grad_clip=clip)
+           sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1, grad_clip=clip)
+           sgd_optimizer.minimize(loss)

            place = fluid.CPUPlace()
            exe = fluid.Executor(place)
@@ -394,8 +393,8 @@ class GradientClipByNorm(GradientClipBase):
            # clip = fluid.clip.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func)
            sgd_optimizer = fluid.optimizer.SGD(
-               learning_rate=0.1, parameter_list=linear.parameters())
-           sgd_optimizer.minimize(loss, grad_clip=clip)
+               learning_rate=0.1, parameter_list=linear.parameters(), grad_clip=clip)
+           sgd_optimizer.minimize(loss)
    """
@@ -422,6 +421,7 @@ class GradientClipByNorm(GradientClipBase):
    def _static_clip(self, params_grads):
        params_and_grads = []
        with framework.name_scope('gradient_clip'):
+           param_new_grad_name_dict = dict()
            for p, g in params_grads:
                if g is None:
                    continue
@@ -432,8 +432,9 @@ class GradientClipByNorm(GradientClipBase):
                with p.block.program._optimized_guard([p, g]):
                    new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
+               param_new_grad_name_dict[p.name] = new_grad.name
                params_and_grads.append((p, new_grad))
-       _correct_clip_op_role_var(params_and_grads)
+       _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
        return params_and_grads

    def _process_context(self, context, param, grad):
@@ -456,7 +457,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
    is not None, then only part of gradients can be selected for gradient clipping.
-    Gradient clip will takes effect after being set in ``optimizer.minimize(grad_clip)`` , see the document ``optimizer``
+    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_fluid_optimizer_SGDOptimizer`).

    The clipping formula is:
@@ -505,8 +506,8 @@ class GradientClipByGlobalNorm(GradientClipBase):
            #     return Parameter.name=="fc_0.w_0"
            # clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func)
-           sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1)
-           sgd_optimizer.minimize(loss, grad_clip=clip)
+           sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1, grad_clip=clip)
+           sgd_optimizer.minimize(loss)

            place = fluid.CPUPlace()
            exe = fluid.Executor(place)
@@ -539,8 +540,8 @@ class GradientClipByGlobalNorm(GradientClipBase):
            # clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func)
            sgd_optimizer = fluid.optimizer.SGD(
-               learning_rate=0.1, parameter_list=linear.parameters())
-           sgd_optimizer.minimize(loss, grad_clip=clip)
+               learning_rate=0.1, parameter_list=linear.parameters(), grad_clip=clip)
+           sgd_optimizer.minimize(loss)
    """
@@ -628,6 +629,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
            y=layers.elementwise_max(
                x=max_global_norm, y=global_norm_var))
+       param_new_grad_name_dict = dict()
        for p, g in params_grads:
            if g is None:
                continue
@@ -638,9 +640,10 @@ class GradientClipByGlobalNorm(GradientClipBase):
            with p.block.program._optimized_guard([p, g]):
                new_grad = layers.elementwise_mul(x=g, y=scale_var)
+               param_new_grad_name_dict[p.name] = new_grad.name
            params_and_grads.append((p, new_grad))

-       _correct_clip_op_role_var(params_and_grads)
+       _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
        return params_and_grads

    def _process_context(self, context, param, grad):
@@ -692,9 +695,10 @@ def set_gradient_clip(clip, param_list=None, program=None):
        This API must be used after building network, and before ``minimize`` ,
        and it may be removed in future releases, so it is not recommended.
-       It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient.
-       There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
-       :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
+       It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
+       this is a better method to clip gradient. There are three clipping strategies:
+       :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+       :ref:`api_fluid_clip_GradientClipByValue` .

        To specify parameters that require gradient clip.
@@ -757,7 +761,7 @@ def set_gradient_clip(clip, param_list=None, program=None):
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)

-           # network 4: use 'set_gradient_clip' and 'minimize(grad_clip=clip)' together
+           # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                clip1 = fluid.clip.GradientClipByValue(min=-1.0, max=1.0)
@@ -765,8 +769,8 @@ def set_gradient_clip(clip, param_list=None, program=None):
                # Set the gradient clipping strategy: clip1
                fluid.clip.set_gradient_clip(clip1)
                # Set the gradient clipping strategy: clip2
-               sgd = fluid.optimizer.SGD(learning_rate=1e-3)
-               sgd.minimize(loss, grad_clip=clip2)
+               sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2)
+               sgd.minimize(loss)
                # 'set_gradient_clip' will not take effect when setting has a conflict,
                # and the gradient clipping strategy will be 'clip2'
@@ -774,10 +778,10 @@ def set_gradient_clip(clip, param_list=None, program=None):
    """
    warnings.warn("Caution! 'set_gradient_clip' is not recommended "
                  "and may be deprecated in future! "
-                 "We recommend a new strategy: clip gradient by "
-                 "'optimizer.minimize(loss, grad_clip=clip)'. "
+                 "We recommend a new strategy: set 'grad_clip' "
+                 "when initializing the 'optimizer'. "
                  "This method can reduce the mistakes, please "
-                 "see documention of 'optimzier.minimize'.")
+                 "refer to documention of 'optimizer'.")

    if not isinstance(clip, GradientClipBase):
        raise TypeError(
@@ -824,33 +828,40 @@ def append_gradient_clip_ops(param_grads):
            clip_attr._process_context(context=context, param=p, grad=g)

    res = []
+   param_new_grad_name_dict = dict()
    for p, g in param_grads:
        if g is None:
            continue
        with p.block.program._optimized_guard(
                [p, g]), framework.name_scope('graident_clip_@CLIP'):
            param, new_grad = clip_attr._create_operators(param=p, grad=g)
+           param_new_grad_name_dict[param.name] = new_grad.name
            res.append([param, new_grad])

-   _correct_clip_op_role_var(res)
+   _correct_clip_op_role_var(res, param_new_grad_name_dict)
    return res


# change wrong mapping relation between param & grad in clip op
-def _correct_clip_op_role_var(params_grads):
+def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
+   block_id_list = []
+   if len(param_new_grad_name_dict) == 0:
+       return
    for param, grad in params_grads:
        if grad is None:
            continue
+       block_id = param.block.idx
+       if block_id in block_id_list:
+           continue
+       block_id_list.append(block_id)
        for op in param.block.program.global_block().ops:
            if 'op_namescope' in op.all_attrs() and "gradient_clip" in op.attr(
-                   "op_namescope"):
-               if op.attr('op_role_var'):
-                   param_name = op.attr('op_role_var')[0]
-                   index = 0
-                   for i in range(len(params_grads)):
-                       if params_grads[i][0].name == param_name:
-                           index = i
-                   correct_p_g = [param_name, params_grads[index][1].name]
-                   op._set_attr('op_role_var', correct_p_g)
+                   "op_namescope") and op.attr('op_role_var'):
+               param_name = op.attr('op_role_var')[0]
+               if param_name in param_new_grad_name_dict:
+                   correct_p_g = [
+                       param_name, param_new_grad_name_dict[param_name]
+                   ]
+                   op._set_attr('op_role_var', correct_p_g)
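The "speed up clip" half of the commit is visible in _correct_clip_op_role_var above: the callers now build param_new_grad_name_dict while appending clip ops, so the corrected op_role_var comes from a dict lookup instead of re-scanning params_grads for every clip op, and blocks already visited are skipped via block_id_list. Below is a standalone toy sketch of that lookup structure; the names used here are made up for illustration and are not Paddle variable names:

    # Toy illustration of the dict-based correction; not Paddle code.
    params_grads = [("fc_0.w_0", "fc_0.w_0@GRAD"), ("fc_0.b_0", "fc_0.b_0@GRAD")]

    # Built once while appending clip ops, keyed by parameter name.
    param_new_grad_name_dict = {p: g + "_clipped" for p, g in params_grads}

    def corrected_role_var(param_name):
        # The old code looped over params_grads to find the matching entry;
        # the new code is a single dictionary lookup.
        if param_name in param_new_grad_name_dict:
            return [param_name, param_new_grad_name_dict[param_name]]
        return None

    print(corrected_role_var("fc_0.w_0"))  # ['fc_0.w_0', 'fc_0.w_0@GRAD_clipped']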
python/paddle/fluid/optimizer.py

@@ -24,7 +24,7 @@ from . import framework
 from . import layers
 from . import unique_name
 from .backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
-from .clip import GradientClipBase, error_clip_callback, append_gradient_clip_ops
+from .clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops
 from .framework import program_guard
 from .initializer import Constant
 from .layer_helper import LayerHelper
@@ -64,6 +64,7 @@ class Optimizer(object):
                  learning_rate,
                  parameter_list=None,
                  regularization=None,
+                 grad_clip=None,
                  name=None):
         self._parameter_list = parameter_list
         if framework.in_dygraph_mode():
@@ -88,7 +89,13 @@ class Optimizer(object):
                     type(learning_rate))
         self._name = name
+        if grad_clip is not None:
+            if not isinstance(grad_clip, GradientClipBase):
+                raise TypeError(
+                    "'grad_clip' should be an instance of GradientClipBase's derived class"
+                )
         self.regularization = regularization
+        self._grad_clip = grad_clip
         self._learning_rate = learning_rate
         # the learning rate type should be inferenced from loss
         self._dtype = None
@@ -107,8 +114,6 @@ class Optimizer(object):
         self._opti_name_list = []
         self._accumulators_holder = {}
         self._param_device_map = dict()
-        # if pass grad_clip into minimize, it will not be None
-        self._grad_clip = None

     @framework.dygraph_only
     def state_dict(self):
@@ -787,8 +792,7 @@ class Optimizer(object):
                  loss,
                  startup_program=None,
                  parameter_list=None,
-                 no_grad_set=None,
-                 grad_clip=None):
+                 no_grad_set=None):
         """
         Add operations to minimize ``loss`` by updating ``parameter_list``.
@@ -801,12 +805,7 @@ class Optimizer(object):
                 to minimize ``loss``. The default value is None, at this time all parameters
                 will be updated.
             no_grad_set (set, optional): Set of ``Variable`` or ``Variable.name`` that don't need
                 to be updated. The default value is None.
-            grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
-                some derived class of ``GradientClipBase`` . There are three cliping strategies
-                ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
-                :ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no
-                gradient clipping.

         Returns:
             tuple: tuple (optimize_ops, params_grads), A list of operators appended
@@ -820,12 +819,7 @@ class Optimizer(object):
             Please refer to the example of current Optimizer.
         """
         assert isinstance(loss, Variable), "The loss should be an Variable."
-        if grad_clip is not None:
-            if not isinstance(grad_clip, GradientClipBase):
-                raise TypeError(
-                    "'grad_clip' should be an instance of GradientClipBase's derived class"
-                )
-            self._grad_clip = grad_clip
         parameter_list = parameter_list if parameter_list \
             else self._parameter_list
         params_grads = self.backward(
@@ -859,6 +853,10 @@ class SGDOptimizer(Optimizer):
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three cliping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.
@@ -896,12 +894,14 @@ class SGDOptimizer(Optimizer):
                  learning_rate,
                  parameter_list=None,
                  regularization=None,
+                 grad_clip=None,
                  name=None):
         assert learning_rate is not None
         super(SGDOptimizer, self).__init__(
             learning_rate=learning_rate,
             parameter_list=parameter_list,
             regularization=regularization,
+            grad_clip=grad_clip,
             name=name)
         self.type = "sgd"
@@ -962,6 +962,10 @@ class MomentumOptimizer(Optimizer):
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three cliping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.
@@ -1002,6 +1006,7 @@ class MomentumOptimizer(Optimizer):
                  parameter_list=None,
                  use_nesterov=False,
                  regularization=None,
+                 grad_clip=None,
                  name=None):
         assert learning_rate is not None
         assert momentum is not None
@@ -1009,6 +1014,7 @@ class MomentumOptimizer(Optimizer):
             learning_rate=learning_rate,
             parameter_list=parameter_list,
             regularization=regularization,
+            grad_clip=grad_clip,
             name=name)
         self.type = "momentum"
         self._momentum = momentum
@@ -1097,13 +1103,14 @@ class DGCMomentumOptimizer(Optimizer):
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        use_nesterov (bool): Enables Nesterov momentum. True means use Nesterov. Default is False.
-       local_grad_clip_norm (float, optional): Local gradient clip norm value. Optional, default is None, represent no need clip.
-       num_trainers (int, optional): The number of training nodes. Optional, default is None.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
            :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
+       grad_clip (GradientClipByNorm, optional): Gradient cliping strategy. ``DGCMomentumOptimizer`` only support
+           :ref:`api_fluid_clip_GradientClipByNorm` , and if not, it will raise TypeError. Default None,
+           meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.
@@ -1130,9 +1137,9 @@ class DGCMomentumOptimizer(Optimizer):
                  sparsity=[0.999],
                  parameter_list=None,
                  use_nesterov=False,
-                 local_grad_clip_norm=None,
                  num_trainers=None,
                  regularization=None,
+                 grad_clip=None,
                  name=None):
         if framework.in_dygraph_mode():
             raise Exception("In dygraph, don't support DGCMomentumOptimizer.")
@@ -1146,6 +1153,7 @@ class DGCMomentumOptimizer(Optimizer):
             learning_rate=learning_rate,
             parameter_list=parameter_list,
             regularization=regularization,
+            grad_clip=grad_clip,
             name=name)
         self.type = "dgc_momentum"
         self._momentum = momentum
@@ -1159,20 +1167,23 @@ class DGCMomentumOptimizer(Optimizer):
         self._rampup_begin_step_var = None
         self._global_step_var = None

-        self._local_grad_clip_norm = None
-        self._clip_norm = None
-        if local_grad_clip_norm is not None:
-            assert isinstance(num_trainers, int)
-            assert isinstance(local_grad_clip_norm, float)
-            assert num_trainers > 0
-            self._local_grad_clip_norm = local_grad_clip_norm
+        self._dgc_clip_norm = None
+        if grad_clip is not None:
+            if not isinstance(grad_clip, GradientClipByNorm):
+                raise TypeError(
+                    "The type of grad_clip should be 'GradientClipByNorm', because DGCMomentumOptimizer only support GradientClipByNorm"
+                )
+            assert isinstance(num_trainers, int), "The type of num_trainers should be 'int', but received %s" % type(value)
+            assert num_trainers > 0, "The value of num_trainers should be greater than 0!"
             self._num_trainers = num_trainers
-            self._clip_norm = local_grad_clip_norm * (num_trainers**-0.5)
+            self._dgc_clip_norm = grad_clip.clip_norm * (num_trainers**-0.5)

         self.regular_type, self.regular_coeff = self._get_regularization_param(
             self.regularization)
+        self._grad_clip = None

     def _get_regularization_param(self, regularization):
         regular_type = 0
@@ -1342,8 +1353,8 @@ class DGCMomentumOptimizer(Optimizer):
                 op._remove_attr(op_maker.kOpRoleVarAttrName())
             clip_var = grad_var
-            if self._local_grad_clip_norm is not None:
-                clip_var = self._append_clip_norm(grad_var, self._clip_norm)
+            if self._dgc_clip_norm is not None:
+                clip_var = self._append_clip_norm(grad_var, self._dgc_clip_norm)
             self._dgc_op(param_var, clip_var, grad_var, u_var, v_var, k_var,
                          encoded_var, gather_var)
@@ -1494,6 +1505,10 @@ class LarsMomentumOptimizer(Optimizer):
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three cliping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.
@@ -1526,6 +1541,7 @@ class LarsMomentumOptimizer(Optimizer):
                  lars_weight_decay=0.0005,
                  parameter_list=None,
                  regularization=None,
+                 grad_clip=None,
                  name=None):
         assert learning_rate is not None
         assert momentum is not None
@@ -1533,6 +1549,7 @@ class LarsMomentumOptimizer(Optimizer):
             learning_rate=learning_rate,
             parameter_list=parameter_list,
             regularization=regularization,
+            grad_clip=grad_clip,
             name=name)
         self.type = "lars_momentum"
         self._momentum = momentum
@@ -1607,6 +1624,10 @@ class AdagradOptimizer(Optimizer):
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three cliping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.
@@ -1639,6 +1660,7 @@ class AdagradOptimizer(Optimizer):
                  epsilon=1.0e-6,
                  parameter_list=None,
                  regularization=None,
+                 grad_clip=None,
                  name=None,
                  initial_accumulator_value=0.0):
         assert learning_rate is not None
@@ -1647,6 +1669,7 @@ class AdagradOptimizer(Optimizer):
             learning_rate=learning_rate,
             parameter_list=parameter_list,
             regularization=regularization,
+            grad_clip=grad_clip,
             name=name)
         self.type = "adagrad"
         self._epsilon = epsilon
@@ -1726,6 +1749,10 @@ class AdamOptimizer(Optimizer):
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three cliping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.
@@ -1835,6 +1862,7 @@ class AdamOptimizer(Optimizer):
                  epsilon=1e-8,
                  parameter_list=None,
                  regularization=None,
+                 grad_clip=None,
                  name=None,
                  lazy_mode=False):
         assert learning_rate is not None
@@ -1845,6 +1873,7 @@ class AdamOptimizer(Optimizer):
             learning_rate=learning_rate,
             parameter_list=parameter_list,
             regularization=regularization,
+            grad_clip=grad_clip,
             name=name)
         self.type = "adam"
         self._beta1 = beta1
@@ -1986,6 +2015,10 @@ class AdamaxOptimizer(Optimizer):
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three cliping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.
@@ -2031,6 +2064,7 @@ class AdamaxOptimizer(Optimizer):
                  epsilon=1e-8,
                  parameter_list=None,
                  regularization=None,
+                 grad_clip=None,
                  name=None):
         assert learning_rate is not None
         assert beta1 is not None
@@ -2040,6 +2074,7 @@ class AdamaxOptimizer(Optimizer):
             learning_rate=learning_rate,
             parameter_list=parameter_list,
             regularization=regularization,
+            grad_clip=grad_clip,
             name=name)
         self.type = "adamax"
         self._beta1 = beta1
@@ -2238,6 +2273,10 @@ class DecayedAdagradOptimizer(Optimizer):
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three cliping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.
@@ -2264,6 +2303,7 @@ class DecayedAdagradOptimizer(Optimizer):
                  epsilon=1.0e-6,
                  parameter_list=None,
                  regularization=None,
+                 grad_clip=None,
                  name=None):
         assert learning_rate is not None
         assert decay is not None
@@ -2273,6 +2313,7 @@ class DecayedAdagradOptimizer(Optimizer):
             learning_rate=learning_rate,
             parameter_list=parameter_list,
             regularization=regularization,
+            grad_clip=grad_clip,
             name=name)
         self.type = "decayed_adagrad"
         self._decay = decay
@@ -2337,6 +2378,10 @@ class AdadeltaOptimizer(Optimizer):
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three cliping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): The default value is None. Normally there is no need for user
                to set this property. For more information, please refer to
                :ref:`api_guide_Name` .
@@ -2367,6 +2412,7 @@ class AdadeltaOptimizer(Optimizer):
                  rho=0.95,
                  parameter_list=None,
                  regularization=None,
+                 grad_clip=None,
                  name=None):
         if learning_rate is None:
             raise ValueError("learning_rate is not set.")
@@ -2378,6 +2424,7 @@ class AdadeltaOptimizer(Optimizer):
             learning_rate=learning_rate,
             parameter_list=parameter_list,
             regularization=regularization,
+            grad_clip=grad_clip,
             name=name)
         self.type = "adadelta"
         self._epsilon = epsilon
@@ -2488,6 +2535,10 @@ class RMSPropOptimizer(Optimizer):
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three cliping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.
@@ -2536,11 +2587,13 @@ class RMSPropOptimizer(Optimizer):
                  centered=False,
                  parameter_list=None,
                  regularization=None,
+                 grad_clip=None,
                  name=None):
         super(RMSPropOptimizer, self).__init__(
             learning_rate=learning_rate,
             parameter_list=parameter_list,
             regularization=regularization,
+            grad_clip=grad_clip,
             name=name)
         if learning_rate is None:
             raise ValueError("learning_rate is not set.")
@@ -2656,6 +2709,10 @@ class FtrlOptimizer(Optimizer):
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three cliping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.
@@ -2704,11 +2761,13 @@ class FtrlOptimizer(Optimizer):
                  lr_power=-0.5,
                  parameter_list=None,
                  regularization=None,
+                 grad_clip=None,
                  name=None):
         super(FtrlOptimizer, self).__init__(
             learning_rate=learning_rate,
             parameter_list=parameter_list,
             regularization=regularization,
+            grad_clip=grad_clip,
             name=name)
         if learning_rate is None:
             raise ValueError("learning_rate is not set.")
@@ -2798,6 +2857,10 @@ class LambOptimizer(AdamOptimizer):
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three cliping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        exclude_from_weight_decay_fn (function|None): Exclude a parameter from weight
            decay when **exclude_from_weight_decay_fn(parameter)** returns true.
            Default None.
@@ -2834,6 +2897,7 @@ class LambOptimizer(AdamOptimizer):
                  epsilon=1e-6,
                  parameter_list=None,
                  regularization=None,
+                 grad_clip=None,
                  exclude_from_weight_decay_fn=None,
                  name=None):
         assert learning_rate is not None
@@ -2845,6 +2909,7 @@ class LambOptimizer(AdamOptimizer):
             learning_rate=learning_rate,
             parameter_list=parameter_list,
             regularization=regularization,
+            grad_clip=grad_clip,
             beta1=beta1,
             beta2=beta2,
             epsilon=epsilon,
@@ -4043,20 +4108,13 @@ class RecomputeOptimizer(Optimizer):
                  loss,
                  startup_program=None,
                  parameter_list=None,
-                 no_grad_set=None,
-                 grad_clip=None):
+                 no_grad_set=None):
         assert isinstance(loss, Variable), "The loss should be an Variable."
         assert (self._checkpoints is not None
                 ), "You should call _set_checkpoints first"
         if framework.in_dygraph_mode():
             raise NotImplementedError(
                 "DyGraph current does not support recompute")
-        if grad_clip is not None:
-            if not isinstance(grad_clip, GradientClipBase):
-                raise TypeError(
-                    "'grad_clip' should be an instance of GradientClipBase's derived class"
-                )
-            self._optimizer._grad_clip = grad_clip
         params_grads = self.backward(
             loss,
             startup_program=startup_program,
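Two behaviours introduced above are worth calling out: Optimizer.__init__ now validates grad_clip up front, and DGCMomentumOptimizer only accepts GradientClipByNorm, scaling its clip_norm by num_trainers**-0.5. A small sketch of the validation path follows; it is illustrative only and simply exercises the TypeError branch shown in the diff:

    import paddle.fluid as fluid

    # Any value that is not a GradientClipBase instance is rejected at
    # construction time rather than at minimize() time.
    try:
        fluid.optimizer.SGD(learning_rate=0.1, grad_clip="not a clip object")
    except TypeError as e:
        print(e)  # 'grad_clip' should be an instance of GradientClipBase's derived class

    # DGCMomentumOptimizer additionally requires GradientClipByNorm; passing
    # e.g. GradientClipByGlobalNorm raises TypeError (see test_dgc_optimizer.py below).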
python/paddle/fluid/param_attr.py

@@ -34,7 +34,7 @@ class ParamAttr(object):
     Note:
         ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
-        It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient.
+        It is recommended to set ``grad_clip`` in ``optimizer`` to clip gradient.
         There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
         :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
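For code that previously relied on ParamAttr(gradient_clip=...), a migration path consistent with the note above is sketched here; the need_clip filter mentioned in the clip.py docstrings can restore per-parameter behaviour. This sketch is illustrative and is not code from the commit:

    import paddle.fluid as fluid

    # Deprecated per the note above: per-parameter clipping through ParamAttr.
    #   w_attr = fluid.ParamAttr(gradient_clip=fluid.clip.GradientClipByValue(1.0))

    # Recommended: a single clip strategy configured on the optimizer.
    clip = fluid.clip.GradientClipByValue(min=-1.0, max=1.0)
    sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip)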
python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
@@ -19,6 +19,7 @@ import unittest
 import paddle.fluid.framework as framework
 import paddle.fluid.optimizer as optimizer
 import paddle.fluid.regularizer as regularizer
+import paddle.fluid.clip as clip
 import paddle.compat as cpt
 from paddle.fluid.backward import append_backward
 from paddle.fluid.transpiler.details import program_to_code
...
@@ -70,9 +71,9 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
             learning_rate=learning_rate,
             momentum=0.2,
             rampup_begin_step=0,
-            local_grad_clip_norm=1.0,
             num_trainers=2,
-            regularization=regularization)
+            regularization=regularization,
+            grad_clip=clip.GradientClipByNorm(1.0))
         if use_recompute:
             dgc_momentum_optimizer = optimizer.RecomputeOptimizer(
...
@@ -124,6 +125,16 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
         #with open("test_dgc_optimizer_" + name + str(use_recompute) + ".log", "w") as f:
         #    program_to_code(program, fout=f)
 
+    def test_tpyeError(self):
+        # the type of DGCMomentumOptimizer(grad_clip=) must be 'GradientClipByNorm'
+        with self.assertRaises(TypeError):
+            dgc_momentum_optimizer = self.MockDGCMomentum(
+                learning_rate=0.01,
+                momentum=0.2,
+                rampup_begin_step=0,
+                num_trainers=2,
+                grad_clip=clip.GradientClipByGlobalNorm(1.0))
+
     def test_momentum_without_dgc(self):
         self.check_dgc_momentum_optimizer(
             regularization=regularizer.L1Decay(1e-4))
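Note: after this change ``DGCMomentumOptimizer`` takes ``grad_clip`` at construction and, judging from the new test, accepts only ``GradientClipByNorm``; other strategies are expected to raise ``TypeError``. A sketch of the intended call (illustrative values; DGC itself typically needs a DGC-enabled GPU build to actually run):

```python
import paddle.fluid as fluid
import paddle.fluid.clip as clip

x = fluid.data(name="x", shape=[-1, 16], dtype="float32")
loss = fluid.layers.reduce_mean(fluid.layers.fc(input=x, size=4))

# DGC only supports clip-by-norm, so grad_clip must be a GradientClipByNorm;
# per the new test, e.g. GradientClipByGlobalNorm is rejected with TypeError.
dgc = fluid.optimizer.DGCMomentumOptimizer(
    learning_rate=0.01,
    momentum=0.2,
    rampup_begin_step=0,
    num_trainers=2,
    grad_clip=clip.GradientClipByNorm(clip_norm=1.0))
dgc.minimize(loss)
```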
python/paddle/fluid/tests/unittests/test_gradient_clip.py
@@ -76,8 +76,8 @@ class TestGradientClip(unittest.TestCase):
         startup_program = fluid.Program()
         with fluid.program_guard(
                 main_program=prog, startup_program=startup_program):
-            image = fluid.data(name='x', shape=[-1, 784], dtype='float32')
-            label = fluid.data(name='y', shape=[-1, 1], dtype='int64')
+            image = fluid.data(name="a", shape=[-1, 784], dtype='float32')
+            label = fluid.data(name="b", shape=[-1, 1], dtype='int64')
             hidden = fluid.layers.fc(input=image, size=32, act='relu')
             predict = fluid.layers.fc(input=hidden, size=10, act='softmax')
...
@@ -112,13 +112,13 @@ class TestGradientClip(unittest.TestCase):
         self.check_clip_result(out, out_clip)
 
     def check_sparse_gradient_clip(self, place):
-        prog = fluid.framework.Program()
-        startup_program = fluid.framework.Program()
+        prog = fluid.Program()
+        startup_program = fluid.Program()
         with fluid.program_guard(
                 main_program=prog, startup_program=startup_program):
-            data = fluid.layers.data(
-                name="words", shape=[1], dtype="int64", lod_level=1)
-            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+            data = fluid.data(
+                name="words", shape=[-1, 1], dtype="int64", lod_level=1)
+            label = fluid.data(name="label", shape=[-1, 1], dtype="int64")
             cost = bow_net(data, label, self.word_dict_len)
 
             self.backward_and_optimize(cost)
...
@@ -172,7 +172,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
             self.clip_gradient = func
             self.check_gradient_clip(fluid.CPUPlace())
 
-    # test whether the ouput is right when use 'minimize(grad_clip)'
+    # test whether the ouput is right when use grad_clip
     def test_new_gradient_clip(self):
         def func(params_grads):
             clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
...
@@ -192,9 +192,10 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
         clip = fluid.clip.GradientClipByGlobalNorm(
             clip_norm=5.0, need_clip=fileter_func)
         fluid.clip.set_gradient_clip(clip)
-        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
-        # if 'set_gradient_clip' and 'minimize(grad_clip)' together, 'set_gradient_clip' will be ineffective
-        sgd_optimizer.minimize(cost, grad_clip=clip)
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01,
+                                            grad_clip=clip)
+        # if 'set_gradient_clip' and 'optimize(grad_clip)' together, 'set_gradient_clip' will be ineffective
+        sgd_optimizer.minimize(cost)
         # 'set_gradient_clip' must before 'minimize', otherwise, 'set_gradient_clip' will be ineffective
         fluid.clip.set_gradient_clip(clip)
...
@@ -232,24 +233,10 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
         clip = fluid.clip.GradientClipByGlobalNorm(
             clip_norm=self.clip_norm, need_clip="test")
-        # the type of minimize(grad_clip=) must be an instance of GradientClipBase's derived class
-        with self.assertRaises(TypeError):
-            x = fluid.default_main_program().global_block().create_parameter(
-                name="x", shape=[2, 3], dtype="float32")
-            loss = fluid.layers.reduce_mean(x)
-            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
-            sgd_optimizer.minimize(loss, grad_clip="test")
-        # the type of RecomputeOptimizer.minimize(grad_clip=) must be an instance of GradientClipBase's derived class
-        with self.assertRaises(TypeError):
-            x = fluid.default_main_program().global_block().create_parameter(
-                name="x", shape=[2, 3], dtype="float32")
-            loss = fluid.layers.reduce_mean(x)
-            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
-            recompute_optimizer = fluid.optimizer.RecomputeOptimizer(sgd_optimizer)
-            recompute_optimizer._set_checkpoints([x])
-            recompute_optimizer.minimize(loss, grad_clip="test")
+        # the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class
+        with self.assertRaises(TypeError):
+            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1,
+                                                grad_clip="test")
 
 
 class TestGradientClipByNorm(TestGradientClip):
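Note: the static-graph tests above boil down to two behaviours: ``grad_clip`` is passed to the optimizer constructor, and a value that is not a ``GradientClipBase`` subclass fails at construction time. A small sketch reproducing both (toy network, illustrative names):

```python
import paddle.fluid as fluid

x = fluid.data(name="x", shape=[-1, 4], dtype="float32")
loss = fluid.layers.reduce_mean(fluid.layers.fc(input=x, size=1))

# Passing a clip object at construction time is the supported path ...
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
fluid.optimizer.SGD(learning_rate=0.1, grad_clip=clip).minimize(loss)

# ... while anything that is not a GradientClipBase subclass should fail fast.
try:
    fluid.optimizer.SGD(learning_rate=0.1, grad_clip="not-a-clip")
except TypeError as e:
    print("rejected:", e)
```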
@@ -271,7 +258,7 @@ class TestGradientClipByNorm(TestGradientClip):
                 a=u, b=v, rtol=1e-5, atol=1e-8),
             "gradient clip by norm has wrong results!")
 
-    # test whether the ouput is right when use 'minimize(grad_clip)'
+    # test whether the ouput is right when use grad_clip
     def test_gradient_clip(self):
         self.check_gradient_clip(fluid.CPUPlace())
...
@@ -319,7 +306,7 @@ class TestGradientClipByValue(TestGradientClip):
                 a=u, b=v, rtol=1e-6, atol=1e-8),
             "gradient clip by value has wrong results!")
 
-    # test whether the ouput is right when use 'minimize(grad_clip)'
+    # test whether the ouput is right when use grad_clip
     def test_gradient_clip(self):
         self.check_gradient_clip(fluid.CPUPlace())
...
@@ -357,7 +344,9 @@ class TestDygraphGradientClip(unittest.TestCase):
             loss = fluid.layers.reduce_mean(out)
             loss.backward()
-            sgd_optimizer = fluid.optimizer.SGD(
-                learning_rate=0.0, parameter_list=linear.parameters())
+            sgd_optimizer = fluid.optimizer.SGD(
+                learning_rate=0.0,
+                parameter_list=linear.parameters(),
+                grad_clip=fluid.clip.GradientClipByGlobalNorm(0.1))
             self.check_clip_result(loss, sgd_optimizer)
 
     def check_clip_result(self, loss, optimizer):
...
@@ -384,7 +373,7 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
             np.array([3, 4]).astype("float32"), name="y")
         assert len(self.clip1([(x, x), (x, y), (x, None)])) == 2
         # get params and grads from network
-        opt, params_grads = optimizer.minimize(loss, grad_clip=self.clip2)
+        opt, params_grads = optimizer.minimize(loss)
         _, grads = zip(*params_grads)
         params_grads = self.clip2(params_grads)
         _, grads_clip = zip(*params_grads)
...
@@ -426,7 +415,7 @@ class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
         assert len(self.clip([(x, None)])) == 0
         # get params and grads from network
         self.clip([(fluid.dygraph.to_variable(np.array([2, 3])), None)])
-        params_grads = optimizer.backward(loss)
+        opt, params_grads = optimizer.minimize(loss)
         _, grads = zip(*params_grads)
         params_grads = self.clip(params_grads)
         _, grads_clip = zip(*params_grads)
...
@@ -460,7 +449,7 @@ class TestDygraphGradientClipByValue(TestDygraphGradientClip):
         x = fluid.dygraph.to_variable(np.array([2, 3]).astype("float32"))
         assert len(self.clip([(x, None)])) == 0
         # get params and grads from network
-        params_grads = optimizer.backward(loss)
+        opt, params_grads = optimizer.minimize(loss)
         _, grads = zip(*params_grads)
         params_grads = self.clip(params_grads)
         _, grads_clip = zip(*params_grads)
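Note: the dygraph tests follow the same pattern: the clip object is attached to the optimizer once, and ``minimize`` is then called with only the loss. A hedged dygraph sketch (toy layer, shapes and values are illustrative):

```python
import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    linear = fluid.dygraph.Linear(5, 5)
    inputs = fluid.dygraph.to_variable(
        np.random.uniform(-1, 1, [16, 5]).astype("float32"))
    loss = fluid.layers.reduce_mean(linear(inputs))
    loss.backward()

    # The clip object is configured on the optimizer once, up front.
    sgd = fluid.optimizer.SGD(
        learning_rate=0.01,
        parameter_list=linear.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=0.1))
    sgd.minimize(loss)
```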
python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
@@ -329,9 +329,9 @@ class TestImperativeAutoPrune(unittest.TestCase):
         place = fluid.CPUPlace()
         with fluid.dygraph.guard(place):
             model = MyLayer(size, vocab_size, size)
-            optimizer = fluid.optimizer.AdamOptimizer(
-                0.001, parameter_list=model.parameters())
             grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
+            optimizer = fluid.optimizer.AdamOptimizer(
+                0.001, parameter_list=model.parameters(), grad_clip=grad_clip)
             indices = fluid.dygraph.to_variable(indices)
             embed = fluid.dygraph.to_variable(embed)
...
@@ -339,7 +339,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
             loss = model.embed_linear0(indices)
             loss.backward()
-            _, params_grads = optimizer.minimize(loss, grad_clip=grad_clip)
+            _, params_grads = optimizer.minimize(loss)
             for items in params_grads:
                 assert items[0].name is not model.embed1.weight.name
                 assert items[0].name is not model.linear_1.weight.name
...
@@ -348,9 +348,9 @@ class TestImperativeAutoPrune(unittest.TestCase):
         with fluid.dygraph.guard(place):
             model = MyLayer2(size, vocab_size, size)
-            optimizer = fluid.optimizer.AdamOptimizer(
-                0.001, parameter_list=model.parameters())
             grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
+            optimizer = fluid.optimizer.AdamOptimizer(
+                0.001, parameter_list=model.parameters(), grad_clip=grad_clip)
             indices = fluid.dygraph.to_variable(indices)
             emebd = fluid.dygraph.to_variable(embed)
...
@@ -358,7 +358,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
             loss = model.embed_linear0(indices)
             loss.backward()
-            optimizer.minimize(loss, grad_clip=grad_clip)
+            optimizer.minimize(loss)
             for items in params_grads:
                 assert items[0].name is not model.embed1.weight.name
                 assert items[0].name is not model.linear_1.weight.name
浏览文件 @
2bca3295
...
@@ -58,14 +58,15 @@ class TestSimpleNet(unittest.TestCase):
...
@@ -58,14 +58,15 @@ class TestSimpleNet(unittest.TestCase):
simplenet
=
SimpleNet
(
20
,
32
,
dtype
)
simplenet
=
SimpleNet
(
20
,
32
,
dtype
)
adam
=
SGDOptimizer
(
adam
=
SGDOptimizer
(
learning_rate
=
0.001
,
learning_rate
=
0.001
,
parameter_list
=
simplenet
.
parameters
())
parameter_list
=
simplenet
.
parameters
(
))
# grad_clip=grad_clip
input_emb
,
emb
=
simplenet
(
input
)
input_emb
,
emb
=
simplenet
(
input
)
self
.
assertTrue
(
emb
.
weight
.
gradient
()
is
None
)
self
.
assertTrue
(
emb
.
weight
.
gradient
()
is
None
)
self
.
assertTrue
(
input_emb
.
gradient
()
is
None
)
self
.
assertTrue
(
input_emb
.
gradient
()
is
None
)
input_emb
.
backward
(
backward_strategy
)
input_emb
.
backward
(
backward_strategy
)
adam
.
minimize
(
input_emb
)
# grad_clip=grad_clip
adam
.
minimize
(
input_emb
)
self
.
assertTrue
(
emb
.
weight
.
gradient
()
is
not
None
)
self
.
assertTrue
(
emb
.
weight
.
gradient
()
is
not
None
)
emb
.
clear_gradients
()
emb
.
clear_gradients
()
...
@@ -92,14 +93,15 @@ class TestSimpleNet(unittest.TestCase):
...
@@ -92,14 +93,15 @@ class TestSimpleNet(unittest.TestCase):
simplenet
=
SimpleNet
(
20
,
32
,
"float32"
)
simplenet
=
SimpleNet
(
20
,
32
,
"float32"
)
adam
=
SGDOptimizer
(
adam
=
SGDOptimizer
(
learning_rate
=
0.001
,
learning_rate
=
0.001
,
parameter_list
=
simplenet
.
parameters
())
parameter_list
=
simplenet
.
parameters
(),
grad_clip
=
grad_clip
)
input_emb
,
emb
=
simplenet
(
input
)
input_emb
,
emb
=
simplenet
(
input
)
self
.
assertTrue
(
emb
.
weight
.
gradient
()
is
None
)
self
.
assertTrue
(
emb
.
weight
.
gradient
()
is
None
)
self
.
assertTrue
(
input_emb
.
gradient
()
is
None
)
self
.
assertTrue
(
input_emb
.
gradient
()
is
None
)
input_emb
.
backward
(
backward_strategy
)
input_emb
.
backward
(
backward_strategy
)
adam
.
minimize
(
input_emb
,
grad_clip
=
grad_clip
)
adam
.
minimize
(
input_emb
)
self
.
assertTrue
(
emb
.
weight
.
gradient
()
is
not
None
)
self
.
assertTrue
(
emb
.
weight
.
gradient
()
is
not
None
)
emb
.
clear_gradients
()
emb
.
clear_gradients
()
...
...
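Note: the selected-rows test covers sparse (``SelectedRows``) gradients; clipping them is also configured through the optimizer now. A hedged sketch with a stand-in sparse embedding (layer, shapes and clip value are illustrative and depend on this Paddle version's dygraph ``Embedding`` API):

```python
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.optimizer import SGDOptimizer

with fluid.dygraph.guard():
    # A sparse embedding produces SelectedRows gradients.
    embedding = fluid.dygraph.Embedding(size=[20, 32], is_sparse=True)
    ids = fluid.dygraph.to_variable(np.array([[1], [2], [3]]).astype("int64"))
    loss = fluid.layers.reduce_mean(embedding(ids))
    loss.backward()

    # Clipping of the sparse gradient is configured on the optimizer.
    adam = SGDOptimizer(
        learning_rate=0.001,
        parameter_list=embedding.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(5.0))
    adam.minimize(loss)
```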