Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
66dc8e30
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
66dc8e30
编写于
4月 17, 2020
作者:
Z
Zhou Wei
提交者:
GitHub
4月 17, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
move the initialize position of grad_clip to optimizer(__init__),and speed up clip (#23782)
上级
361c6ccc
变更
7
展开全部
隐藏空白更改
内联
并排
Showing
7 changed file
with
198 addition
and
127 deletion
+198
-127
python/paddle/fluid/clip.py
python/paddle/fluid/clip.py
+52
-41
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+97
-39
python/paddle/fluid/param_attr.py
python/paddle/fluid/param_attr.py
+1
-1
python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
+13
-2
python/paddle/fluid/tests/unittests/test_gradient_clip.py
python/paddle/fluid/tests/unittests/test_gradient_clip.py
+23
-34
python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
...addle/fluid/tests/unittests/test_imperative_auto_prune.py
+6
-6
python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
...le/fluid/tests/unittests/test_imperative_selected_rows.py
+6
-4
未找到文件。
python/paddle/fluid/clip.py
浏览文件 @
66dc8e30
...
...
@@ -137,9 +137,6 @@ class GradientClipBase(object):
raise
NotImplementedError
def
__call__
(
self
,
params_grads
):
assert
len
(
params_grads
)
>
0
,
"The number of trainable parameters should be greater than 0."
if
framework
.
in_dygraph_mode
():
return
self
.
_dygraph_clip
(
params_grads
)
else
:
...
...
@@ -147,7 +144,7 @@ class GradientClipBase(object):
if
getattr
(
p
,
'gradient_clip_attr'
,
None
)
is
not
None
:
warnings
.
warn
(
"'set_gradient_clip' will be ineffective, because you have "
"
pass 'grad_clip' into 'minimize
'. So, 'set_gradient_clip' "
"
set 'grad_clip' in 'optimizer
'. So, 'set_gradient_clip' "
"is redundant and you can remove it."
)
break
return
self
.
_static_clip
(
params_grads
)
...
...
@@ -170,7 +167,7 @@ class GradientClipByValue(GradientClipBase):
The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
is not None, then only part of gradients can be selected for gradient clipping.
Gradient clip will takes effect after being set in ``optimizer
.minimize(grad_clip)
`` , see the document ``optimizer``
Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
(for example: :ref:`api_fluid_optimizer_SGDOptimizer`).
Args:
...
...
@@ -208,8 +205,8 @@ class GradientClipByValue(GradientClipBase):
# return Parameter.name=="fc_0.w_0"
# clip = fluid.clip.GradientClipByValue(min=-1, max=1, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1)
sgd_optimizer.minimize(loss
, grad_clip=clip
)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1
, grad_clip=clip
)
sgd_optimizer.minimize(loss)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
...
...
@@ -242,8 +239,8 @@ class GradientClipByValue(GradientClipBase):
# clip = fluid.clip.GradientClipByValue(min=-1, max=1, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=0.1, parameter_list=linear.parameters())
sgd_optimizer.minimize(loss
, grad_clip=clip
)
learning_rate=0.1, parameter_list=linear.parameters()
, grad_clip=clip
)
sgd_optimizer.minimize(loss)
"""
def
__init__
(
self
,
max
,
min
=
None
,
need_clip
=
None
):
...
...
@@ -272,6 +269,7 @@ class GradientClipByValue(GradientClipBase):
def
_static_clip
(
self
,
params_grads
):
params_and_grads
=
[]
param_new_grad_name_dict
=
dict
()
with
framework
.
name_scope
(
'gradient_clip'
):
for
p
,
g
in
params_grads
:
if
g
is
None
:
...
...
@@ -284,7 +282,8 @@ class GradientClipByValue(GradientClipBase):
with
p
.
block
.
program
.
_optimized_guard
([
p
,
g
]):
new_grad
=
layers
.
clip
(
x
=
g
,
min
=
self
.
min
,
max
=
self
.
max
)
params_and_grads
.
append
((
p
,
new_grad
))
_correct_clip_op_role_var
(
params_and_grads
)
param_new_grad_name_dict
[
p
.
name
]
=
new_grad
.
name
_correct_clip_op_role_var
(
params_and_grads
,
param_new_grad_name_dict
)
return
params_and_grads
def
_process_context
(
self
,
context
,
param
,
grad
):
...
...
@@ -306,7 +305,7 @@ class GradientClipByNorm(GradientClipBase):
The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
is not None, then only part of gradients can be selected for gradient clipping.
Gradient clip will takes effect after being set in ``optimizer
.minimize(grad_clip)
`` , see the document ``optimizer``
Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
(for example: :ref:`api_fluid_optimizer_SGDOptimizer`).
The clipping formula is:
...
...
@@ -359,8 +358,8 @@ class GradientClipByNorm(GradientClipBase):
# return Parameter.name=="fc_0.w_0"
# clip = fluid.clip.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1)
sgd_optimizer.minimize(loss
, grad_clip=clip
)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1
, grad_clip=clip
)
sgd_optimizer.minimize(loss)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
...
...
@@ -394,8 +393,8 @@ class GradientClipByNorm(GradientClipBase):
# clip = fluid.clip.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=0.1, parameter_list=linear.parameters())
sgd_optimizer.minimize(loss
, grad_clip=clip
)
learning_rate=0.1, parameter_list=linear.parameters()
, grad_clip=clip
)
sgd_optimizer.minimize(loss)
"""
...
...
@@ -422,6 +421,7 @@ class GradientClipByNorm(GradientClipBase):
def
_static_clip
(
self
,
params_grads
):
params_and_grads
=
[]
with
framework
.
name_scope
(
'gradient_clip'
):
param_new_grad_name_dict
=
dict
()
for
p
,
g
in
params_grads
:
if
g
is
None
:
continue
...
...
@@ -432,8 +432,9 @@ class GradientClipByNorm(GradientClipBase):
with
p
.
block
.
program
.
_optimized_guard
([
p
,
g
]):
new_grad
=
layers
.
clip_by_norm
(
x
=
g
,
max_norm
=
self
.
clip_norm
)
param_new_grad_name_dict
[
p
.
name
]
=
new_grad
.
name
params_and_grads
.
append
((
p
,
new_grad
))
_correct_clip_op_role_var
(
params_and_grads
)
_correct_clip_op_role_var
(
params_and_grads
,
param_new_grad_name_dict
)
return
params_and_grads
def
_process_context
(
self
,
context
,
param
,
grad
):
...
...
@@ -456,7 +457,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
is not None, then only part of gradients can be selected for gradient clipping.
Gradient clip will takes effect after being set in ``optimizer
.minimize(grad_clip)
`` , see the document ``optimizer``
Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
(for example: :ref:`api_fluid_optimizer_SGDOptimizer`).
The clipping formula is:
...
...
@@ -505,8 +506,8 @@ class GradientClipByGlobalNorm(GradientClipBase):
# return Parameter.name=="fc_0.w_0"
# clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1)
sgd_optimizer.minimize(loss
, grad_clip=clip
)
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1
, grad_clip=clip
)
sgd_optimizer.minimize(loss)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
...
...
@@ -539,8 +540,8 @@ class GradientClipByGlobalNorm(GradientClipBase):
# clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=0.1, parameter_list=linear.parameters())
sgd_optimizer.minimize(loss
, grad_clip=clip
)
learning_rate=0.1, parameter_list=linear.parameters()
, grad_clip=clip
)
sgd_optimizer.minimize(loss)
"""
...
...
@@ -628,6 +629,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
y
=
layers
.
elementwise_max
(
x
=
max_global_norm
,
y
=
global_norm_var
))
param_new_grad_name_dict
=
dict
()
for
p
,
g
in
params_grads
:
if
g
is
None
:
continue
...
...
@@ -638,9 +640,10 @@ class GradientClipByGlobalNorm(GradientClipBase):
with
p
.
block
.
program
.
_optimized_guard
([
p
,
g
]):
new_grad
=
layers
.
elementwise_mul
(
x
=
g
,
y
=
scale_var
)
param_new_grad_name_dict
[
p
.
name
]
=
new_grad
.
name
params_and_grads
.
append
((
p
,
new_grad
))
_correct_clip_op_role_var
(
params_and_grads
)
_correct_clip_op_role_var
(
params_and_grads
,
param_new_grad_name_dict
)
return
params_and_grads
def
_process_context
(
self
,
context
,
param
,
grad
):
...
...
@@ -692,9 +695,10 @@ def set_gradient_clip(clip, param_list=None, program=None):
This API must be used after building network, and before ``minimize`` ,
and it may be removed in future releases, so it is not recommended.
It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient.
There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
:ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
this is a better method to clip gradient. There are three clipping strategies:
:ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` .
To specify parameters that require gradient clip.
...
...
@@ -757,7 +761,7 @@ def set_gradient_clip(clip, param_list=None, program=None):
sgd = fluid.optimizer.SGD(learning_rate=1e-3)
sgd.minimize(loss)
# network 4: use 'set_gradient_clip' and '
min
imize(grad_clip=clip)' together
# network 4: use 'set_gradient_clip' and '
opt
imize(grad_clip=clip)' together
with fluid.program_guard(fluid.Program(), fluid.Program()):
loss = network()
clip1 = fluid.clip.GradientClipByValue(min=-1.0, max=1.0)
...
...
@@ -765,8 +769,8 @@ def set_gradient_clip(clip, param_list=None, program=None):
# Set the gradient clipping strategy: clip1
fluid.clip.set_gradient_clip(clip1)
# Set the gradient clipping strategy: clip2
sgd = fluid.optimizer.SGD(learning_rate=1e-3)
sgd.minimize(loss
, grad_clip=clip2
)
sgd = fluid.optimizer.SGD(learning_rate=1e-3
, grad_clip=clip2
)
sgd.minimize(loss)
# 'set_gradient_clip' will not take effect when setting has a conflict,
# and the gradient clipping strategy will be 'clip2'
...
...
@@ -774,10 +778,10 @@ def set_gradient_clip(clip, param_list=None, program=None):
"""
warnings
.
warn
(
"Caution! 'set_gradient_clip' is not recommended "
"and may be deprecated in future! "
"We recommend a new strategy:
clip gradient by
"
"
'optimizer.minimize(loss, grad_clip=clip)
'. "
"We recommend a new strategy:
set 'grad_clip'
"
"
when initializing the 'optimizer
'. "
"This method can reduce the mistakes, please "
"
see documention of 'optimzier.minimize
'."
)
"
refer to documention of 'optimizer
'."
)
if
not
isinstance
(
clip
,
GradientClipBase
):
raise
TypeError
(
...
...
@@ -824,33 +828,40 @@ def append_gradient_clip_ops(param_grads):
clip_attr
.
_process_context
(
context
=
context
,
param
=
p
,
grad
=
g
)
res
=
[]
param_new_grad_name_dict
=
dict
()
for
p
,
g
in
param_grads
:
if
g
is
None
:
continue
with
p
.
block
.
program
.
_optimized_guard
(
[
p
,
g
]),
framework
.
name_scope
(
'graident_clip_@CLIP'
):
param
,
new_grad
=
clip_attr
.
_create_operators
(
param
=
p
,
grad
=
g
)
param_new_grad_name_dict
[
param
.
name
]
=
new_grad
.
name
res
.
append
([
param
,
new_grad
])
_correct_clip_op_role_var
(
res
)
_correct_clip_op_role_var
(
res
,
param_new_grad_name_dict
)
return
res
# change wrong mapping relation between param & grad in clip op
def
_correct_clip_op_role_var
(
params_grads
):
def
_correct_clip_op_role_var
(
params_grads
,
param_new_grad_name_dict
):
block_id_list
=
[]
if
len
(
param_new_grad_name_dict
)
==
0
:
return
for
param
,
grad
in
params_grads
:
if
grad
is
None
:
continue
block_id
=
param
.
block
.
idx
if
block_id
in
block_id_list
:
continue
block_id_list
.
append
(
block_id
)
for
op
in
param
.
block
.
program
.
global_block
().
ops
:
if
'op_namescope'
in
op
.
all_attrs
()
and
"gradient_clip"
in
op
.
attr
(
"op_namescope"
):
if
op
.
attr
(
'op_role_var'
):
param_name
=
op
.
attr
(
'op_role_var'
)[
0
]
index
=
0
for
i
in
range
(
len
(
params_grads
)):
if
params_grads
[
i
][
0
].
name
==
param_name
:
index
=
i
correct_p_g
=
[
param_name
,
params_grads
[
index
][
1
].
name
]
"op_namescope"
)
and
op
.
attr
(
'op_role_var'
):
param_name
=
op
.
attr
(
'op_role_var'
)[
0
]
if
param_name
in
param_new_grad_name_dict
:
correct_p_g
=
[
param_name
,
param_new_grad_name_dict
[
param_name
]
]
op
.
_set_attr
(
'op_role_var'
,
correct_p_g
)
...
...
python/paddle/fluid/optimizer.py
浏览文件 @
66dc8e30
此差异已折叠。
点击以展开。
python/paddle/fluid/param_attr.py
浏览文件 @
66dc8e30
...
...
@@ -36,7 +36,7 @@ class ParamAttr(object):
Note:
``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
It is recommended to
use ``minimize(loss, grad_clip=clip)
`` to clip gradient.
It is recommended to
set ``grad_clip`` in ``optimizer
`` to clip gradient.
There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
:ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
...
...
python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
浏览文件 @
66dc8e30
...
...
@@ -19,6 +19,7 @@ import unittest
import
paddle.fluid.framework
as
framework
import
paddle.fluid.optimizer
as
optimizer
import
paddle.fluid.regularizer
as
regularizer
import
paddle.fluid.clip
as
clip
import
paddle.compat
as
cpt
from
paddle.fluid.backward
import
append_backward
from
paddle.fluid.transpiler.details
import
program_to_code
...
...
@@ -70,9 +71,9 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
learning_rate
=
learning_rate
,
momentum
=
0.2
,
rampup_begin_step
=
0
,
local_grad_clip_norm
=
1.0
,
num_trainers
=
2
,
regularization
=
regularization
)
regularization
=
regularization
,
grad_clip
=
clip
.
GradientClipByNorm
(
1.0
))
if
use_recompute
:
dgc_momentum_optimizer
=
optimizer
.
RecomputeOptimizer
(
...
...
@@ -124,6 +125,16 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
#with open("test_dgc_optimizer_" + name + str(use_recompute) + ".log", "w") as f:
# program_to_code(program, fout=f)
def
test_tpyeError
(
self
):
# the type of DGCMomentumOptimizer(grad_clip=) must be 'GradientClipByNorm'
with
self
.
assertRaises
(
TypeError
):
dgc_momentum_optimizer
=
self
.
MockDGCMomentum
(
learning_rate
=
0.01
,
momentum
=
0.2
,
rampup_begin_step
=
0
,
num_trainers
=
2
,
grad_clip
=
clip
.
GradientClipByGlobalNorm
(
1.0
))
def
test_momentum_without_dgc
(
self
):
self
.
check_dgc_momentum_optimizer
(
regularization
=
regularizer
.
L1Decay
(
1e-4
))
...
...
python/paddle/fluid/tests/unittests/test_gradient_clip.py
浏览文件 @
66dc8e30
...
...
@@ -76,8 +76,8 @@ class TestGradientClip(unittest.TestCase):
startup_program
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main_program
=
prog
,
startup_program
=
startup_program
):
image
=
fluid
.
data
(
name
=
'x'
,
shape
=
[
-
1
,
784
],
dtype
=
'float32'
)
label
=
fluid
.
data
(
name
=
'y'
,
shape
=
[
-
1
,
1
],
dtype
=
'int64'
)
image
=
fluid
.
data
(
name
=
"a"
,
shape
=
[
-
1
,
784
],
dtype
=
'float32'
)
label
=
fluid
.
data
(
name
=
"b"
,
shape
=
[
-
1
,
1
],
dtype
=
'int64'
)
hidden
=
fluid
.
layers
.
fc
(
input
=
image
,
size
=
32
,
act
=
'relu'
)
predict
=
fluid
.
layers
.
fc
(
input
=
hidden
,
size
=
10
,
act
=
'softmax'
)
...
...
@@ -112,13 +112,13 @@ class TestGradientClip(unittest.TestCase):
self
.
check_clip_result
(
out
,
out_clip
)
def
check_sparse_gradient_clip
(
self
,
place
):
prog
=
fluid
.
framework
.
Program
()
startup_program
=
fluid
.
framework
.
Program
()
prog
=
fluid
.
Program
()
startup_program
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main_program
=
prog
,
startup_program
=
startup_program
):
data
=
fluid
.
layers
.
data
(
name
=
"words"
,
shape
=
[
1
],
dtype
=
"int64"
,
lod_level
=
1
)
label
=
fluid
.
layers
.
data
(
name
=
"label"
,
shape
=
[
1
],
dtype
=
"int64"
)
data
=
fluid
.
data
(
name
=
"words"
,
shape
=
[
-
1
,
1
],
dtype
=
"int64"
,
lod_level
=
1
)
label
=
fluid
.
data
(
name
=
"label"
,
shape
=
[
-
1
,
1
],
dtype
=
"int64"
)
cost
=
bow_net
(
data
,
label
,
self
.
word_dict_len
)
self
.
backward_and_optimize
(
cost
)
...
...
@@ -172,7 +172,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
self
.
clip_gradient
=
func
self
.
check_gradient_clip
(
fluid
.
CPUPlace
())
# test whether the ouput is right when use
'minimize(grad_clip)'
# test whether the ouput is right when use
grad_clip
def
test_new_gradient_clip
(
self
):
def
func
(
params_grads
):
clip
=
fluid
.
clip
.
GradientClipByGlobalNorm
(
clip_norm
=
self
.
clip_norm
)
...
...
@@ -192,9 +192,10 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
clip
=
fluid
.
clip
.
GradientClipByGlobalNorm
(
clip_norm
=
5.0
,
need_clip
=
fileter_func
)
fluid
.
clip
.
set_gradient_clip
(
clip
)
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.01
)
# if 'set_gradient_clip' and 'minimize(grad_clip)' together, 'set_gradient_clip' will be ineffective
sgd_optimizer
.
minimize
(
cost
,
grad_clip
=
clip
)
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.01
,
grad_clip
=
clip
)
# if 'set_gradient_clip' and 'optimize(grad_clip)' together, 'set_gradient_clip' will be ineffective
sgd_optimizer
.
minimize
(
cost
)
# 'set_gradient_clip' must before 'minimize', otherwise, 'set_gradient_clip' will be ineffective
fluid
.
clip
.
set_gradient_clip
(
clip
)
...
...
@@ -232,24 +233,10 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
clip
=
fluid
.
clip
.
GradientClipByGlobalNorm
(
clip_norm
=
self
.
clip_norm
,
need_clip
=
"test"
)
# the type of minimize(grad_clip=) must be an instance of GradientClipBase's derived class
with
self
.
assertRaises
(
TypeError
):
x
=
fluid
.
default_main_program
().
global_block
().
create_parameter
(
name
=
"x"
,
shape
=
[
2
,
3
],
dtype
=
"float32"
)
loss
=
fluid
.
layers
.
reduce_mean
(
x
)
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.1
)
sgd_optimizer
.
minimize
(
loss
,
grad_clip
=
"test"
)
# the type of RecomputeOptimizer.minimize(grad_clip=) must be an instance of GradientClipBase's derived class
# the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class
with
self
.
assertRaises
(
TypeError
):
x
=
fluid
.
default_main_program
().
global_block
().
create_parameter
(
name
=
"x"
,
shape
=
[
2
,
3
],
dtype
=
"float32"
)
loss
=
fluid
.
layers
.
reduce_mean
(
x
)
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.1
)
recompute_optimizer
=
fluid
.
optimizer
.
RecomputeOptimizer
(
sgd_optimizer
)
recompute_optimizer
.
_set_checkpoints
([
x
])
recompute_optimizer
.
minimize
(
loss
,
grad_clip
=
"test"
)
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.1
,
grad_clip
=
"test"
)
class
TestGradientClipByNorm
(
TestGradientClip
):
...
...
@@ -271,7 +258,7 @@ class TestGradientClipByNorm(TestGradientClip):
a
=
u
,
b
=
v
,
rtol
=
1e-5
,
atol
=
1e-8
),
"gradient clip by norm has wrong results!"
)
# test whether the ouput is right when use
'minimize(grad_clip)'
# test whether the ouput is right when use
grad_clip
def
test_gradient_clip
(
self
):
self
.
check_gradient_clip
(
fluid
.
CPUPlace
())
...
...
@@ -319,7 +306,7 @@ class TestGradientClipByValue(TestGradientClip):
a
=
u
,
b
=
v
,
rtol
=
1e-6
,
atol
=
1e-8
),
"gradient clip by value has wrong results!"
)
# test whether the ouput is right when use
'minimize(grad_clip)'
# test whether the ouput is right when use
grad_clip
def
test_gradient_clip
(
self
):
self
.
check_gradient_clip
(
fluid
.
CPUPlace
())
...
...
@@ -357,7 +344,9 @@ class TestDygraphGradientClip(unittest.TestCase):
loss
=
fluid
.
layers
.
reduce_mean
(
out
)
loss
.
backward
()
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.0
,
parameter_list
=
linear
.
parameters
())
learning_rate
=
0.0
,
parameter_list
=
linear
.
parameters
(),
grad_clip
=
fluid
.
clip
.
GradientClipByGlobalNorm
(
0.1
))
self
.
check_clip_result
(
loss
,
sgd_optimizer
)
def
check_clip_result
(
self
,
loss
,
optimizer
):
...
...
@@ -384,7 +373,7 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
np
.
array
([
3
,
4
]).
astype
(
"float32"
),
name
=
"y"
)
assert
len
(
self
.
clip1
([(
x
,
x
),
(
x
,
y
),
(
x
,
None
)]))
==
2
# get params and grads from network
opt
,
params_grads
=
optimizer
.
minimize
(
loss
,
grad_clip
=
self
.
clip2
)
opt
,
params_grads
=
optimizer
.
minimize
(
loss
)
_
,
grads
=
zip
(
*
params_grads
)
params_grads
=
self
.
clip2
(
params_grads
)
_
,
grads_clip
=
zip
(
*
params_grads
)
...
...
@@ -426,7 +415,7 @@ class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
assert
len
(
self
.
clip
([(
x
,
None
)]))
==
0
# get params and grads from network
self
.
clip
([(
fluid
.
dygraph
.
to_variable
(
np
.
array
([
2
,
3
])),
None
)])
params_grads
=
optimizer
.
backward
(
loss
)
opt
,
params_grads
=
optimizer
.
minimize
(
loss
)
_
,
grads
=
zip
(
*
params_grads
)
params_grads
=
self
.
clip
(
params_grads
)
_
,
grads_clip
=
zip
(
*
params_grads
)
...
...
@@ -460,7 +449,7 @@ class TestDygraphGradientClipByValue(TestDygraphGradientClip):
x
=
fluid
.
dygraph
.
to_variable
(
np
.
array
([
2
,
3
]).
astype
(
"float32"
))
assert
len
(
self
.
clip
([(
x
,
None
)]))
==
0
# get params and grads from network
params_grads
=
optimizer
.
backward
(
loss
)
opt
,
params_grads
=
optimizer
.
minimize
(
loss
)
_
,
grads
=
zip
(
*
params_grads
)
params_grads
=
self
.
clip
(
params_grads
)
_
,
grads_clip
=
zip
(
*
params_grads
)
...
...
python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
浏览文件 @
66dc8e30
...
...
@@ -329,9 +329,9 @@ class TestImperativeAutoPrune(unittest.TestCase):
place
=
fluid
.
CPUPlace
()
with
fluid
.
dygraph
.
guard
(
place
):
model
=
MyLayer
(
size
,
vocab_size
,
size
)
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
0.001
,
parameter_list
=
model
.
parameters
())
grad_clip
=
fluid
.
clip
.
GradientClipByGlobalNorm
(
0.001
)
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
0.001
,
parameter_list
=
model
.
parameters
(),
grad_clip
=
grad_clip
)
indices
=
fluid
.
dygraph
.
to_variable
(
indices
)
embed
=
fluid
.
dygraph
.
to_variable
(
embed
)
...
...
@@ -339,7 +339,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
loss
=
model
.
embed_linear0
(
indices
)
loss
.
backward
()
_
,
params_grads
=
optimizer
.
minimize
(
loss
,
grad_clip
=
grad_clip
)
_
,
params_grads
=
optimizer
.
minimize
(
loss
)
for
items
in
params_grads
:
assert
items
[
0
].
name
is
not
model
.
embed1
.
weight
.
name
assert
items
[
0
].
name
is
not
model
.
linear_1
.
weight
.
name
...
...
@@ -348,9 +348,9 @@ class TestImperativeAutoPrune(unittest.TestCase):
with
fluid
.
dygraph
.
guard
(
place
):
model
=
MyLayer2
(
size
,
vocab_size
,
size
)
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
0.001
,
parameter_list
=
model
.
parameters
())
grad_clip
=
fluid
.
clip
.
GradientClipByGlobalNorm
(
0.001
)
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
0.001
,
parameter_list
=
model
.
parameters
(),
grad_clip
=
grad_clip
)
indices
=
fluid
.
dygraph
.
to_variable
(
indices
)
emebd
=
fluid
.
dygraph
.
to_variable
(
embed
)
...
...
@@ -358,7 +358,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
loss
=
model
.
embed_linear0
(
indices
)
loss
.
backward
()
optimizer
.
minimize
(
loss
,
grad_clip
=
grad_clip
)
optimizer
.
minimize
(
loss
)
for
items
in
params_grads
:
assert
items
[
0
].
name
is
not
model
.
embed1
.
weight
.
name
assert
items
[
0
].
name
is
not
model
.
linear_1
.
weight
.
name
...
...
python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
浏览文件 @
66dc8e30
...
...
@@ -58,14 +58,15 @@ class TestSimpleNet(unittest.TestCase):
simplenet
=
SimpleNet
(
20
,
32
,
dtype
)
adam
=
SGDOptimizer
(
learning_rate
=
0.001
,
parameter_list
=
simplenet
.
parameters
())
parameter_list
=
simplenet
.
parameters
(
))
# grad_clip=grad_clip
input_emb
,
emb
=
simplenet
(
input
)
self
.
assertTrue
(
emb
.
weight
.
gradient
()
is
None
)
self
.
assertTrue
(
input_emb
.
gradient
()
is
None
)
input_emb
.
backward
(
backward_strategy
)
adam
.
minimize
(
input_emb
)
# grad_clip=grad_clip
adam
.
minimize
(
input_emb
)
self
.
assertTrue
(
emb
.
weight
.
gradient
()
is
not
None
)
emb
.
clear_gradients
()
...
...
@@ -92,14 +93,15 @@ class TestSimpleNet(unittest.TestCase):
simplenet
=
SimpleNet
(
20
,
32
,
"float32"
)
adam
=
SGDOptimizer
(
learning_rate
=
0.001
,
parameter_list
=
simplenet
.
parameters
())
parameter_list
=
simplenet
.
parameters
(),
grad_clip
=
grad_clip
)
input_emb
,
emb
=
simplenet
(
input
)
self
.
assertTrue
(
emb
.
weight
.
gradient
()
is
None
)
self
.
assertTrue
(
input_emb
.
gradient
()
is
None
)
input_emb
.
backward
(
backward_strategy
)
adam
.
minimize
(
input_emb
,
grad_clip
=
grad_clip
)
adam
.
minimize
(
input_emb
)
self
.
assertTrue
(
emb
.
weight
.
gradient
()
is
not
None
)
emb
.
clear_gradients
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录