机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit 0cdde0b4 (unverified)
Authored by zhaoyingli on Jun 16, 2022; committed via GitHub on Jun 16, 2022
cherry-pick adamw unittest (#43498)
Parent: abb0b2d6
Showing 1 changed file with 191 additions and 36 deletions (+191, -36)

python/paddle/fluid/tests/unittests/test_adamw_op.py
@@ -53,8 +53,8 @@ def adamw_step(inputs, attributes):
     moment1_out = beta1 * moment1 + (1 - beta1) * grad
     moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
 
-    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
-    param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))
+    denom = (np.sqrt(moment2_out) / np.sqrt(1.0 - beta2_pow)) + epsilon
+    param_out = param + ((moment1_out / denom) * (-(lr / (1.0 - beta1_pow))))
     return param_out, moment1_out, moment2_out
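
Note on the hunk above: the updated reference moves epsilon inside the bias-corrected denominator, so the two formulas agree except for whether epsilon gets divided by sqrt(1 - beta2_pow). A minimal, standalone NumPy sketch (not part of the test file; the values below are illustrative, not taken from the tests) that evaluates both forms side by side:

import numpy as np

# illustrative state for a single AdamW step (t = 1, fresh moments)
rng = np.random.default_rng(0)
param = rng.standard_normal(8).astype("float32")
grad = rng.standard_normal(8).astype("float32")
moment1 = np.zeros_like(param)
moment2 = np.zeros_like(param)
lr, beta1, beta2, epsilon, t = 0.001, 0.9, 0.999, 1e-8, 1
beta1_pow, beta2_pow = beta1**t, beta2**t

moment1_out = beta1 * moment1 + (1 - beta1) * grad
moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)

# old reference: epsilon is added after the sqrt, outside the bias correction
lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
old = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))

# new reference: epsilon is added to the bias-corrected second moment
denom = (np.sqrt(moment2_out) / np.sqrt(1.0 - beta2_pow)) + epsilon
new = param + ((moment1_out / denom) * (-(lr / (1.0 - beta1_pow))))

# the forms differ only in how epsilon enters the denominator
print(np.max(np.abs(old - new)))

With these illustrative values the printed difference is negligible; it only becomes noticeable when sqrt(moment2_out) is comparable to epsilon.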
@@ -308,44 +308,120 @@ def simple_lr_setting(param, decay_rate, n_layers):
                  "core is not compiled with CUDA")
 class TestAdamWOpLayerwiseLR(TestAdamWOp):
     def setUp(self):
-        random.seed(2021)
-        np.random.seed(2021)
-        paddle.seed(2021)
+        random.seed(2022)
+        np.random.seed(2022)
+        paddle.seed(2022)
 
     def test_adamw_op_dygraph(self):
         paddle.disable_static()
-        value = np.arange(26).reshape(2, 13).astype("float32")
-        a = paddle.to_tensor(value)
-        linear1 = paddle.nn.Linear(13, 8)
-        linear2 = paddle.nn.Linear(8, 5)
+        linear1 = paddle.nn.Linear(
+            13, 8, bias_attr=paddle.nn.initializer.Constant(value=1.0))
+        linear2 = paddle.nn.Linear(
+            8, 5, bias_attr=paddle.nn.initializer.Constant(value=1.0))
+
+        # fix the linear name, simple_lr_setting function will use the name
+        linear1.weight.name = "linear_1.w_0"
+        linear1.bias.name = "linear_1.b_0"
+        linear2.weight.name = "linear_2.w_0"
+        linear2.bias.name = "linear_2.b_0"
+
+        fc1_w = np.array(linear1.weight)
+        fc1_w_mon1 = np.zeros_like(fc1_w)
+        fc1_w_mon2 = np.zeros_like(fc1_w)
+        fc1_b = np.array(linear1.bias)
+        fc1_b_mon1 = np.zeros_like(fc1_b)
+        fc1_b_mon2 = np.zeros_like(fc1_b)
+
+        fc2_w = np.array(linear2.weight)
+        fc2_w_mon1 = np.zeros_like(fc2_w)
+        fc2_w_mon2 = np.zeros_like(fc2_w)
+        fc2_b = np.array(linear2.bias)
+        fc2_b_mon1 = np.zeros_like(fc2_b)
+        fc2_b_mon2 = np.zeros_like(fc2_b)
 
         simple_lr_fun = partial(simple_lr_setting, decay_rate=0.8, n_layers=2)
+        learning_rate = 0.001
+        weight_decay = 0.01
+        beta1 = 0.9
+        beta2 = 0.999
 
-        adam = paddle.optimizer.AdamW(
-            learning_rate=0.01,
+        opt = paddle.optimizer.AdamW(
+            learning_rate=learning_rate,
             parameters=[{
                 'params': linear1.parameters()
             }, {
                 'params': linear2.parameters(),
             }],
             apply_decay_param_fun=lambda name: True,
-            weight_decay=0.01,
+            weight_decay=weight_decay,
             lr_ratio=simple_lr_fun)
 
-        loss_ref = np.array(
-            [-1.7267396, -2.81524, -3.9250019, -5.05954, -6.2272625])
+        def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t):
+            np_inputs = {
+                'Param': param,
+                'Grad': grad,
+                'Moment1': moment1,
+                'Moment2': moment2,
+                'LearningRate': np.array([learning_rate]).astype("float32"),
+                'Beta1Pow': np.array([beta1**t]).astype("float32"),
+                'Beta2Pow': np.array([beta2**t]).astype("float32")
+            }
+            np_attrs = {
+                'epsilon': 1e-8,
+                'beta1': beta1,
+                'beta2': beta2,
+                "lr_ratio": lr_ratio,
+                "coeff": weight_decay,
+                "with_decay": True
+            }
+            param_out, moment1_out, moment2_out = adamw_step(np_inputs, np_attrs)
+            return param_out, moment1_out, moment2_out
+
         for i in range(5):
+            a = paddle.to_tensor(
+                np.random.uniform(-1, 1, (2, 13)).astype("float32"))
             a1 = linear1(a)
             out = linear2(a1)
             out = paddle.mean(out)
             out.backward()
-            adam.step()
-            adam.clear_gradients()
-            np.testing.assert_allclose(out[0].numpy(), loss_ref[i], rtol=1e-6)
+
+            fc1_w, fc1_w_mon1, fc1_w_mon2 = get_numpy_output(
+                fc1_w, np.array(linear1.weight.grad), fc1_w_mon1, fc1_w_mon2,
+                simple_lr_fun(linear1.weight), i + 1)
+            fc1_b, fc1_b_mon1, fc1_b_mon2 = get_numpy_output(
+                fc1_b, np.array(linear1.bias.grad), fc1_b_mon1, fc1_b_mon2,
+                simple_lr_fun(linear1.bias), i + 1)
+            fc2_w, fc2_w_mon1, fc2_w_mon2 = get_numpy_output(
+                fc2_w, np.array(linear2.weight.grad), fc2_w_mon1, fc2_w_mon2,
+                simple_lr_fun(linear2.weight), i + 1)
+            fc2_b, fc2_b_mon1, fc2_b_mon2 = get_numpy_output(
+                fc2_b, np.array(linear2.bias.grad), fc2_b_mon1, fc2_b_mon2,
+                simple_lr_fun(linear2.bias), i + 1)
+
+            opt.step()
+            opt.clear_gradients()
+
+            np.testing.assert_allclose(linear1.weight.numpy(), fc1_w, rtol=1e-6)
+            np.testing.assert_allclose(linear1.bias.numpy(), fc1_b, rtol=1e-6)
+            np.testing.assert_allclose(linear2.weight.numpy(), fc2_w, rtol=1e-6)
+            np.testing.assert_allclose(linear2.bias.numpy(), fc2_b, rtol=1e-6)
 
     def test_adamw_op(self):
         paddle.enable_static()
         place = fluid.CUDAPlace(0)
+        learning_rate = 0.0001
+        beta1 = 0.85
+        beta2 = 0.95
+        weight_decay = 0.01
+        epsilon = 1e-8
+
         train_prog = fluid.Program()
         startup = fluid.Program()
         with fluid.program_guard(train_prog, startup):
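
Both tests exercise the layerwise learning-rate path through lr_ratio=simple_lr_fun, where simple_lr_setting (defined earlier in this file, not shown in this diff) maps each parameter to a multiplier on that parameter's learning rate. As a hedged illustration only, a callback of that general shape might look like the hypothetical example_layer_lr below; this is not the file's simple_lr_setting:

# Hypothetical stand-in for an lr_ratio callback; the tests' real helper is
# simple_lr_setting(param, decay_rate, n_layers), whose body this diff does not show.
def example_layer_lr(param, decay_rate=0.8, n_layers=2):
    # infer a layer index from the parameter name fixed above
    # (e.g. "linear_1.w_0" -> depth 1, "linear_2.w_0" -> depth 2)
    if "linear_1" in param.name:
        depth = 1
    elif "linear_2" in param.name:
        depth = 2
    else:
        depth = 0
    # earlier layers get a smaller ratio, later layers a ratio closer to 1
    return decay_rate ** (n_layers + 1 - depth)

AdamW applies the returned ratio to that parameter's effective learning rate, which is why each test also feeds simple_lr_fun(parameter) into the NumPy reference through np_attrs["lr_ratio"].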
@@ -353,42 +429,121 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
             x = fluid.data(name='x', shape=[None, 10], dtype='float32')
             y = fluid.data(name='y', shape=[None, 1], dtype='float32')
-            fc1 = fluid.layers.fc(input=x, size=32, act=None)
-            prediction = fluid.layers.fc(input=fc1, size=1, act=None)
-            cost = fluid.layers.square_error_cost(input=prediction, label=y)
+            weight_attr1 = paddle.framework.ParamAttr(name="linear_0.w_0")
+            bias_attr1 = paddle.framework.ParamAttr(
+                name="linear_0.b_0",
+                initializer=paddle.nn.initializer.Constant(value=1.0))
+            weight_attr2 = paddle.framework.ParamAttr(name="linear_1.w_0")
+            bias_attr2 = paddle.framework.ParamAttr(
+                name="linear_1.b_0",
+                initializer=paddle.nn.initializer.Constant(value=1.0))
+            linear1 = paddle.nn.Linear(
+                10, 32, weight_attr=weight_attr1, bias_attr=bias_attr1)
+            linear2 = paddle.nn.Linear(
+                32, 1, weight_attr=weight_attr2, bias_attr=bias_attr2)
+
+            out = linear1(x)
+            out = linear2(out)
+
+            fc1_w_mon1 = np.zeros((linear1.weight.shape)).astype("float32")
+            fc1_w_mon2 = np.zeros((linear1.weight.shape)).astype("float32")
+            fc1_b_mon1 = np.zeros((linear1.bias.shape)).astype("float32")
+            fc1_b_mon2 = np.zeros((linear1.bias.shape)).astype("float32")
+            fc2_w_mon1 = np.zeros((linear2.weight.shape)).astype("float32")
+            fc2_w_mon2 = np.zeros((linear2.weight.shape)).astype("float32")
+            fc2_b_mon1 = np.zeros((linear2.bias.shape)).astype("float32")
+            fc2_b_mon2 = np.zeros((linear2.bias.shape)).astype("float32")
+
+            cost = fluid.layers.square_error_cost(input=out, label=y)
             avg_cost = fluid.layers.mean(cost)
 
             simple_lr_fun = partial(
                 simple_lr_setting, decay_rate=0.8, n_layers=2)
 
-            beta1 = fluid.layers.create_global_var(
-                shape=[1], value=0.85, dtype='float32', persistable=True)
-            beta2 = fluid.layers.create_global_var(
-                shape=[1], value=0.95, dtype='float32', persistable=True)
-            betas = [beta1, beta2]
             opt = paddle.optimizer.AdamW(
-                learning_rate=1e-5,
+                learning_rate=learning_rate,
                 beta1=beta1,
                 beta2=beta2,
-                weight_decay=0.01,
-                epsilon=1e-8,
+                weight_decay=weight_decay,
+                epsilon=epsilon,
                 lr_ratio=simple_lr_fun)
             opt.minimize(avg_cost)
 
+        def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t):
+            np_inputs = {
+                'Param': param,
+                'Grad': grad,
+                'Moment1': moment1,
+                'Moment2': moment2,
+                'LearningRate': np.array([learning_rate]).astype("float32"),
+                'Beta1Pow': np.array([beta1**t]).astype("float32"),
+                'Beta2Pow': np.array([beta2**t]).astype("float32")
+            }
+            np_attrs = {
+                'epsilon': epsilon,
+                'beta1': beta1,
+                'beta2': beta2,
+                "lr_ratio": lr_ratio,
+                "coeff": weight_decay,
+                "with_decay": True
+            }
+            param_out, moment1_out, moment2_out = adamw_step(np_inputs, np_attrs)
+            return param_out, moment1_out, moment2_out
+
+        fetch_list1 = [
+            "linear_0.w_0", "linear_0.b_0", "linear_1.w_0", "linear_1.b_0"
+        ]
+        fetch_list2 = [
+            "linear_0.w_0", "linear_0.w_0@GRAD", "linear_0.b_0",
+            "linear_0.b_0@GRAD", "linear_1.w_0", "linear_1.w_0@GRAD",
+            "linear_1.b_0", "linear_1.b_0@GRAD"
+        ]
+
         exe = fluid.Executor(place)
         exe.run(startup)
+        test_prog = train_prog.clone(for_test=True)
 
-        loss_ref = np.array(
-            [0.33895183, 0.3159437, 0.19472016, 0.17764759, 0.1520702])
         for i in range(5):
             inputs = np.random.random(size=[8, 10]).astype('float32')
             outputs = np.random.random(size=[8, 1]).astype('float32')
-            rets = exe.run(train_prog,
-                           feed={"x": inputs,
-                                 "y": outputs},
-                           fetch_list=[avg_cost])
-            assert rets[0] is not None
-            np.testing.assert_allclose(rets[0], loss_ref[i], rtol=1e-6)
+
+            param = exe.run(test_prog,
+                            feed={"x": inputs,
+                                  "y": outputs},
+                            fetch_list=fetch_list1)
+            params_and_gras = exe.run(train_prog,
+                                      feed={"x": inputs,
+                                            "y": outputs},
+                                      fetch_list=fetch_list2)
+
+            fc1_w = param[0]
+            fc1_w_grad = params_and_gras[1]
+            fc1_b = param[1]
+            fc1_b_grad = params_and_gras[3]
+            fc2_w = param[2]
+            fc2_w_grad = params_and_gras[5]
+            fc2_b = param[3]
+            fc2_b_grad = params_and_gras[7]
+
+            fc1_w, fc1_w_mon1, fc1_w_mon2 = get_numpy_output(
+                fc1_w, fc1_w_grad, fc1_w_mon1, fc1_w_mon2,
+                simple_lr_fun(linear1.weight), i + 1)
+            fc1_b, fc1_b_mon1, fc1_b_mon2 = get_numpy_output(
+                fc1_b, fc1_b_grad, fc1_b_mon1, fc1_b_mon2,
+                simple_lr_fun(linear1.bias), i + 1)
+            fc2_w, fc2_w_mon1, fc2_w_mon2 = get_numpy_output(
+                fc2_w, fc2_w_grad, fc2_w_mon1, fc2_w_mon2,
+                simple_lr_fun(linear2.weight), i + 1)
+            fc2_b, fc2_b_mon1, fc2_b_mon2 = get_numpy_output(
+                fc2_b, fc2_b_grad, fc2_b_mon1, fc2_b_mon2,
+                simple_lr_fun(linear2.bias), i + 1)
+
+            np.testing.assert_allclose(params_and_gras[0], fc1_w, rtol=1e-6)
+            np.testing.assert_allclose(params_and_gras[2], fc1_b, rtol=1e-6)
+            np.testing.assert_allclose(params_and_gras[4], fc2_w, rtol=1e-6)
+            np.testing.assert_allclose(params_and_gras[6], fc2_b, rtol=1e-6)
 
         paddle.disable_static()