Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
70e67843
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
70e67843
编写于
9月 30, 2021
作者:
Z
zhaoyingli
提交者:
GitHub
9月 30, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add optest for adamw (#36148) (#36239)
* update func name * skip cpu * update unittest * update unittest
上级
28d12007
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
165 addition
and
7 deletion
+165
-7
python/paddle/fluid/tests/unittests/test_adamw_op.py
python/paddle/fluid/tests/unittests/test_adamw_op.py
+162
-4
python/paddle/optimizer/adamw.py
python/paddle/optimizer/adamw.py
+3
-3
未找到文件。
python/paddle/fluid/tests/unittests/test_adamw_op.py
浏览文件 @
70e67843
...
@@ -14,9 +14,153 @@
...
@@ -14,9 +14,153 @@
import
unittest
import
unittest
import
paddle
import
paddle
import
random
import
numpy
as
np
import
numpy
as
np
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
from
op_test
import
OpTest
from
functools
import
partial
from
functools
import
partial
from
paddle.framework
import
core
def adamw_step(inputs, attributes):
    """Compute one AdamW update with NumPy.

    Args:
        inputs: Mapping that provides 'Param', 'Grad', 'Moment1', 'Moment2',
            'LearningRate', 'Beta1Pow' and 'Beta2Pow'. 'Beta1Tensor' /
            'Beta2Tensor' are read (element 0) only when the matching
            'beta1' / 'beta2' attribute is absent.
        attributes: Mapping that provides 'epsilon'. Optional keys:
            'lr_ratio' (multiplies the learning rate), 'with_decay'
            (treated as False when absent), 'coeff' (required when
            'with_decay' is truthy), 'beta1' and 'beta2'.

    Returns:
        Tuple ``(param_out, moment1_out, moment2_out)``.

    Raises:
        KeyError: If a required key is missing from ``inputs`` or
            ``attributes``.
    """
    param = inputs['Param']
    grad = inputs['Grad']
    moment1 = inputs['Moment1']
    moment2 = inputs['Moment2']
    lr = inputs['LearningRate']
    beta1_pow = inputs['Beta1Pow']
    beta2_pow = inputs['Beta2Pow']

    epsilon = attributes['epsilon']

    if 'lr_ratio' in attributes:
        lr = lr * attributes['lr_ratio']

    # 'with_decay' is optional like the other switches; a missing key means
    # the decay step is skipped instead of raising KeyError.
    if attributes.get("with_decay", False):
        # Decoupled weight decay: shrink the parameter before the Adam step.
        # The product is a new object, so the caller's 'Param' is untouched.
        param = param * (1.0 - lr * attributes["coeff"])

    if 'beta1' in attributes:
        beta1 = attributes['beta1']
    else:
        beta1 = inputs['Beta1Tensor'][0]
    if 'beta2' in attributes:
        beta2 = attributes['beta2']
    else:
        beta2 = inputs['Beta2Tensor'][0]

    moment1_out = beta1 * moment1 + (1 - beta1) * grad
    moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
    # Bias correction is folded into the effective learning rate.
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
    param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))
    return param_out, moment1_out, moment2_out
class TestAdamW(OpTest):
    """Check the ``adamw`` op against the ``adamw_step`` NumPy reference."""

    def setUp(self):
        """Prepare random inputs, op attributes and the expected outputs."""
        self.op_type = "adamw"

        shape = (102, 105)
        beta1, beta2 = 0.78, 0.836

        def as_f32(value):
            # Wrap a Python scalar into a one-element float32 array.
            return np.array([value]).astype("float32")

        tensors = {}
        for key in ('Param', 'Grad', 'Moment1'):
            tensors[key] = np.random.uniform(-1, 1, shape).astype("float32")
        # The second moment is drawn from [0, 1) so it is never negative.
        tensors['Moment2'] = np.random.random(shape).astype("float32")
        tensors['LearningRate'] = as_f32(0.004)
        # Each beta raised to the 10th power.
        tensors['Beta1Pow'] = as_f32(beta1**10)
        tensors['Beta2Pow'] = as_f32(beta2**10)
        self.inputs = tensors

        self.attrs = dict(
            epsilon=1e-4,
            beta1=beta1,
            beta2=beta2,
            coeff=0.5,
            with_decay=True)

        expected = adamw_step(self.inputs, self.attrs)
        expected_param, expected_m1, expected_m2 = expected

        self.outputs = {
            'Moment1Out': expected_m1,
            'Moment2Out': expected_m2,
            'ParamOut': expected_param,
            'Beta1PowOut': as_f32(beta1**10) * beta1,
            'Beta2PowOut': as_f32(beta2**10) * beta2
        }

    def test_check_output(self):
        """Run the op and compare it with the expected outputs."""
        self.check_output()
@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestAdamW2(OpTest):
    """Check the ``adamw`` op with an ``lr_ratio`` attribute on ``CUDAPlace(0)``."""

    def setUp(self):
        """Prepare random inputs, op attributes and the expected outputs."""
        self.op_type = "adamw"

        shape = (2, 2)
        beta1, beta2 = 0.78, 0.836

        def as_f32(value):
            # Wrap a Python scalar into a one-element float32 array.
            return np.array([value]).astype("float32")

        tensors = {}
        for key in ('Param', 'Grad', 'Moment1'):
            tensors[key] = np.random.uniform(-1, 1, shape).astype("float32")
        # The second moment is drawn from [0, 1) so it is never negative.
        tensors['Moment2'] = np.random.random(shape).astype("float32")
        tensors['LearningRate'] = as_f32(0.004)
        # Each beta raised to the 10th power.
        tensors['Beta1Pow'] = as_f32(beta1**10)
        tensors['Beta2Pow'] = as_f32(beta2**10)
        self.inputs = tensors

        self.attrs = dict(
            epsilon=1e-4,
            beta1=beta1,
            beta2=beta2,
            lr_ratio=0.1,
            coeff=0.5,
            with_decay=True)

        expected = adamw_step(self.inputs, self.attrs)
        expected_param, expected_m1, expected_m2 = expected

        self.outputs = {
            'Moment1Out': expected_m1,
            'Moment2Out': expected_m2,
            'ParamOut': expected_param,
            'Beta1PowOut': as_f32(beta1**10) * beta1,
            'Beta2PowOut': as_f32(beta2**10) * beta2
        }

    def test_check_output(self):
        """Run the op on the first CUDA device and compare the outputs."""
        gpu_place = core.CUDAPlace(0)
        self.check_output_with_place(gpu_place)
class
TestAdamWOp
(
unittest
.
TestCase
):
class
TestAdamWOp
(
unittest
.
TestCase
):
...
@@ -160,7 +304,14 @@ def simple_lr_setting(param, decay_rate, n_layers):
...
@@ -160,7 +304,14 @@ def simple_lr_setting(param, decay_rate, n_layers):
return
decay_rate
**
(
n_layers
+
2
-
depth
)
return
decay_rate
**
(
n_layers
+
2
-
depth
)
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
"core is not compiled with CUDA"
)
class
TestAdamWOpLayerwiseLR
(
TestAdamWOp
):
class
TestAdamWOpLayerwiseLR
(
TestAdamWOp
):
def
setUp
(
self
):
random
.
seed
(
2021
)
np
.
random
.
seed
(
2021
)
paddle
.
seed
(
2021
)
def
test_adamw_op_dygraph
(
self
):
def
test_adamw_op_dygraph
(
self
):
paddle
.
disable_static
()
paddle
.
disable_static
()
value
=
np
.
arange
(
26
).
reshape
(
2
,
13
).
astype
(
"float32"
)
value
=
np
.
arange
(
26
).
reshape
(
2
,
13
).
astype
(
"float32"
)
...
@@ -181,17 +332,20 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
...
@@ -181,17 +332,20 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
weight_decay
=
0.01
,
weight_decay
=
0.01
,
lr_ratio
=
simple_lr_fun
)
lr_ratio
=
simple_lr_fun
)
for
_
in
range
(
2
):
loss_ref
=
np
.
array
(
[
4.8383293
,
3.0854003
,
1.33299
,
-
0.418993
,
-
2.171043
])
for
i
in
range
(
5
):
a1
=
linear1
(
a
)
a1
=
linear1
(
a
)
out
=
linear2
(
a1
)
out
=
linear2
(
a1
)
out
=
paddle
.
mean
(
out
)
out
.
backward
()
out
.
backward
()
adam
.
step
()
adam
.
step
()
adam
.
clear_gradients
()
adam
.
clear_gradients
()
np
.
testing
.
assert_allclose
(
out
[
0
].
numpy
(),
loss_ref
[
i
],
rtol
=
1e-6
)
def
test_adamw_op
(
self
):
def
test_adamw_op
(
self
):
paddle
.
enable_static
()
paddle
.
enable_static
()
place
=
fluid
.
CUDAPlace
(
0
)
if
fluid
.
is_compiled_with_cuda
()
\
place
=
fluid
.
CUDAPlace
(
0
)
else
fluid
.
CPUPlace
()
train_prog
=
fluid
.
Program
()
train_prog
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
with
fluid
.
program_guard
(
train_prog
,
startup
):
with
fluid
.
program_guard
(
train_prog
,
startup
):
...
@@ -223,7 +377,10 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
...
@@ -223,7 +377,10 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
exe
=
fluid
.
Executor
(
place
)
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
startup
)
exe
.
run
(
startup
)
for
_
in
range
(
2
):
loss_ref
=
np
.
array
(
[
0.36120513
,
0.2720821
,
0.67208904
,
0.14607805
,
0.24098626
])
for
i
in
range
(
5
):
inputs
=
np
.
random
.
random
(
size
=
[
8
,
10
]).
astype
(
'float32'
)
inputs
=
np
.
random
.
random
(
size
=
[
8
,
10
]).
astype
(
'float32'
)
outputs
=
np
.
random
.
random
(
size
=
[
8
,
1
]).
astype
(
'float32'
)
outputs
=
np
.
random
.
random
(
size
=
[
8
,
1
]).
astype
(
'float32'
)
rets
=
exe
.
run
(
train_prog
,
rets
=
exe
.
run
(
train_prog
,
...
@@ -231,6 +388,7 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
...
@@ -231,6 +388,7 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
"y"
:
outputs
},
"y"
:
outputs
},
fetch_list
=
[
avg_cost
])
fetch_list
=
[
avg_cost
])
assert
rets
[
0
]
is
not
None
assert
rets
[
0
]
is
not
None
np
.
testing
.
assert_allclose
(
rets
[
0
],
loss_ref
[
i
],
rtol
=
1e-6
)
paddle
.
disable_static
()
paddle
.
disable_static
()
...
...
python/paddle/optimizer/adamw.py
浏览文件 @
70e67843
...
@@ -171,9 +171,9 @@ class AdamW(Adam):
...
@@ -171,9 +171,9 @@ class AdamW(Adam):
self
.
_lr_to_coeff
=
dict
()
self
.
_lr_to_coeff
=
dict
()
if
lr_ratio
is
not
None
:
if
lr_ratio
is
not
None
:
assert
isinstance
(
lr_ratio
,
Callable
)
assert
isinstance
(
lr_ratio
,
Callable
)
if
core
.
is_compiled_with_xpu
()
or
core
.
is_compiled_with_npu
():
if
not
core
.
is_compiled_with_cuda
():
raise
NotImplementedError
(
raise
NotImplementedError
(
"'lr_ratio' is unimplemented in XPU and NPU"
)
"'lr_ratio' is unimplemented in
CPU,
XPU and NPU"
)
self
.
_lr_ratio
=
lr_ratio
self
.
_lr_ratio
=
lr_ratio
super
(
AdamW
,
self
).
__init__
(
super
(
AdamW
,
self
).
__init__
(
...
@@ -305,7 +305,7 @@ class AdamW(Adam):
...
@@ -305,7 +305,7 @@ class AdamW(Adam):
'epsilon'
,
self
.
_epsilon
,
'lazy_mode'
,
self
.
_lazy_mode
,
'epsilon'
,
self
.
_epsilon
,
'lazy_mode'
,
self
.
_lazy_mode
,
'min_row_size_to_use_multithread'
,
1000
,
'beta1'
,
_beta1
,
'min_row_size_to_use_multithread'
,
1000
,
'beta1'
,
_beta1
,
'beta2'
,
_beta2
,
'coeff'
,
self
.
_coeff
,
'multi_precision'
,
'beta2'
,
_beta2
,
'coeff'
,
self
.
_coeff
,
'multi_precision'
,
find_master
,
"lr_ratio"
,
lr_ratio_
)
find_master
,
'lr_ratio'
,
lr_ratio_
)
return
None
return
None
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录