s920243400 / PaddleDetection (forked from PaddlePaddle / PaddleDetection, in sync with the upstream project)
Commit 8693e1d0 (unverified)
Authored by Wenyu on Jun 20, 2022; committed via GitHub on Jun 20, 2022
Parent commit: bf895541

AdamWDL builder and VIT checkpoint functionality (#6232)

* upgrade adamw to adamwdl for transformer
* rename arg
Showing 3 changed files with 51 additions and 32 deletions (+51 / -32).
ppdet/modeling/backbones/vision_transformer.py   +13  -4
ppdet/optimizer/adamw.py                         +31  -28
ppdet/optimizer/optimizer.py                     +7   -0
ppdet/modeling/backbones/vision_transformer.py
@@ -340,12 +340,15 @@ class VisionTransformer(nn.Layer):
                  use_abs_pos_emb=False,
                  use_sincos_pos_emb=True,
                  with_fpn=True,
+                 use_checkpoint=False,
                  **args):
         super().__init__()
         self.img_size = img_size
         self.embed_dim = embed_dim
         self.with_fpn = with_fpn
+        self.use_checkpoint = use_checkpoint
+        if use_checkpoint:
+            print('please set: FLAGS_allocator_strategy=naive_best_fit')
         self.patch_embed = PatchEmbed(
             img_size=img_size,
             patch_size=patch_size,

@@ -575,7 +578,7 @@ class VisionTransformer(nn.Layer):
     def forward(self, x):
         x = x['image'] if isinstance(x, dict) else x
-        _, _, w, h = x.shape
+        _, _, h, w = x.shape
         x = self.patch_embed(x)

@@ -586,7 +589,8 @@ class VisionTransformer(nn.Layer):
         x = paddle.concat([cls_tokens, x], axis=1)
         if self.pos_embed is not None:
-            x = x + self.interpolate_pos_encoding(x, w, h)
+            # x = x + self.interpolate_pos_encoding(x, w, h)
+            x = x + self.interpolate_pos_encoding(x, h, w)
         x = self.pos_drop(x)

@@ -597,7 +601,12 @@ class VisionTransformer(nn.Layer):
         feats = []
         for idx, blk in enumerate(self.blocks):
-            x = blk(x, rel_pos_bias)
+            if self.use_checkpoint:
+                x = paddle.distributed.fleet.utils.recompute(
+                    blk, x, rel_pos_bias, **{"preserve_rng_state": True})
+            else:
+                x = blk(x, rel_pos_bias)
             if idx in self.out_indices:
                 xp = paddle.reshape(
                     paddle.transpose(
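The new use_checkpoint flag routes every transformer block through paddle.distributed.fleet.utils.recompute, trading extra forward compute for lower activation memory; as the added print reminds, FLAGS_allocator_strategy=naive_best_fit should be set when it is enabled. Below is a minimal standalone sketch of the same call pattern; the ToyBlock layer, shapes, and values are illustrative, not taken from the patch.

import paddle
import paddle.nn as nn
from paddle.distributed.fleet.utils import recompute

class ToyBlock(nn.Layer):
    # Stand-in for a transformer block; only the call pattern matters here.
    def __init__(self, dim=16):
        super().__init__()
        self.fc = nn.Linear(dim, dim)

    def forward(self, x):
        return paddle.nn.functional.gelu(self.fc(x))

block = ToyBlock()
x = paddle.randn([2, 16])
x.stop_gradient = False

use_checkpoint = True
if use_checkpoint:
    # Activations inside `block` are recomputed during backward instead of
    # being kept; preserve_rng_state (same kwarg as the patch) keeps any
    # dropout reproducible across the recomputed forward pass.
    y = recompute(block, x, **{"preserve_rng_state": True})
else:
    y = block(x)

y.sum().backward()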
ppdet/optimizer/adamw.py
@@ -18,6 +18,7 @@ from __future__ import print_function

 from paddle.optimizer import AdamW
 from functools import partial
+import re


 def layerwise_lr_decay(decay_rate, name_dict, n_layers, param):
@@ -34,15 +35,20 @@ def layerwise_lr_decay(decay_rate, name_dict, n_layers, param):
     """
     ratio = 1.0
     static_name = name_dict[param.name]
-    if "blocks" in static_name:
-        idx = static_name.find("blocks.")
-        layer = int(static_name[idx:].split(".")[1])
+    if 'blocks.' in static_name or 'layers.' in static_name:
+        idx_1 = static_name.find('blocks.')
+        idx_2 = static_name.find('layers.')
+        assert any([x >= 0 for x in [idx_1, idx_2]]), ''
+        idx = idx_1 if idx_1 >= 0 else idx_2
+        # idx = re.findall('[blocks|layers]\.(\d+)\.', static_name)[0]
+        layer = int(static_name[idx:].split('.')[1])
         ratio = decay_rate**(n_layers - layer)
-    elif "cls_token" in static_name or 'patch_embed' in static_name:
+    elif 'cls_token' in static_name or 'patch_embed' in static_name:
         ratio = decay_rate**(n_layers + 1)
-    param.optimize_attr["learning_rate"] *= ratio
+    param.optimize_attr['learning_rate'] *= ratio


 class AdamWDL(AdamW):
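To make the schedule above concrete, here is a small self-contained illustration of the learning-rate ratio that layerwise_lr_decay assigns; the parameter names and the decay_rate/n_layers values are made up for the example, only the branching mirrors the function above.

decay_rate, n_layers = 0.65, 12

def ratio_for(static_name):
    # Mirrors the branching in layerwise_lr_decay: block/layer params decay
    # by depth, patch_embed/cls_token get the strongest decay, rest keep 1.0.
    if 'blocks.' in static_name or 'layers.' in static_name:
        idx = static_name.find('blocks.')
        idx = idx if idx >= 0 else static_name.find('layers.')
        layer = int(static_name[idx:].split('.')[1])
        return decay_rate ** (n_layers - layer)
    if 'cls_token' in static_name or 'patch_embed' in static_name:
        return decay_rate ** (n_layers + 1)
    return 1.0

for name in ['patch_embed.proj.weight', 'blocks.0.attn.qkv.weight',
             'blocks.11.mlp.fc1.weight', 'head.weight']:
    print(f'{name}: base lr scaled by {ratio_for(name):.5f}')

So patch_embed gets decay_rate**(n_layers + 1), block k gets decay_rate**(n_layers - k), and parameters outside the backbone keep the base learning rate.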
@@ -156,16 +162,16 @@ class AdamWDL(AdamW):
                  multi_precision=False,
                  layerwise_decay=1.0,
                  n_layers=12,
-                 set_param_lr_fun=None,
+                 set_param_lr_func=None,
                  name_dict=None,
                  name=None):
         if not isinstance(layerwise_decay, float):
             raise TypeError("coeff should be float or Tensor.")
         self.layerwise_decay = layerwise_decay
         self.n_layers = n_layers
-        self.set_param_lr_fun = partial(
-            set_param_lr_fun, layerwise_decay, name_dict,
-            n_layers) if set_param_lr_fun is not None else set_param_lr_fun
+        self.set_param_lr_func = partial(
+            set_param_lr_func, layerwise_decay, name_dict,
+            n_layers) if set_param_lr_func is not None else set_param_lr_func
         super(AdamWDL, self).__init__(
             learning_rate=learning_rate,
             parameters=parameters,
@@ -180,36 +186,35 @@ class AdamWDL(AdamW):
             multi_precision=multi_precision)

     def _append_optimize_op(self, block, param_and_grad):
-        if self.set_param_lr_fun is None:
+        if self.set_param_lr_func is None:
             return super(AdamWDL, self)._append_optimize_op(block,
                                                             param_and_grad)

         self._append_decoupled_weight_decay(block, param_and_grad)
         prev_lr = param_and_grad[0].optimize_attr["learning_rate"]
-        self.set_param_lr_fun(param_and_grad[0])
+        self.set_param_lr_func(param_and_grad[0])
         # excute Adam op
         res = super(AdamW, self)._append_optimize_op(block, param_and_grad)
         param_and_grad[0].optimize_attr["learning_rate"] = prev_lr
         return res


-def build_adamw(model,
-                lr=1e-4,
-                weight_decay=0.05,
-                betas=(0.9, 0.999),
-                layer_decay=0.65,
-                num_layers=None,
-                filter_bias_and_bn=True,
-                skip_decay_names=None,
-                set_param_lr_fun=None):
+def build_adamwdl(model,
+                  lr=1e-4,
+                  weight_decay=0.05,
+                  betas=(0.9, 0.999),
+                  layer_decay=0.65,
+                  num_layers=None,
+                  filter_bias_and_bn=True,
+                  skip_decay_names=None,
+                  set_param_lr_func='layerwise_lr_decay'):

     if skip_decay_names and filter_bias_and_bn:
         decay_dict = {
-            param.name: not (len(param.shape) == 1 or name.endswith(".bias") or
+            param.name: not (len(param.shape) == 1 or name.endswith('.bias') or
                              any([_n in name for _n in skip_decay_names]))
             for name, param in model.named_parameters()
         }
         parameters = [p for p in model.parameters()]
     else:

@@ -221,17 +226,15 @@ def build_adamw(model,
     if decay_dict is not None:
         opt_args['apply_decay_param_fun'] = lambda n: decay_dict[n]
-    if isinstance(set_param_lr_fun, str):
-        set_param_lr_fun = eval(set_param_lr_fun)
-    opt_args['set_param_lr_fun'] = set_param_lr_fun
+    if isinstance(set_param_lr_func, str):
+        set_param_lr_func = eval(set_param_lr_func)
+    opt_args['set_param_lr_func'] = set_param_lr_func
     opt_args['beta1'] = betas[0]
     opt_args['beta2'] = betas[1]
     opt_args['layerwise_decay'] = layer_decay
-    name_dict = dict()
-    for n, p in model.named_parameters():
-        name_dict[p.name] = n
+    name_dict = {p.name: n for n, p in model.named_parameters()}
     opt_args['name_dict'] = name_dict
     opt_args['n_layers'] = num_layers
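A hedged usage sketch of the renamed builder, using only keyword arguments visible in the new build_adamwdl signature above; the stand-in model and the chosen values are illustrative rather than prescribed by the patch.

import paddle.nn as nn
from ppdet.optimizer.adamw import build_adamwdl

# Any nn.Layer with named parameters works as a stand-in backbone here.
model = nn.Sequential(nn.Linear(16, 16), nn.LayerNorm(16))

opt = build_adamwdl(
    model,
    lr=1e-4,
    weight_decay=0.05,
    betas=(0.9, 0.999),
    layer_decay=0.65,             # stored on AdamWDL as layerwise_decay
    num_layers=12,                # forwarded as n_layers to layerwise_lr_decay
    skip_decay_names=['norm'],    # name substrings exempted from weight decay
    set_param_lr_func='layerwise_lr_decay')  # string resolved via eval() in the builder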
ppdet/optimizer/optimizer.py
@@ -27,6 +27,8 @@ import paddle.regularizer as regularizer
 from ppdet.core.workspace import register, serializable
 import copy

+from .adamw import AdamWDL, build_adamwdl
+
 __all__ = ['LearningRate', 'OptimizerBuilder']

 from ppdet.utils.logger import setup_logger

@@ -317,8 +319,13 @@ class OptimizerBuilder():
         optim_args = self.optimizer.copy()
         optim_type = optim_args['type']
         del optim_args['type']

+        if optim_type == 'AdamWDL':
+            return build_adamwdl(model, lr=learning_rate, **optim_args)

         if optim_type != 'AdamW':
             optim_args['weight_decay'] = regularization

         op = getattr(optimizer, optim_type)

         if 'param_groups' in optim_args:
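A hedged sketch of what the new branch amounts to: when the configured optimizer type is 'AdamWDL', OptimizerBuilder hands the model and the scheduled learning rate to build_adamwdl instead of looking the type up on paddle.optimizer. Only the 'type' key and the lr forwarding come from the hunk above; the remaining config keys and the stand-in model are assumptions.

import paddle.nn as nn
from ppdet.optimizer.adamw import build_adamwdl

model = nn.Linear(8, 8)      # stand-in for the detector being optimized
learning_rate = 1e-4         # what OptimizerBuilder receives as the scheduled LR

# Dict standing in for the parsed optimizer section of a PaddleDetection config.
optim_args = {'type': 'AdamWDL', 'weight_decay': 0.05,
              'layer_decay': 0.65, 'num_layers': 12}
optim_type = optim_args.pop('type')

if optim_type == 'AdamWDL':
    # Same dispatch as the patched OptimizerBuilder.build().
    optimizer = build_adamwdl(model, lr=learning_rate, **optim_args)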