Commit a1f8ea54 (unverified)
Authored by wuzewu on Mar 05, 2020; committed via GitHub on Mar 05, 2020
strategy support module v2 params_layer (#343)
* strategy support layer_params
Parent: 2ef7c1e9
Showing 1 changed file with 153 additions and 59 deletions.

paddlehub/finetune/strategy.py  +153  -59
paddlehub/finetune/strategy.py @ a1f8ea54
@@ -202,9 +202,13 @@ class CombinedStrategy(DefaultStrategy):
             "noam_decay": False,
             "discriminative": {
                 "blocks": 0,
+                "params_layer": None,
                 "factor": 2.6
             },
-            "gradual_unfreeze": 0,
+            "gradual_unfreeze": {
+                "blocks": 0,
+                "params_layer": None,
+            },
             "slanted_triangle": {
                 "cut_fraction": 0.0,
                 "ratio": 32
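In the new defaults, `params_layer` is expected to be a dict mapping parameter names to a layer index (a higher index means closer to the output). A minimal sketch of a scheduler override that uses it — the parameter names below are hypothetical; a real mapping would use the names found in the fine-tuned program:

    # Hypothetical parameter names; list the real ones via
    # program.global_block().iter_parameters() in an actual run.
    params_layer = {
        "embedding_0.w_0": 0,          # lowest layer
        "encoder_layer_0_fc.w_0": 1,
        "encoder_layer_1_fc.w_0": 2,
        "cls_out_w": 3,                # highest layer
    }

    scheduler = {
        "discriminative": {"params_layer": params_layer, "factor": 2.6},
        "gradual_unfreeze": {"params_layer": params_layer},
    }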
@@ -234,6 +238,38 @@ class CombinedStrategy(DefaultStrategy):
             for name in clip:
                 self.check_assign(self.clip, name, clip[name])

+        # resolve the conflict
+        if self.scheduler["discriminative"]["params_layer"] and self.scheduler[
+                "discriminative"]["blocks"]:
+            logger.warning(
+                "Both params_layer and blocks have been set in discriminative, only params_layer will take effect"
+            )
+            self.scheduler["discriminative"]["blocks"] = 0
+
+        if self.scheduler["gradual_unfreeze"]["params_layer"] and self.scheduler[
+                "gradual_unfreeze"]["blocks"]:
+            logger.warning(
+                "Both params_layer and blocks have been set in gradual_unfreeze, only params_layer will take effect"
+            )
+            self.scheduler["gradual_unfreeze"]["blocks"] = 0
+
+        if self.scheduler["slanted_triangle"]["cut_fraction"] and (
+                self.scheduler["warmup"] or self.scheduler["noam_decay"]
+                or self.scheduler["linear_decay"]["start_point"] < 1):
+            logger.warning(
+                "You are using slanted_triangle learning rate strategy, "
+                "which will make warmup, noam_decay and linear_decay useless")
+            self.scheduler["warmup"] = 0.0
+            self.scheduler["noam_decay"] = False
+            self.scheduler["linear_decay"]["start_point"] = 1
+
+        if self.scheduler["noam_decay"] and self.scheduler["linear_decay"][
+                "start_point"]:
+            logger.warning(
+                "Both noam_decay and linear_decay have been set, only noam_decay will take effect"
+            )
+            self.scheduler["linear_decay"]["start_point"] = 1
+
         self.epoch = 0
         self.main_program = None
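The resolution rule added above is simply "params_layer wins": whenever both granularities are configured for the same strategy, the block count is zeroed and only the per-parameter mapping is used. A plain-dict illustration of the outcome (not PaddleHub code):

    # Illustration of the precedence rule only.
    discriminative = {"blocks": 3, "params_layer": {"cls_out_w": 1}, "factor": 2.6}

    if discriminative["params_layer"] and discriminative["blocks"]:
        discriminative["blocks"] = 0   # only params_layer takes effect

    print(discriminative["blocks"])    # 0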
@@ -244,9 +280,9 @@ class CombinedStrategy(DefaultStrategy):
             sub_dict = dictionary[key]
             for sub_name in value:
                 self.check_assign(sub_dict, sub_name, value[sub_name])
         elif isinstance(dictionary[key],
                         type(value)) or (
                             isinstance(dictionary[key], float)
-                            and isinstance(value, (float, int))):
+                            and isinstance(value, (float, int))) or dictionary[key] == None:
             dictionary[key] = value
         else:
             if isinstance(dictionary[key], dict):
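The extra `or dictionary[key] == None` branch is what lets the new `params_layer` defaults, which are `None`, be overwritten by a user-supplied dict; the old type check would have rejected that assignment. A standalone sketch of the relaxed rule:

    def can_assign(current, new):
        # Same type, an int for a float default, or any value for a None default.
        return (isinstance(current, type(new))
                or (isinstance(current, float) and isinstance(new, (float, int)))
                or current is None)

    print(can_assign(0, 5))                    # True: same type
    print(can_assign(2.6, 3))                  # True: int accepted for a float default
    print(can_assign(None, {"cls_out_w": 1}))  # True: None default takes a params_layer dict
    print(can_assign(0, {"cls_out_w": 1}))     # False: a dict cannot replace an int default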
@@ -283,52 +319,49 @@ class CombinedStrategy(DefaultStrategy):
             persistable=True,
             name="learning_rate")

-        if not self.scheduler["slanted_triangle"]["cut_fraction"]:
-            warmup_steps = int(max_train_steps * self.scheduler["warmup"])
-            linear_decay_start = int(
-                max_train_steps *
-                self.scheduler["linear_decay"]["start_point"])
-            if linear_decay_start < warmup_steps:
-                logger.warning(
-                    "linear decay can not start during warmup process,"
-                    "it will start after warmup ends!")
-                linear_decay_start = warmup_steps
-            if self.scheduler["noam_decay"]:
-                if warmup_steps > 0:
-                    scheduled_lr = fluid.layers.learning_rate_scheduler \
-                        .noam_decay(1 / (warmup_steps * (self.learning_rate ** 2)),
-                                    warmup_steps)
-                else:
-                    logger.warning(
-                        "Noam decay learning rate scheduler should have positive \
-                        warmup steps, using constant learning rate instead!")
-            if not self.scheduler["noam_decay"] and \
-                    (warmup_steps > 0 or self.scheduler["linear_decay"]["start_point"] < 1):
-                with self.main_program._lr_schedule_guard():
-                    global_step = lr_scheduler._decay_step_counter()
-                    with control_flow.Switch() as switch:
-                        if warmup_steps > 0:
-                            with switch.case(global_step < warmup_steps):
-                                decayed_lr = self.learning_rate * global_step * 1.0 / warmup_steps
-                                fluid.layers.assign(decayed_lr, scheduled_lr)
-                        if self.scheduler["linear_decay"]["start_point"] < 1:
-                            with switch.case(global_step >= linear_decay_start):
-                                decayed_lr = lr_scheduler.polynomial_decay(
-                                    learning_rate=self.learning_rate,
-                                    decay_steps=max_train_steps,
-                                    end_learning_rate=self.scheduler[
-                                        "linear_decay"]["end_learning_rate"],
-                                    power=1.0,
-                                    cycle=False)
-                                fluid.layers.assign(decayed_lr, scheduled_lr)
-        else:
-            if self.scheduler["warmup"] or self.scheduler["noam_decay"] or self.scheduler["linear_decay"]["start_point"] < 1:
-                logger.warning(
-                    "You are using slanted_triangle learning rate "
-                    "which will make warmup, noam_decay and linear_decay unable")
+        warmup_steps = int(max_train_steps * self.scheduler["warmup"])
+
+        # noam_decay (based on warmup)
+        if self.scheduler["noam_decay"]:
+            if warmup_steps > 0:
+                scheduled_lr = fluid.layers.learning_rate_scheduler \
+                    .noam_decay(1 / (warmup_steps * (self.learning_rate ** 2)),
+                                warmup_steps)
+            else:
+                logger.warning(
+                    "Noam decay learning rate scheduler should have positive \
+                    warmup steps, using constant learning rate instead!")
+
+        # warmup, linear_decay
+        if warmup_steps > 0 or self.scheduler["linear_decay"]["start_point"] < 1:
+            with self.main_program._lr_schedule_guard():
+                global_step = lr_scheduler._decay_step_counter()
+                with control_flow.Switch() as switch:
+                    if warmup_steps > 0:
+                        with switch.case(global_step < warmup_steps):
+                            decayed_lr = self.learning_rate * global_step * 1.0 / warmup_steps
+                            fluid.layers.assign(decayed_lr, scheduled_lr)
+                    if self.scheduler["linear_decay"]["start_point"] < 1:
+                        linear_decay_start = int(
+                            max_train_steps *
+                            self.scheduler["linear_decay"]["start_point"])
+                        if linear_decay_start < warmup_steps:
+                            logger.warning(
+                                "linear decay can not start during warmup process,"
+                                "it will start after warmup ends!")
+                            linear_decay_start = warmup_steps
+                        with switch.case(global_step >= linear_decay_start):
+                            decayed_lr = lr_scheduler.polynomial_decay(
+                                learning_rate=self.learning_rate,
+                                decay_steps=max_train_steps,
+                                end_learning_rate=self.scheduler["linear_decay"]
+                                ["end_learning_rate"],
+                                power=1.0,
+                                cycle=False)
+                            fluid.layers.assign(decayed_lr, scheduled_lr)
+
+        # slanted_triangle
+        if self.scheduler["slanted_triangle"]["cut_fraction"]:
             cut_step = int(max_train_steps *
                            self.scheduler["slanted_triangle"]["cut_fraction"])
             ratio = self.scheduler["slanted_triangle"]["ratio"]
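After the restructuring, the Switch block encodes a piecewise schedule: a linear warmup up to `warmup_steps`, a constant stretch, and then (when `start_point` < 1) a linear decay toward `end_learning_rate` at `max_train_steps` (polynomial_decay with power=1.0 is a straight line). A plain-Python sketch of just the warmup + linear_decay branch, useful for eyeballing the curve — the numbers are arbitrary and noam decay and the slanted triangle are ignored:

    def sketch_lr(step, base_lr=5e-5, max_train_steps=1000,
                  warmup=0.1, start_point=0.5, end_learning_rate=0.0):
        # Mirrors only the warmup + linear_decay switch cases, as an illustration.
        warmup_steps = int(max_train_steps * warmup)
        decay_start = max(int(max_train_steps * start_point), warmup_steps)
        if warmup_steps > 0 and step < warmup_steps:
            return base_lr * step / warmup_steps                      # linear warmup
        if start_point < 1 and step >= decay_start:
            frac = min(step, max_train_steps) / max_train_steps
            return (base_lr - end_learning_rate) * (1 - frac) + end_learning_rate
        return base_lr                                                # constant in between

    for s in (0, 50, 100, 400, 500, 750, 1000):
        print(s, sketch_lr(s))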
@@ -346,9 +379,25 @@ class CombinedStrategy(DefaultStrategy):
                            (ratio - 1)) / ratio
                 fluid.layers.assign(decayed_lr, scheduled_lr)

+        # set optimizer
         super(CombinedStrategy, self).__init__(
             optimizer_name=self._optimizer_name, learning_rate=scheduled_lr)

+        # discriminative learning rate
+        # based on layer
+        if self.scheduler["discriminative"]["params_layer"]:
+            max_layer = max(
+                self.scheduler["discriminative"]["params_layer"].values())
+            for param in self.main_program.global_block().iter_parameters():
+                if param.name in self.scheduler["discriminative"]["params_layer"]:
+                    param_layer = self.scheduler["discriminative"][
+                        "params_layer"][param.name]
+                    param.optimize_attr["learning_rate"] *= pow(
+                        1.0 / self.scheduler["discriminative"]["factor"],
+                        max_layer - param_layer)
+
+        # based on blocks
         if self.scheduler["discriminative"]["blocks"]:
             _block_layers = math.ceil(
                 len(self.sorted_depth) /
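The per-parameter multiplier follows the ULMFiT discriminative fine-tuning rule: a parameter at layer l has its learning rate scaled by (1 / factor) ** (max_layer - l), so the topmost layer keeps the base rate and each layer below it is divided by `factor` once more. A quick check of the multipliers with the default factor of 2.6 (layer names are hypothetical):

    factor = 2.6
    params_layer = {"embedding_0.w_0": 0, "encoder_0_fc.w_0": 1, "cls_out_w": 2}  # hypothetical
    max_layer = max(params_layer.values())

    for name, layer in params_layer.items():
        multiplier = pow(1.0 / factor, max_layer - layer)
        print(name, round(multiplier, 4))
    # embedding_0.w_0  0.1479   (1 / 2.6 ** 2)
    # encoder_0_fc.w_0 0.3846   (1 / 2.6)
    # cls_out_w        1.0      (base learning rate)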
@@ -426,13 +475,12 @@ class CombinedStrategy(DefaultStrategy):
                 pass

-        if self.scheduler["discriminative"]["blocks"] > 0 or self.scheduler[
-                "gradual_unfreeze"] > 0:
+        if self.scheduler["discriminative"]["blocks"] > 0 or self.scheduler[
+                "gradual_unfreeze"]["blocks"] > 0:
             self.depth_params_dict = get_depth_parameter(self.main_program)
             self.sorted_depth = sorted(
                 self.depth_params_dict.keys(), reverse=True)
             self.max_depth = len(self.sorted_depth)

-        logger.info(self.__str__())
-
         # handle scheduler
         scheduled_lr = self.scheduler_handler(max_train_steps)
@@ -442,6 +490,8 @@ class CombinedStrategy(DefaultStrategy):
         # handle regularization
         self.regularization_handler(loss, scheduled_lr)

+        logger.info(self.__str__())
+
         return scheduled_lr, max_train_steps

     def exclude_from_weight_decay(self, name):
@@ -454,25 +504,64 @@ class CombinedStrategy(DefaultStrategy):
         return False

     def step(self):
-        if self.scheduler["gradual_unfreeze"] > 0:
+        if self.scheduler["gradual_unfreeze"]["blocks"] > 0:
             self.epoch += 1
             if self.max_depth > 0 and self.epoch <= self.scheduler[
-                    "gradual_unfreeze"]:
+                    "gradual_unfreeze"]["blocks"]:
                 set_gradual_unfreeze(
                     self.main_program,
                     unfreeze_depths=self.
                     sorted_depth[:self.max_depth * self.epoch //
-                                 self.scheduler["gradual_unfreeze"]])
+                                 self.scheduler["gradual_unfreeze"]["blocks"]])
             else:
                 logger.warning(
                     "The max op-depth in the network is %s. That results in that can't use the gradual unfreeze finetune strategy."
                     % (self.max_depth))
+        elif self.scheduler["gradual_unfreeze"]["params_layer"]:
+            max_layer = max(
+                self.scheduler["gradual_unfreeze"]["params_layer"].values())
+            if self.epoch <= max_layer:
+                for param in self.main_program.global_block().iter_parameters():
+                    if param.name in self.scheduler["gradual_unfreeze"][
+                            "params_layer"]:
+                        param_layer = self.scheduler["gradual_unfreeze"][
+                            "params_layer"][param.name]
+                        if param_layer >= max_layer - self.epoch:
+                            param.stop_gradient = False
+                        else:
+                            param.stop_gradient = True
+            self.epoch += 1
         else:
             pass

     def __str__(self):
-        return "Strategy with scheduler: %s, regularization: %s and clip: %s" % (
-            self.scheduler, self.regularization, self.clip)
+        strategy_name = ""
+        strategy_name += "warmup, " if self.scheduler["warmup"] else ""
+        strategy_name += "linear decay, " if self.scheduler["linear_decay"][
+            "start_point"] < 1 else ""
+        strategy_name += "noam decay, " if self.scheduler["noam_decay"] else ""
+        strategy_name += "discriminative learning rate, " if self.scheduler[
+            "discriminative"]["blocks"] or self.scheduler["discriminative"][
+                "params_layer"] else ""
+        strategy_name += "gradual unfreeze, " if self.scheduler[
+            "gradual_unfreeze"]["blocks"] or self.scheduler["gradual_unfreeze"][
+                "params_layer"] else ""
+        strategy_name += "slanted triangle learning rate, " if self.scheduler[
+            "slanted_triangle"] else ""
+        strategy_name += "L2 regularization, " if self.regularization["L2"] else ""
+        strategy_name += "L2SP regularization, " if self.regularization["L2SP"] else ""
+        strategy_name += "weight decay regularization, " if self.regularization[
+            "weight_decay"] else ""
+        strategy_name += "GlobalNorm clip, " if self.clip["GlobalNorm"] else ""
+        strategy_name += "Norm clip, " if self.clip["Norm"] else ""
+        return "Strategy with %s" % (strategy_name)


 class AdamWeightDecayStrategy(CombinedStrategy):
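With `params_layer`, `step()` unfreezes the network from the top down: at epoch counter value e, every parameter whose layer index is at least max_layer - e gets `stop_gradient = False`, and everything below stays frozen; one more layer group thaws each epoch until the whole network is trainable. A small dry run of that rule with hypothetical layers:

    params_layer = {"embedding": 0, "encoder_0": 1, "encoder_1": 2, "cls_fc": 3}  # hypothetical
    max_layer = max(params_layer.values())

    for epoch in range(max_layer + 1):
        trainable = [name for name, layer in params_layer.items()
                     if layer >= max_layer - epoch]
        print(epoch, trainable)
    # 0 ['cls_fc']
    # 1 ['encoder_1', 'cls_fc']
    # 2 ['encoder_0', 'encoder_1', 'cls_fc']
    # 3 ['embedding', 'encoder_0', 'encoder_1', 'cls_fc']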
@@ -544,17 +633,22 @@ class ULMFiTStrategy(CombinedStrategy):
                  ratio=32,
                  dis_blocks=3,
                  factor=2.6,
-                 frz_blocks=3):
+                 frz_blocks=3,
+                 params_layer=None):
         scheduler = {
             "slanted_triangle": {
                 "cut_fraction": cut_fraction,
                 "ratio": ratio
             },
-            "gradual_unfreeze": frz_blocks,
+            "gradual_unfreeze": {
+                "blocks": frz_blocks,
+                "params_layer": params_layer
+            },
             "discriminative": {
                 "blocks": dis_blocks,
-                "factor": factor
+                "factor": factor,
+                "params_layer": params_layer
             }
         }
         regularization = {}
...
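At the user level, the new argument means a ULMFiT-style strategy can be driven by an explicit parameter-to-layer mapping instead of the automatically derived blocks. A minimal sketch, assuming a PaddleHub 1.x fine-tuning setup; the parameter names are hypothetical and would have to match the program being fine-tuned:

    import paddlehub as hub

    # Hypothetical mapping from parameter names to layer indices (0 = lowest layer).
    params_layer = {
        "embedding_0.w_0": 0,
        "encoder_layer_0_fc.w_0": 1,
        "encoder_layer_1_fc.w_0": 2,
        "cls_out_w": 3,
    }

    strategy = hub.ULMFiTStrategy(
        learning_rate=5e-5,
        cut_fraction=0.1,            # slanted triangular learning rate
        ratio=32,
        factor=2.6,                  # discriminative learning rate decay between layers
        params_layer=params_layer)   # used instead of dis_blocks / frz_blocks

    config = hub.RunConfig(num_epoch=3, strategy=strategy)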