Commit 0e942df0 (unverified)
Authored Sep 16, 2020 by Olatunji Ruwase; committed via GitHub on Sep 16, 2020
Add Linear warmup+decay lr schedule (#414)
Update lr schedule unit tests
Parent: a9e8325f

Showing 2 changed files with 210 additions and 8 deletions (+210 −8):

- deepspeed/runtime/lr_schedules.py (+55 −1)
- tests/unit/test_lr_schedulers.py (+155 −7)
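Taken together with the unit test added below, the new schedule is selected through the standard DeepSpeed `scheduler` config block. A minimal sketch (the parameter values here are illustrative, not from the commit; the key names mirror the string constants this commit defines in lr_schedules.py):

```python
# Sketch of a DeepSpeed config selecting the new WarmupDecayLR schedule.
# Values are illustrative; key names mirror the constants in lr_schedules.py.
config_dict = {
    "train_batch_size": 2,
    "optimizer": {
        "type": "Adam",
        "params": {"lr": 0.00015}
    },
    "scheduler": {
        "type": "WarmupDecayLR",      # WARMUP_DECAY_LR
        "params": {
            "warmup_min_lr": 0.1,     # WARMUP_MIN_LR
            "warmup_max_lr": 0.2,     # WARMUP_MAX_LR
            "warmup_num_steps": 10,   # WARMUP_NUM_STEPS
            "total_num_steps": 20     # TOTAL_NUM_STEPS (new in this commit)
        }
    }
}
```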
deepspeed/runtime/lr_schedules.py (+55 −1)

```diff
@@ -19,7 +19,8 @@ LR_SCHEDULE = 'lr_schedule'
 LR_RANGE_TEST = 'LRRangeTest'
 ONE_CYCLE = 'OneCycle'
 WARMUP_LR = 'WarmupLR'
-VALID_LR_SCHEDULES = [LR_RANGE_TEST, ONE_CYCLE, WARMUP_LR]
+WARMUP_DECAY_LR = 'WarmupDecayLR'
+VALID_LR_SCHEDULES = [LR_RANGE_TEST, ONE_CYCLE, WARMUP_LR, WARMUP_DECAY_LR]
 
 LR_RANGE_TEST_MIN_LR = 'lr_range_test_min_lr'
 LR_RANGE_TEST_STEP_RATE = 'lr_range_test_step_rate'
@@ -47,6 +48,8 @@ WARMUP_MIN_LR = 'warmup_min_lr'
 WARMUP_MAX_LR = 'warmup_max_lr'
 WARMUP_NUM_STEPS = 'warmup_num_steps'
+TOTAL_NUM_STEPS = 'total_num_steps'
+
 
 def add_tuning_arguments(parser):
     group = parser.add_argument_group('Convergence Tuning',
@@ -714,3 +717,54 @@ class WarmupLR(object):
                     FileNotFoundError(param_value)))
             return list(param_value)
         return [param_value] * len(optimizer.param_groups)
+
+
+class WarmupDecayLR(WarmupLR):
+    """Increase the learning rate of each parameter group from min lr to max lr
+    over warmup_num_steps steps, and then decay at linear rate over the remaining training steps.
+
+    Args:
+        optimizer (Optimizer): Wrapped optimizer.
+        total_num_steps (int): total number of training steps
+        warmup_min_lr (float or list): minimum learning rate. Default: 0
+        warmup_max_lr (float or list): maximum learning rate. Default: 0.001
+        warmup_num_steps (int): number of steps to warm up from min_lr to max_lr. Default: 1000
+        last_batch_iteration (int): The index of the last batch. Default: -1.
+
+    Example:
+        >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
+        >>> scheduler = WarmupDecayLR(optimizer, 1000000)
+        >>> data_loader = torch.utils.data.DataLoader(...)
+        >>> for epoch in range(10):
+        >>>     for batch in data_loader:
+        >>>         train_batch(...)
+        >>>         scheduler.step()
+    """
+    def __init__(self,
+                 optimizer: Optimizer,
+                 total_num_steps: int,
+                 warmup_min_lr: float = 0.0,
+                 warmup_max_lr: float = 0.001,
+                 warmup_num_steps: int = 1000,
+                 last_batch_iteration: int = -1):
+        self.total_num_steps = total_num_steps
+        super(WarmupDecayLR, self).__init__(optimizer,
+                                            warmup_min_lr,
+                                            warmup_max_lr,
+                                            warmup_num_steps,
+                                            last_batch_iteration)
+        if self.total_num_steps < self.warmup_num_steps:
+            logger.warning('total_num_steps {} is less than warmup_num_steps {}'.format(
+                total_num_steps, warmup_num_steps))
+
+    def _get_gamma(self):
+        if self.last_batch_iteration < self.warmup_num_steps:
+            return self.inverse_log_warm_up * math.log(self.last_batch_iteration + 1)
+        return max(
+            0.0,
+            float(self.total_num_steps - self.last_batch_iteration) /
+            float(max(1.0, self.total_num_steps - self.warmup_num_steps)))
```
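The two-phase shape of `_get_gamma` is easiest to see numerically: a log-shaped climb to 1.0 over the warmup steps, then a straight line back to 0.0 at `total_num_steps`. A minimal standalone sketch of that curve, assuming `inverse_log_warm_up = 1 / log(warmup_num_steps)` as `WarmupLR` defines it (that definition is outside this diff, so treat it as an assumption):

```python
import math

# Standalone reproduction of WarmupDecayLR._get_gamma with illustrative values.
total_num_steps = 20
warmup_num_steps = 10
inverse_log_warm_up = 1.0 / math.log(warmup_num_steps)  # assumed WarmupLR definition

def gamma(last_batch_iteration):
    if last_batch_iteration < warmup_num_steps:
        # Warmup phase: logarithmic rise from 0.0 toward 1.0.
        return inverse_log_warm_up * math.log(last_batch_iteration + 1)
    # Decay phase: linear fall from 1.0 at warmup_num_steps to 0.0 at total_num_steps.
    return max(0.0,
               float(total_num_steps - last_batch_iteration) /
               float(max(1.0, total_num_steps - warmup_num_steps)))

print([round(gamma(i), 3) for i in range(total_num_steps + 1)])
# Reaches 1.0 at step 10, then decays linearly: 1.0, 0.9, 0.8, ..., 0.0
```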
tests/unit/test_lr_schedulers.py (+155 −7, file mode 100644 → 100755)

```diff
@@ -6,17 +6,25 @@ import json
 import os
 from common import distributed_test
 from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict
+from deepspeed.runtime.lr_schedules import LR_RANGE_TEST, ONE_CYCLE, WARMUP_LR, WARMUP_DECAY_LR
+from deepspeed.runtime.lr_schedules import WARMUP_MIN_LR, WARMUP_MAX_LR, WARMUP_NUM_STEPS, TOTAL_NUM_STEPS
+from deepspeed.runtime.lr_schedules import CYCLE_MIN_LR, CYCLE_MAX_LR
 
 
 @pytest.mark.parametrize("scheduler_type,params",
-                         [("WarmupLR", {}),
-                          ("OneCycle", {'cycle_min_lr': 0, 'cycle_max_lr': 0}),
-                          ("LRRangeTest", {})])
+                         [(WARMUP_LR, {}),
+                          (WARMUP_DECAY_LR, {WARMUP_NUM_STEPS: 10, TOTAL_NUM_STEPS: 20}),
+                          (ONE_CYCLE, {CYCLE_MIN_LR: 0, CYCLE_MAX_LR: 0}),
+                          (LR_RANGE_TEST, {})])
 def test_get_lr_before_train(tmpdir, scheduler_type, params):
     config_dict = {
@@ -42,8 +50,8 @@ def test_get_lr_before_train(tmpdir, scheduler_type, params):
     @distributed_test(world_size=[1])
     def _test_get_lr_before_train(args, model, hidden_dim):
         model, _, _, lr_scheduler = deepspeed.initialize(args=args,
                                                          model=model,
                                                          model_parameters=model.parameters())
         data_loader = random_dataloader(model=model,
                                         total_samples=50,
                                         hidden_dim=hidden_dim,
@@ -57,3 +65,143 @@ def test_get_lr_before_train(tmpdir, scheduler_type, params):
             model.step()
 
     _test_get_lr_before_train(args=args, model=model, hidden_dim=hidden_dim)
+
+
+@pytest.mark.parametrize("warmup_num_steps", [10, 15, 19, 33])
+def test_lr_warmup_schedule(tmpdir, warmup_num_steps):
+    config_dict = {
+        "train_batch_size": 2,
+        "steps_per_print": 1,
+        "optimizer": {
+            "type": "Adam",
+            "params": {
+                "lr": 0.00015
+            },
+        },
+        "scheduler": {
+            "type": WARMUP_LR,
+            "params": {
+                WARMUP_MIN_LR: 0.1,
+                WARMUP_MAX_LR: 0.2,
+                WARMUP_NUM_STEPS: warmup_num_steps
+            }
+        },
+        "gradient_clipping": 1.0
+    }
+    total_num_steps = 2 * warmup_num_steps
+    args = args_from_dict(tmpdir, config_dict)
+    hidden_dim = 10
+
+    model = SimpleModel(hidden_dim, empty_grad=False)
+
+    @distributed_test(world_size=[1])
+    def _test_lr_warmup_schedule(args, model, hidden_dim, schedule_params, num_steps):
+        model, _, _, lr_scheduler = deepspeed.initialize(args=args,
+                                                         model=model,
+                                                         model_parameters=model.parameters())
+        data_loader = random_dataloader(model=model,
+                                        total_samples=num_steps * 2,
+                                        hidden_dim=hidden_dim,
+                                        device=model.device,
+                                        dtype=torch.float)
+        step_lrs = []
+        for n, batch in enumerate(data_loader):
+            loss = model(batch[0], batch[1])
+            model.backward(loss)
+            model.step()
+            step_lrs.append(lr_scheduler.get_lr())
+
+        # Verify initial lr
+        assert step_lrs[0] == [schedule_params[WARMUP_MIN_LR]]
+
+        # Verify warmup completion
+        warmup_num_steps = schedule_params[WARMUP_NUM_STEPS]
+        warmup_max_lr = [schedule_params[WARMUP_MAX_LR]]
+        assert step_lrs[warmup_num_steps] == warmup_max_lr
+
+        # Verify post-warmup completion
+        assert all([warmup_max_lr == lr for lr in step_lrs[warmup_num_steps:]])
+
+    _test_lr_warmup_schedule(args=args,
+                             model=model,
+                             hidden_dim=hidden_dim,
+                             schedule_params=config_dict["scheduler"]["params"],
+                             num_steps=total_num_steps)
+
+
+@pytest.mark.parametrize("warmup_num_steps", [10, 15, 19, 33])
+def test_lr_warmup_decay_schedule(tmpdir, warmup_num_steps):
+    config_dict = {
+        "train_batch_size": 2,
+        "steps_per_print": 1,
+        "optimizer": {
+            "type": "Adam",
+            "params": {
+                "lr": 0.00015
+            },
+        },
+        "scheduler": {
+            "type": WARMUP_DECAY_LR,
+            "params": {
+                WARMUP_MIN_LR: 0.1,
+                WARMUP_MAX_LR: 0.2,
+                WARMUP_NUM_STEPS: warmup_num_steps,
+                TOTAL_NUM_STEPS: warmup_num_steps * 2
+            }
+        },
+        "gradient_clipping": 1.0
+    }
+    args = args_from_dict(tmpdir, config_dict)
+    hidden_dim = 10
+
+    model = SimpleModel(hidden_dim, empty_grad=False)
+
+    @distributed_test(world_size=[1])
+    def _test_lr_warmup_decay_schedule(args, model, hidden_dim, schedule_params, num_steps):
+        model, _, _, lr_scheduler = deepspeed.initialize(args=args,
+                                                         model=model,
+                                                         model_parameters=model.parameters())
+        data_loader = random_dataloader(model=model,
+                                        total_samples=num_steps * 2,
+                                        hidden_dim=hidden_dim,
+                                        device=model.device,
+                                        dtype=torch.float)
+        step_lrs = []
+        for n, batch in enumerate(data_loader):
+            loss = model(batch[0], batch[1])
+            model.backward(loss)
+            model.step()
+            step_lrs.append(lr_scheduler.get_lr())
+
+        # Verify initial lr
+        assert step_lrs[0] == [schedule_params[WARMUP_MIN_LR]]
+
+        # Verify lr at warmup completion
+        warmup_num_steps = schedule_params[WARMUP_NUM_STEPS]
+        warmup_max_lr = [schedule_params[WARMUP_MAX_LR]]
+        assert step_lrs[warmup_num_steps] == warmup_max_lr
+
+        # Verify decay phase
+        previous_lr = warmup_max_lr
+        for lr in step_lrs[warmup_num_steps + 1:]:
+            assert lr < previous_lr
+            previous_lr = lr
+
+    schedule_params = config_dict["scheduler"]["params"]
+    total_num_steps = schedule_params[TOTAL_NUM_STEPS]
+    _test_lr_warmup_decay_schedule(args=args,
+                                   model=model,
+                                   hidden_dim=hidden_dim,
+                                   schedule_params=schedule_params,
+                                   num_steps=total_num_steps)
```
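For completeness, the docstring's usage pattern fleshed out into a runnable standalone sketch (the toy model and loop are illustrative, not from the commit; it assumes, consistent with the tests above, that the scheduler exposes `step()` and `get_lr()`):

```python
import torch
from deepspeed.runtime.lr_schedules import WarmupDecayLR

# Toy standalone use of the new scheduler, following its docstring example.
model = torch.nn.Linear(10, 10)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
scheduler = WarmupDecayLR(optimizer,
                          total_num_steps=100,
                          warmup_min_lr=0.0,
                          warmup_max_lr=0.001,
                          warmup_num_steps=10)

lrs = []
for step in range(100):
    optimizer.step()   # stand-in for a real forward/backward/step
    scheduler.step()
    lrs.append(scheduler.get_lr()[0])

# Expect lr to climb to roughly warmup_max_lr by step 10, then decay
# linearly toward warmup_min_lr as step approaches total_num_steps.
```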