Commit f5cce75e (unverified)
Authored Sep 16, 2020 by Shaden Smith; committed via GitHub on Sep 16, 2020
Overflow fix (#416)

* Switches fused_optimizer overflow calculation

Parent: 7d91be97
Showing 1 changed file with 24 additions and 19 deletions (+24 / -19)
deepspeed/runtime/fp16/fused_optimizer.py (+24 / -19)
@@ -11,7 +11,7 @@ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 from deepspeed.runtime.utils import get_grad_norm, CheckOverflow, get_weight_norm
 from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE
-from deepspeed.utils import logger
+from deepspeed.utils import logger, log_dist


 class FP16_Optimizer(object):
@@ -204,9 +204,30 @@ class FP16_Optimizer(object):
         UPDATE_FP16 = 'update_fp16'
         STEP_TIMERS = OVERFLOW_TIMERS + [UNSCALE_AND_CLIP, BASIC_STEP, UPDATE_FP16]

-        # First compute norm for all group so we know if there is overflow
-        grads_groups_flat = []
+        # First determine if there is overflow.
+        self.start_timers([OVERFLOW_CHECK])
+        fp16_params = []
+        for i, group in enumerate(self.fp16_groups):
+            fp16_params.extend([p for p in group if p.grad is not None])
+        self.overflow = self.overflow_checker.has_overflow(fp16_params)
+        self.stop_timers([OVERFLOW_CHECK])
+        prev_scale = self.cur_scale
+        self._update_scale(self.overflow)
+        if self.overflow:
+            if self.verbose:
+                log_dist("Overflow detected. Skipping step. Attempted loss "
+                         f"scale: {prev_scale}, reducing to {self.cur_scale}",
+                         ranks=[0])
+            # Clear gradients
+            for i, group in enumerate(self.fp16_groups):
+                for p in group:
+                    p.grad = None
+
+            self.log_timers(OVERFLOW_TIMERS)
+            return self.overflow
+
+        grads_groups_flat = []
         for i, group in enumerate(self.fp16_groups):
             data_type = self.fp32_groups_flat[i].dtype
@@ -227,22 +248,6 @@ class FP16_Optimizer(object):
         all_groups_norm = get_grad_norm(self.fp32_groups_flat, mpu=self.mpu)
         self.stop_timers([COMPUTE_NORM])

-        self.start_timers([OVERFLOW_CHECK])
-        self.overflow = self.overflow_checker.check_using_norm([all_groups_norm])
-        self.stop_timers([OVERFLOW_CHECK])
-
-        prev_scale = self.cur_scale
-        self._update_scale(self.overflow)
-        if self.overflow:
-            if self.verbose:
-                print("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
-                      "scale: {}, reducing to {} ".format(prev_scale,
-                                                          self.cur_scale))
-            self.log_timers(OVERFLOW_TIMERS)
-            grads_groups_flat = None
-            return self.overflow
-
         self.start_timers([UNSCALE_AND_CLIP])
         self.unscale_and_clip_grads(grads_groups_flat, [all_groups_norm])
         self.stop_timers([UNSCALE_AND_CLIP])
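In short: before this commit, step() flattened the fp16 gradients, computed the all-group gradient norm, and inferred overflow from that norm via overflow_checker.check_using_norm, so an overflowed step still paid for the flattening and norm computation before bailing out. After this commit, step() first collects every fp16 parameter that actually received a gradient and asks overflow_checker.has_overflow directly; on overflow it logs once on rank 0 via log_dist, clears the gradients, and returns before any flattening or norm computation happens.

The sketch below illustrates that kind of per-tensor inf/NaN check in isolation. It is a minimal standalone approximation, not DeepSpeed's CheckOverflow class (which, among other things, coordinates the check across parallel ranks); the helper name has_overflow_sketch and the toy parameters are made up for this example.

import torch


def has_overflow_sketch(params):
    # A step counts as overflowed if any gradient contains an inf or NaN.
    # Reducing each gradient to a single scalar keeps the test cheap.
    for p in params:
        if p.grad is None:
            continue
        total = p.grad.float().sum()
        if torch.isinf(total) or torch.isnan(total):
            return True
    return False


# Mirroring the new step() logic: collect the fp16 parameters that actually
# received gradients, then run the check before any gradient flattening.
p_ok = torch.nn.Parameter(torch.zeros(4, dtype=torch.float16))
p_bad = torch.nn.Parameter(torch.zeros(4, dtype=torch.float16))
p_ok.grad = torch.ones(4, dtype=torch.float16)
p_bad.grad = torch.tensor([1.0, float("inf"), 0.0, 0.0], dtype=torch.float16)

fp16_params = [p for p in (p_ok, p_bad) if p.grad is not None]
print(has_overflow_sketch(fp16_params))  # True: p_bad's gradient contains inf

Checking the raw fp16 gradients up front means an overflowed iteration can be skipped cheaply: the gradients are dropped and the loss scale reduced without building the flattened gradient buffers or computing norms, work the previous path always performed before its overflow test.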