Unverified commit 4354c3cc
Authored Jan 05, 2022 by Olatunji Ruwase
Committed by GitHub, Jan 05, 2022
Fix largest param numel calculation (#1623)

Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
Parent: 74493b2b
Showing 1 changed file with 4 additions and 72 deletions

deepspeed/runtime/zero/stage3.py  +4  -72
deepspeed/runtime/zero/stage3.py @ 4354c3cc
@@ -848,8 +848,10 @@ class DeepSpeedZeroOptimizer_Stage3(object):
         #Largest partitioned param
         largest_partitioned_param_numel = max([
-            max([tensor.numel() for tensor in fp16_partitioned_group])
-            for fp16_partitioned_group in self.fp16_partitioned_groups
+            max([
+                max(tensor.numel(),
+                    tensor.ds_numel) for tensor in fp16_partitioned_group
+            ]) for fp16_partitioned_group in self.fp16_partitioned_groups
         ])
         print_rank_0(
             f'Largest partitioned param numel = {largest_partitioned_param_numel}',
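For context: the fix sizes the largest partition with max(tensor.numel(), tensor.ds_numel) instead of tensor.numel() alone, presumably because a partition's backing tensor can hold fewer elements than its logical size (for example when its data has been released or swapped out), though the commit message does not spell this out. Below is a minimal, hypothetical sketch of the old versus new calculation; FakePartition and its values are illustrative stand-ins, not DeepSpeed APIs.

import torch

class FakePartition:
    """Hypothetical stand-in for a ZeRO-3 partition slice (param.ds_tensor).

    ds_numel mimics the logical partition size DeepSpeed records on the tensor;
    the backing storage may hold fewer elements (e.g. 0 when released).
    """
    def __init__(self, stored_elems, logical_elems):
        self._t = torch.empty(stored_elems, dtype=torch.float16)
        self.ds_numel = logical_elems

    def numel(self):
        return self._t.numel()

# One group whose second partition currently stores nothing but logically
# spans 4096 elements.
fp16_partitioned_groups = [[FakePartition(2048, 2048), FakePartition(0, 4096)]]

# Old calculation: counts only materialized elements -> 2048.
old = max(
    max(t.numel() for t in group) for group in fp16_partitioned_groups)

# Fixed calculation: also considers the logical size -> 4096.
new = max(
    max(max(t.numel(), t.ds_numel) for t in group)
    for group in fp16_partitioned_groups)

print(old, new)  # 2048 4096

Under that assumption, any buffer sized from this value is large enough even when some partitions are not currently materialized.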
@@ -982,76 +984,6 @@ class DeepSpeedZeroOptimizer_Stage3(object):
                                                          dtype=torch.float32,
                                                          timers=self.timers)
 
-    def _create_fp16_partitions(self):
-        dist.barrier()
-        partition_id = dist.get_rank(group=self.dp_process_group)
-
-        # loop to deal with groups
-        for j, param_group in enumerate(self.optimizer.param_groups):
-
-            sub_groups = self._create_fp16_sub_groups(param_group['params'])
-            for sub_group in sub_groups:
-                i = len(self.fp16_groups)
-
-                # push this group to list before modify
-                self.fp16_groups.append(sub_group)
-                self.sub_group_to_group_id[i] = j
-
-                #These are the list of the partitioned parameters
-                self.fp16_partitioned_groups.append(
-                    [param.ds_tensor for param in self.fp16_groups[i]])
-
-                print_rank_0(
-                    f"fp16 group {i} partitioned_param norms : {[param.ds_tensor.norm().item() for param in self.fp16_groups[i]]}"
-                )
-
-                # Record padding required to align group to world size (only applies to last rank)
-                if partition_id == dist.get_world_size(
-                        group=self.dp_process_group) - 1:
-                    padding = [p.padding_size() for p in self.fp16_groups[i]]
-                else:
-                    padding = [0] * len(self.fp16_groups[i])
-                self.groups_padding.append(padding)
-
-                #not sure why apex was cloning the weights before flattening
-                #removing cloning here
-                see_memory_usage(f"Before Flattening param group {i}", force=False)
-
-                if not self.offload_param:
-                    see_memory_usage(f"Before moving param group {i} to CPU",
-                                     force=False)
-                    #move all the parameters to cpu to free up GPU space for creating flat buffer
-                    move_to_cpu(self.fp16_partitioned_groups[i])
-                    see_memory_usage(f"After moving param group {i} to CPU",
-                                     force=False)
-
-                    #create flat buffer in CPU and move to GPU
-                    self.fp16_partitioned_groups_flat.append(
-                        self.flatten_dense_tensors_aligned(
-                            self.fp16_partitioned_groups[i],
-                            dist.get_world_size(group=self.dp_process_group)).cuda(
-                                torch.cuda.current_device()))
-                    see_memory_usage(
-                        f"After flattening and moving param group {i} to GPU",
-                        force=False)
-                else:
-                    #Without the detach, seems like the flattening becomes part of the
-                    #model graph causing errors downstream
-                    self.fp16_partitioned_groups_flat.append(
-                        self.flatten_dense_tensors_aligned(
-                            self.fp16_partitioned_groups[i],
-                            dist.get_world_size(
-                                group=self.dp_process_group)).detach().pin_memory())
-                see_memory_usage(f"After Flattening param group {i}", force=False)
-
-                see_memory_usage(f"After Flattening param group {i}", force=False)
-
-                #set model fp16 weight to slices of flattened buffer
-                updated_params = self.unflatten(self.fp16_partitioned_groups_flat[i],
-                                                self.fp16_partitioned_groups[i])
-
-                for partitioned_param, q in zip(self.fp16_partitioned_groups[i],
-                                                updated_params):
-                    partitioned_param.data = q.data
-
     def _move_to_flat_buffer(self, param_list, flat_buffer, avoid_copy=False):
         '''If flat buffer is None then the parameters in the param_list are
         not copied to the flat buffer. This is because they excede the number of max_params_in_cpu