Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)
Unverified commit 53e50383
Authored on May 25, 2022 by Baibaifan, committed via GitHub on May 25, 2022
[Dygraph]fix_sharding3_offload (#42955)
* fix_sharding3_offload
* fix_fp16dtype_bug
Parent: 07dab9da
Showing 2 changed files with 35 additions and 24 deletions (+35 -24)
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py  +13 -6
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py  +22 -18
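
The two files changed here implement the dygraph stage-3 sharding wrappers (parameters, gradients and optimizer states are partitioned across ranks), with an optional CPU-offload mode that this commit fixes. For context, a minimal usage sketch, assuming Paddle 2.3's group_sharded_parallel API and a collective launch via python -m paddle.distributed.launch; the model, optimizer and shapes below are placeholders, not taken from this commit:

    import paddle
    from paddle.distributed import fleet
    from paddle.distributed.sharding import group_sharded_parallel

    # Requires a distributed launch, e.g.:
    #   python -m paddle.distributed.launch --gpus 0,1 train.py
    fleet.init(is_collective=True)

    # Placeholder model and optimizer; any Layer / Optimizer pair applies.
    model = paddle.nn.Linear(1024, 1024)
    opt = paddle.optimizer.AdamW(learning_rate=1e-3,
                                 parameters=model.parameters())

    # level="p_g_os" is stage 3 (shard params, grads and optimizer states);
    # offload=True keeps fp32 master weights and optimizer states on CPU,
    # which is the code path touched by this commit. The third return value
    # is the (here unused, possibly None) GradScaler passed through.
    model, opt, _ = group_sharded_parallel(model, opt, level="p_g_os",
                                           offload=True)

    x = paddle.randn([8, 1024])
    loss = model(x).mean()
    loss.backward()
    opt.step()
    opt.clear_grad()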
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py (view file @ 53e50383)
@@ -205,7 +205,7 @@ class GroupShardedStage3(nn.Layer):
        for param in list(self._unslice_params):
            param.clear_gradient(False)
            tmp_var = param.cuda(DEV_ID)
            param._clear_data()
            if tmp_var.dtype == Type.fp32.value and param2dtype[param.name] == Type.fp16.value:
                tmp_var = paddle.cast(tmp_var, Type.fp16.value)
@@ -272,6 +272,8 @@ class GroupShardedStage3(nn.Layer):
                master_tensor = paddle.cast(param, Type.fp32.value)
                master_tensor.name = param.name
                self._optim._master_weights[param.name] = master_tensor
            if self._offload:
                param.master_weight = paddle.cast(param, Type.fp32.value).cpu()
            param2dtype[param.name] = param.dtype
            p_align = self._param2align(param)
            self._unslice_params2align[param.name] = p_align
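
The hunk above deals with fp32 master weights for unsliced parameters: an fp32 master tensor is registered with the optimizer under the parameter's name, and when offload is enabled the parameter additionally keeps an fp32 master copy in host memory. A standalone illustration of that cast-and-offload pattern, using a hypothetical make_cpu_master_weight helper that is not part of the diff:

    import paddle

    def make_cpu_master_weight(param):
        # Hypothetical helper mirroring the pattern above: build an fp32
        # master copy of an (often fp16) parameter and keep it on CPU.
        master = paddle.cast(param, 'float32').cpu()
        master.name = param.name  # keep the name aligned for later lookup
        return master

    # Usage sketch: a plain fp16 tensor stands in for a sharded parameter;
    # in the real code this would be an fp16 GPU parameter under AMP.
    p = paddle.ones([4], dtype='float16')
    p.name = 'linear_0.w_0'
    mw = make_cpu_master_weight(p)
    print(mw.dtype, mw.place)  # float32 tensor placed on CPU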
@@ -369,7 +371,6 @@ class GroupShardedStage3(nn.Layer):
            tmp_var.get_tensor().set(param_cpu.get_tensor(), core.CPUPlace())
            del tmp_var
        param.get_tensor()._set_dims(param_shape)
        param._clear_data()

        # Current rank param_storage
        if self._offload:
@@ -379,6 +380,9 @@ class GroupShardedStage3(nn.Layer):
                value=tmp_tensor,
                place=core.CPUPlace(),
                name="slice@" + param.name)
            with device_guard():
                param.master_weight = paddle.cast(param.fw_storage, Type.fp32.value)
        else:
            param.fw_storage = core.eager.Tensor(
                value=buffer._slice(start, end), name="slice@" + param.name)
@@ -389,6 +393,7 @@ class GroupShardedStage3(nn.Layer):
            master_tensor = paddle.cast(param.fw_storage, Type.fp32.value)
            master_tensor.name = param.name
            self._optim._master_weights[param.fw_storage.name] = master_tensor
        param._clear_data()

    def _register_forward_hooks(self, layer):
        """
@@ -480,9 +485,8 @@ class GroupShardedStage3(nn.Layer):
                collective.all_reduce(tensor=grad_storage.buffer, group=self._group)

        if self._offload:
            for param in list(self._unslice_params):
                tmp_var = _device2cpu(param, convert_dtype=True)
                tmp_var._share_buffer_to(param)
                del tmp_var
                param._clear_data()
                param.master_weight._share_buffer_to(param)

        for grad_storage in self._grad_storages.values():
            for p in grad_storage._params:
@@ -568,7 +572,8 @@ class GroupShardedStage3(nn.Layer):
            del self._task_flow.full_param[param.name]

            if self._offload:
                param.fw_storage = _device2cpu(param.fw_storage, True)
                param.fw_storage._clear_data()
                param.master_weight._share_buffer_to(param.fw_storage)

        return allreduce_
@@ -856,6 +861,7 @@ def _PartitionParam(param):
    if not hasattr(param, "fw_storage"):
        setattr(param, "fw_storage", None)
        setattr(param, "bw_storage", None)
        setattr(param, "master_weight", None)
        setattr(param, "status", "all")
        setattr(param, "use_count", 0)
    return param
@@ -864,6 +870,7 @@ def _PartitionParam(param):
def _UnsliceParam(param):
    if not hasattr(param, "unslice"):
        setattr(param, "unslice", True)
        setattr(param, "master_weight", None)
    return param
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py (view file @ 53e50383)
@@ -199,7 +199,7 @@ class ShardingStage3(nn.Layer):
            param.clear_gradient(False)
            param._gradient_set_empty(False)
            tmp_var = param.cuda(DEV_ID)
            param._clear()
            if tmp_var.dtype == Type.fp32.value and param2dtype[param.name] == Type.fp16.value:
                tmp_var = paddle.cast(tmp_var, Type.fp16.value)
@@ -220,19 +220,14 @@ class ShardingStage3(nn.Layer):
            self._optim._param_groups = slice_params + list(self._unslice_params)
        else:
            params_name_list = list(map(lambda p: p.name, update_list))
            fw_storage_name_list = list(map(lambda p: p.fw_storage.name, update_list))
            for param_group in self._optim._param_groups:
                p_group = []
                for p in param_group['params']:
                    if p.name in params_name_list:
                    if hasattr(p, "fw_storage"):
                        p_group.append(p.fw_storage)
                    elif p.name in fw_storage_name_list:
                        p_group.append(update_list[fw_storage_name_list.index(p.name)].fw_storage)
                    elif p in self._unslice_params:
                    else:
                        p_group.append(p)
                param_group['params'] = p_group

    def forward(self, *inputs, **kwargs):
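
The hunk above reworks how ShardingStage3 rebuilds _optim._param_groups after slicing: each entry ends up pointing at the parameter's fw_storage slice when it has one, and at the parameter itself otherwise, so the wrapped optimizer only updates what the current rank owns. A self-contained sketch of that rewiring pattern; the rewire_param_groups function and the SimpleNamespace stand-ins are illustrative, not Paddle APIs (the diff checks hasattr(p, "fw_storage"), which works there because unsliced parameters never carry that attribute):

    from types import SimpleNamespace

    def rewire_param_groups(param_groups):
        # Sketch of the logic above: prefer a parameter's fw_storage slice
        # when present, otherwise keep the parameter itself.
        for param_group in param_groups:
            p_group = []
            for p in param_group['params']:
                if getattr(p, 'fw_storage', None) is not None:
                    p_group.append(p.fw_storage)  # optimizer updates the local slice
                else:
                    p_group.append(p)             # unsliced params are updated directly
            param_group['params'] = p_group
        return param_groups

    # Illustrative inputs: one sliced and one unsliced "parameter".
    sliced = SimpleNamespace(name='w_0',
                             fw_storage=SimpleNamespace(name='slice@w_0'))
    unsliced = SimpleNamespace(name='b_0', fw_storage=None)
    groups = rewire_param_groups([{'params': [sliced, unsliced]}])
    print([p.name for p in groups[0]['params']])  # ['slice@w_0', 'b_0']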
@@ -268,6 +263,8 @@ class ShardingStage3(nn.Layer):
            if param.dtype == Type.fp16.value and not self._offload:
                self._optim._master_weights[param.name] = paddle.cast(param, Type.fp32.value)
            if self._offload:
                param.master_weight = paddle.cast(param, Type.fp32.value).cpu()
            param2dtype[param.name] = param.dtype
            p_align = self._param2align(param)
            self._unslice_params2align[param.name] = p_align
@@ -335,11 +332,12 @@ class ShardingStage3(nn.Layer):
                self._param2buffer[param.name].append(
                    (rank_ * pre_buffer, (rank_ + 1) * pre_buffer))

            # 3.Flatten layer params and release other rank buffer
            self._param_storage(param, buffer_size)
            # Record param's dtype
            param2dtype[param.name] = param.dtype

            # 3.Flatten layer params and release other rank buffer
            self._param_storage(param, buffer_size)

    def _param_storage(self, param, buffer_size):
        """
        This is a function to simplify the handling of parameter InternalStorages.
@@ -365,13 +363,15 @@ class ShardingStage3(nn.Layer):
            tmp_var.value().get_tensor().set(param_cpu.value().get_tensor(), core.CPUPlace())
        param.value().get_tensor()._set_dims(param_shape)
        param._clear()

        # Current rank param_storage
        if self._offload:
            param.fw_storage = core.VarBase(
                buffer._slice(start, end), core.CPUPlace(), "slice@" + param.name)
            with device_guard(device="cpu"):
                param.master_weight = paddle.cast(param.fw_storage, Type.fp32.value)
        else:
            param.fw_storage = core.VarBase(
                buffer._slice(start, end), "slice@" + param.name)
@@ -381,6 +381,7 @@ class ShardingStage3(nn.Layer):
        if param.dtype == Type.fp16.value and not self._offload:
            self._optim._master_weights[param.fw_storage.name] = paddle.cast(param.fw_storage, Type.fp32.value)
        param._clear()

    def _register_forward_hooks(self, layer):
        """
@@ -482,9 +483,8 @@ class ShardingStage3(nn.Layer):
        if self._offload:
            for param in list(self._unslice_params):
                tmp_var = _device2cpu(param, convert_dtype=True)
                tmp_var._share_buffer_to(param)
                tmp_var._clear()
                param._clear()
                param.master_weight._share_buffer_to(param)

        for grad_storage in self._grad_storages.values():
            for p in grad_storage._params:
@@ -553,8 +553,9 @@ class ShardingStage3(nn.Layer):
                cpu_grad = _device2cpu(
                    core.VarBase(full_grad._slice(start, end)).detach().clone(), True)
                param.bw_storage = paddle.add(param.bw_storage, cpu_grad)
                with device_guard(device="cpu"):
                    param.bw_storage = paddle.add(param.bw_storage, cpu_grad)
            else:
                # param.bw_storage.add_(
                #     core.VarBase(full_grad._slice(start, end))
@@ -581,7 +582,8 @@ class ShardingStage3(nn.Layer):
            tmp_var._clear()

            if self._offload:
                param.fw_storage = _device2cpu(param.fw_storage, True)
                param.fw_storage._clear()
                param.master_weight._share_buffer_to(param.fw_storage)

        return allreduce_
@@ -869,6 +871,7 @@ def _PartitionParam(param):
    if not hasattr(param, "fw_storage"):
        setattr(param, "fw_storage", None)
        setattr(param, "bw_storage", None)
        setattr(param, "master_weight", None)
        setattr(param, "status", "all")
        setattr(param, "use_count", 0)
    return param
@@ -877,6 +880,7 @@ def _PartitionParam(param):
def _UnsliceParam(param):
    if not hasattr(param, "unslice"):
        setattr(param, "unslice", True)
        setattr(param, "master_weight", None)
    return param