Commit f74ebd8a (unverified)
Authored on Dec 21, 2021 by Haohongxiang; committed by GitHub on Dec 21, 2021.

optimize performance of offload in dygraph sharding stage2 (#38064)
* update
* fix bugs
* modify code style
* fix bugs of _get_global_group
Parent: 61ef56a1

Showing 6 changed files with 172 additions and 92 deletions (+172 -92)
python/paddle/distributed/collective.py  (+2 -2)
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py  (+76 -15)
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py  (+41 -35)
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py  (+25 -18)
python/paddle/distributed/fleet/utils/internal_storage.py  (+25 -11)
python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py  (+3 -11)
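For orientation before the per-file diffs: after this commit the offload path keeps the FP32 master weights and their gradients in fused, page-aligned CPU buffers and no longer requires the caller to build a communication group by hand. The snippet below is a hedged usage sketch inferred from the updated unit test at the bottom of this diff; it is not part of the commit, and it assumes the script runs under a distributed launch with at least two ranks.

# Hedged sketch of the post-commit offload usage, mirroring the updated test
# file in this diff. Requires paddle.distributed.init_parallel_env() and a
# multi-rank launch; `model` and `optimizer` are ordinary dygraph objects.
import paddle
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler


def build_offloaded_trainer(model, optimizer):
    # AMP O2 plus loss scaling, as in the test.
    model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32')
    scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
    scaler = ShardingScaler(scaler)  # no explicit group argument after this commit

    # offload=True keeps FP32 master weights/grads in fused CPU storages.
    optimizer = ShardingOptimizerStage2(
        params=model.parameters(), optim=optimizer, offload=True)
    model = ShardingStage2(
        model, optimizer, buffer_max_size=2**21, accumulate_grads=True)
    return model, optimizer, scaler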
python/paddle/distributed/collective.py (view file @ f74ebd8a)

@@ -127,8 +127,8 @@ def _get_group_map():
     global _group_map
     if not _group_map:
         genv = _get_global_env()
-        _group_map[0] = Group(genv.rank, genv.world_size,
-                              list(range(genv.world_size)))
+        _group_map[0] = Group(
+            genv.rank, genv.world_size, ranks=list(range(genv.world_size)))
     return _group_map
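The fix above matters because the third positional parameter of Group is the group id, not the ranks list, so the old call left the global group's ranks empty and broke the new dist.new_group(_get_global_group().ranks) paths added below. The snippet is an illustration only, using a simplified stand-in class with an assumed (rank, rank_num, id, ranks) parameter order; it is not Paddle's Group implementation.

# Illustration only: simplified stand-in for paddle.distributed.collective.Group.
class Group:
    def __init__(self, rank, rank_num, id=0, ranks=None):
        self.rank = rank
        self.nranks = rank_num
        self.id = id
        self.ranks = [] if ranks is None else ranks


world_size = 8
buggy = Group(0, world_size, list(range(world_size)))        # ranks lands in `id`
fixed = Group(0, world_size, ranks=list(range(world_size)))  # keyword form, as in the fix

print(buggy.ranks)  # [] -> downstream .ranks consumers get nothing
print(fixed.ranks)  # [0, 1, 2, 3, 4, 5, 6, 7]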
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py (view file @ f74ebd8a)

@@ -30,11 +30,11 @@ from paddle.optimizer import Optimizer
 from paddle.fluid.clip import ClipGradByGlobalNorm
 from paddle.distributed.collective import _get_global_group
-from ...utils.internal_storage import ParamStorage
+from ...utils.internal_storage import ParamStorage, GradStorage
 from ...meta_parallel.sharding.sharding_utils import Type, device_guard, ShardingClipGrad

-# CUDA alignment 256 bytes
-alignment = {"gpu": 256, }
+# CUDA alignment 256 bytes, cpu alignment 4096 bytes
+alignment = {"gpu": 256, "cpu": 4096}
 align = {
     Type.fp16.value: 2,
     Type.fp32.value: 4,

@@ -95,10 +95,11 @@ class ShardingOptimizerStage2(Optimizer):
                 filter(lambda x: x.trainable and x.dtype == Type.fp16.value,
                        self._local_params))) > 0

-        self.group = group
-        group = _get_global_group() if group is None else group
-        self.world_size = group.nranks
-        self.rank = group.rank
+        self.group = dist.new_group(_get_global_group()
+                                    .ranks) if group is None else group
+
+        self.world_size = self.group.nranks
+        self.rank = self.group.rank

         self.broadcast_fp16 = broadcast_fp16
         self.param_storages = {}  # {dtype: {rank: InternalStorage}}

@@ -108,14 +109,18 @@ class ShardingOptimizerStage2(Optimizer):
                 "While using ClipGradByGlobalNorm in ShardingOptimizer, the grad clip of original optimizer will be changed."
             )
             self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip,
-                                                      group,
-                                                      paddle.get_device())
+                                                      paddle.get_device(),
+                                                      self.group)

         if offload:
             assert self._pfp16, "Only support offload strategy while using \'Adam\', \'AdamW\' and \'Momentum\' optimizer with AMP/Pure FP16"

         self.offload = offload  # Using for offload
         self.offload_device = "cpu"
+        self.offload_buffer_size = 0
+        self.offload_param2align = {}
+        self.offload_params = None
+        self.offload_grads = None

         self._master_params = {}

@@ -131,7 +136,6 @@ class ShardingOptimizerStage2(Optimizer):
                         value=param.cast(dtype=Type.fp32.value).numpy(),
                         place=core.CPUPlace(),
                         stop_gradient=param.stop_gradient)
-            self._optim._master_weights = self._master_params
         else:
             for param in trainable_params:
                 if param.dtype == Type.fp16.value:

@@ -265,13 +269,73 @@ class ShardingOptimizerStage2(Optimizer):
         for d in dtype_to_pop:
             self.param_storages.pop(d)

+        if self.offload:
+            self._optim._master_weights = self._master_params
+            cpu_master_params = [p for p in self._master_params.values()]
+            for param in cpu_master_params:
+                size = np.prod(param.shape) * align[Type.fp32.value]
+                remaining = size % alignment[self.offload_device]
+                ali = 0 if remaining == 0 else alignment[
+                    self.offload_device] - remaining
+                align_ = ali // align[Type.fp32.value]
+                self.offload_buffer_size += np.prod(param.shape) + align_
+                self.offload_param2align[param.name] = align_
+
+            if cpu_master_params:
+                with device_guard(self.rank, self.offload_device):
+                    self.offload_params = ParamStorage(
+                        size=self.offload_buffer_size,
+                        dtype=Type.fp32.value,
+                        device=self.offload_device)
+                    self.offload_params.add_rank_params(
+                        cpu_master_params, self.offload_param2align, False)
+                    self.offload_params.buffer.stop_gradient = False
+
+                    self.offload_grads = GradStorage(
+                        size=self.offload_buffer_size,
+                        dtype=Type.fp32.value,
+                        device=self.offload_device,
+                        destination=self.rank,
+                        parm2align=self.offload_param2align,
+                        convert_cpu=True)
+                    for p in cpu_master_params:
+                        self.offload_grads.add_grad(
+                            p, self.offload_param2align[p.name])
+
+                    self._optim._master_weights[
+                        self.offload_params.buffer.name] = self.offload_params.buffer
+
+    def _offload_acc_grad(self, param_name, grad_fp32_cpu):
+        """accumulate grads with offload strategy"""
+        with device_guard(self.rank, self.offload_device):
+            if param_name in self._master_params.keys():
+                if self._master_params[param_name].grad is None:
+                    self._master_params[param_name]._copy_gradient_from(
+                        grad_fp32_cpu)
+                else:
+                    self._master_params[param_name].grad.add_(grad_fp32_cpu)
+
+            self.offload_params.buffer._copy_gradient_from(
+                self.offload_grads.buffer)
+
+    def _offload_scale_grad(self, scale_size):
+        """scale grads with offload strategy"""
+        with device_guard(self.rank, self.offload_device):
+            self.offload_grads.buffer.scale_(scale=scale_size)
+
+    def _offload_clear_grad(self):
+        """clear grads with offload strategy"""
+        with device_guard(self.rank, self.offload_device):
+            self.offload_grads.buffer.zero_()
+
     def step(self):
         """
         A wrapper for Optimizer's step function to finish the update operation of the optimizer.
         """

         if self.offload:
-            params_list = list(self._master_params.values())
+            params_list = [self.offload_params.buffer]
         else:
             # Synchronize optimizer parameters for the current rank
             params_list = []

@@ -296,14 +360,11 @@ class ShardingOptimizerStage2(Optimizer):
             with device_guard(device=self.offload_device):
                 self._optim.step()

-            dev_id = 0 if paddle.get_device() == "cpu" else int(
-                paddle.get_device().split(":")[1])
+            dev_id = int(paddle.get_device().split(":")[1])

             for param in self._local_params:
                 if param.name in self._master_params.keys():
                     param.set_value(self._master_params[param.name].cuda(dev_id)
                                     .cast(dtype=param.dtype))
-                    self._master_params[param.name].clear_gradient(False)
         else:
             self._optim.step()
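The buffer sizing loop added above pads every FP32 master parameter so that each slice of the fused CPU storage starts on a 4096-byte boundary (align[Type.fp32.value] is 4 bytes per element, alignment["cpu"] is 4096). The worked example below runs the same arithmetic in isolation on a made-up parameter shape; the shape and variable names are illustrative only.

# Same padding arithmetic as the offload buffer sizing above, on a made-up shape.
import numpy as np

alignment = {"gpu": 256, "cpu": 4096}   # bytes
align = {"fp32": 4}                     # bytes per FP32 element

param_shape = (1000, 37)                          # hypothetical parameter
numel = int(np.prod(param_shape))                 # 37000 elements
size_bytes = numel * align["fp32"]                # 148000 bytes
remaining = size_bytes % alignment["cpu"]         # 544
pad_bytes = 0 if remaining == 0 else alignment["cpu"] - remaining  # 3552
pad_elems = pad_bytes // align["fp32"]            # 888 extra FP32 slots

buffer_slots = numel + pad_elems                  # 37888
print(buffer_slots, (buffer_slots * align["fp32"]) % alignment["cpu"])  # 37888 0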
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py (view file @ f74ebd8a)

@@ -51,7 +51,7 @@ class ShardingStage2(nn.Layer):
     # Feature Notes::
     # 1. Unified memory for param and param.grad to InternalStorage.
     # 2. Divide param.grad according to rank to centrally apply for and release GPU memory.
-    # 3. Dynamically adjust training parameters and models。
+    # 3. Dynamically adjust training parameters and models.
     # 4. Support offload function.
     # 5. Support the establishment of independent communication groups.

@@ -85,11 +85,11 @@ class ShardingStage2(nn.Layer):
         self._accumulate_grads = accumulate_grads

         # Communication related attributes
-        self._group = group
-        group = _get_global_group() if group is None else group
-        self._world_size_scaling = 1.0 / group.nranks
-        assert group.nranks > 1, "Training must be distributed, ranks must be greater than 1"
-        self._rank = group.rank
+        self._group = dist.new_group(_get_global_group()
+                                     .ranks) if group is None else group
+        self._world_size_scaling = 1.0 / self._group.nranks
+        assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1"
+        self._rank = self._group.rank
         self._global_root_rank = 0  # picking rank 0 as the reference
         self._default_device = device

@@ -173,37 +173,40 @@ class ShardingStage2(nn.Layer):
         """
         # Release grad storages
         for dtype in self._grad_storages.keys():
-            if self._rank in self._grad_storages[dtype].keys():
-                if not self._offload:
-                    self._grad_storages[dtype][self._rank].buffer.zero_()
+            if not self._offload and self._rank in self._grad_storages[
+                    dtype].keys():
+                self._grad_storages[dtype][self._rank].buffer.zero_()

-        # Release params
+        # Release grads of params
         for param in self._trainable_params:
             if param.name in self._param_grads and param.grad is not None:
                 param.clear_gradient()

+        # Release grads of master params with offload strategy
+        if self._offload:
+            self._sharding_optimizers[0]._offload_clear_grad()
+
     def _grad_scale(self):
         """
         Before the gradient accumulation, scale the gradient.
         """
-        if self._offload:
-            for param in self._trainable_params:
-                if param.name in self._sharding_optimizers[
-                        0]._master_params.keys():
-                    self._sharding_optimizers[0]._master_params[
-                        param.name].grad.scale_(scale=self._world_size_scaling)
-        else:
-            # Scale grad storages
-            for dtype in self._grad_storages.keys():
-                if self._rank in self._grad_storages[dtype].keys():
-                    self._grad_storages[dtype][self._rank].buffer.scale_(
-                        scale=self._world_size_scaling)
-
-            # Scale params
-            for param in self._trainable_params:
-                if param.name in self._param_grads and param.grad is not None:
-                    param.grad.scale_(scale=self._world_size_scaling)
-                    param._reset_grad_inplace_version(True)
+        # Scale grad storages
+        for dtype in self._grad_storages.keys():
+            if not self._offload and self._rank in self._grad_storages[
+                    dtype].keys():
+                self._grad_storages[dtype][self._rank].buffer.scale_(
+                    scale=self._world_size_scaling)
+
+        # Scale grads of params
+        for param in self._trainable_params:
+            if param.name in self._param_grads and param.grad is not None:
+                param.grad.scale_(scale=self._world_size_scaling)
+                param._reset_grad_inplace_version(True)
+
+        # Scale grads of master params with offload strategy
+        if self._offload:
+            self._sharding_optimizers[0]._offload_scale_grad(
+                self._world_size_scaling)

     def _init_internal_storage(self, needs_fresh):
         """

@@ -319,9 +322,9 @@ class ShardingStage2(nn.Layer):
                 if dst_rank != self._rank:
                     param.clear_gradient(False)
                 elif self._offload:
-                    self._sharding_optimizers[0]._master_params[
-                        param.name]._copy_gradient_from(param.grad.cpu(
-                        ).cast(dtype=Type.fp32.value))
+                    self._sharding_optimizers[0]._offload_acc_grad(
+                        param.name,
+                        param.grad.cast(dtype=Type.fp32.value).cpu())
                     param.clear_gradient(False)

             # Synchronize the reduce parameter gradient

@@ -375,11 +378,14 @@ class ShardingStage2(nn.Layer):
                     )
                 elif self._offload:
                     grad_storage.to(device=self._offload_device)
-                    for param in grad_storage._params:
-                        self._sharding_optimizers[0]._master_params[
-                            param.name]._copy_gradient_from(param.grad.cast(
-                                dtype=Type.fp32.value))
+                    for p in grad_storage._params:
+                        self._sharding_optimizers[0]._offload_acc_grad(
+                            p.name, p.grad.cast(dtype=Type.fp32.value))
+                        p.clear_gradient(False)
+                        p._gradient_set_empty(False)
+                    grad_storage._device = self._default_device
                     grad_storage.buffer.value().get_tensor()._clear()
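Most of the offload speedup in this file comes from the _grad_scale and _clear_gradients rewrites above: instead of a Python loop that touches every CPU master parameter individually, the optimizer now scales or zeroes one fused GradStorage buffer via _offload_scale_grad / _offload_clear_grad. The toy comparison below only sketches the shape of that difference with plain paddle tensors; it is not the ShardingStage2 code path, and the sizes are arbitrary.

# Toy contrast: per-parameter scaling vs. scaling one fused buffer.
import paddle

paddle.set_device("cpu")
scale = 1.0 / 8  # e.g. world_size = 8

# Per-parameter: one in-place op (and Python round trip) per gradient tensor.
grads = [paddle.rand([1024, 1024]) for _ in range(32)]
for g in grads:
    g.scale_(scale)

# Fused: the same number of elements scaled with a single in-place op, which is
# what _offload_scale_grad does on offload_grads.buffer.
fused = paddle.concat([g.flatten() for g in grads])
fused.scale_(scale)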
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py (view file @ f74ebd8a)

@@ -28,6 +28,7 @@ from paddle.fluid import layers
 from paddle.fluid.dygraph import to_variable
 from paddle.fluid.framework import dygraph_only
 from paddle.fluid.dygraph import base as imperative_base
+from paddle.distributed.collective import _get_global_group


 class Taskflow:

@@ -49,10 +50,10 @@ class Type(Enum):

 class ShardingClipGrad:
-    def __init__(self, clip, group, device):
+    def __init__(self, clip, device, group):
         self._clip = clip
-        self._group = group
-        self._device = device
+        self._device = device
+        self._group = group

     @imperative_base.no_grad
     def _dygraph_clip(self, params_grads):

@@ -144,7 +145,7 @@ def device_guard(dev_id=0, device="cpu"):

 @dygraph_only
-def ShardingScaler(scaler, sharding_group):
+def ShardingScaler(scaler):
     def unscale_method(self, optimizer):
         if not self._enable:
             return

@@ -152,10 +153,10 @@ def ShardingScaler(scaler, sharding_group):
         param_grads_fp16 = []
         param_grads_fp32 = []
-        if getattr(optimizer, '_param_groups', None) and isinstance(
-                optimizer._param_groups[0], dict):
+        if getattr(optimizer._optim, '_param_groups', None) and isinstance(
+                optimizer._optim._param_groups[0], dict):

-            for group in optimizer._param_groups:
+            for group in optimizer._optim._param_groups:
                 for param in group['params']:
                     if param._grad_ivar() is not None:
                         param_grads.append(param._grad_ivar())

@@ -166,31 +167,37 @@ def ShardingScaler(scaler, sharding_group):
                         param_grads_fp32.append(param._grad_ivar())
         else:
             param_grads = [
-                param._grad_ivar() for param in optimizer._parameter_list
+                param._grad_ivar() for param in optimizer._optim._parameter_list
                 if param._grad_ivar() is not None
             ]
             param_grads_fp16 = [
-                param._grad_ivar() for param in optimizer._parameter_list
+                param._grad_ivar() for param in optimizer._optim._parameter_list
                 if (param._grad_ivar() is not None
                     ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16)
            ]
             param_grads_fp32 = [
-                param._grad_ivar() for param in optimizer._parameter_list
+                param._grad_ivar() for param in optimizer._optim._parameter_list
                 if (param._grad_ivar() is not None
                    ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32)
             ]
         temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool))
         temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool))
-        if len(param_grads_fp16):
-            _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
-                                            param_grads_fp16,
-                                            temp_found_inf_fp16)
-        if len(param_grads_fp32):
-            _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
-                                            param_grads_fp32,
-                                            temp_found_inf_fp32)
+
+        device = "cpu" if optimizer.offload else "gpu"
+        dev_id = 0 if device == "cpu" else int(
+            paddle.get_device().split(":")[1])
+
+        with device_guard(dev_id, device):
+            if len(param_grads_fp16):
+                _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
+                                                param_grads_fp16,
+                                                temp_found_inf_fp16)
+            if len(param_grads_fp32):
+                _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
+                                                param_grads_fp32,
+                                                temp_found_inf_fp32)
+
         self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
         is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")

@@ -198,7 +205,7 @@ def ShardingScaler(scaler, sharding_group):
         paddle.distributed.all_reduce(
             is_found_inf,
             op=paddle.distributed.ReduceOp.MAX,
-            group=sharding_group)
+            group=optimizer.group)
         self._found_inf = is_found_inf.numpy()[0]

     scaler._unscale = MethodType(unscale_method, scaler)
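The new unscale path above wraps check_finite_and_unscale in device_guard so the op runs on CPU when the optimizer state is offloaded. device_guard is defined earlier in this same sharding_utils.py (its definition is only visible here as hunk context); the snippet below is a rough, illustrative re-implementation of the idea, named differently to make clear it is not the Paddle source: switch the dygraph default device, run the block, then restore the original device.

# Rough, illustrative re-implementation of the device_guard idea used above.
import contextlib
import paddle


@contextlib.contextmanager
def device_guard_sketch(dev_id=0, device="cpu"):
    origin_device = paddle.device.get_device()
    if device == "cpu":
        paddle.set_device(device)
    elif device == "gpu":
        paddle.set_device("gpu:{}".format(dev_id))
    try:
        yield
    finally:
        paddle.set_device(origin_device)


# Ops created inside the guard (such as the unscale op in the new
# unscale_method) are placed on the guarded device.
with device_guard_sketch(0, "cpu"):
    x = paddle.ones([4])
    print(x.place)  # CPUPlace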
python/paddle/distributed/fleet/utils/internal_storage.py (view file @ f74ebd8a)

@@ -69,9 +69,11 @@ class InternalStorage:
                 param._gradient_set_empty(False)
             self.buffer.value().get_tensor()._clear()
         self.buffer = tmp_buffer
+        self._device = device

         if dtype is not None:
             self.buffer = self.buffer.cast(dtype=dtype)
+            self._dtype = dtype


 class ParamStorage(InternalStorage):

@@ -94,7 +96,7 @@ class ParamStorage(InternalStorage):
         self._array_params()

     @fluid.dygraph.no_grad
-    def add_rank_params(self, trainable_params, param2align):
+    def add_rank_params(self, trainable_params, param2align, convert_gpu=True):
         """
         Add new parameters to the InternalStorage. Params becomes a view of this InternalStorage buffer.
         """

@@ -108,12 +110,15 @@ class ParamStorage(InternalStorage):
         cpu_param_shape = list()
         for param in trainable_params:
-            p_shape = self._add_param_as_view(param, param2align[param.name])
+            p_shape = self._add_param_as_view(param, param2align[param.name],
+                                              convert_gpu)
             cpu_param_shape.append(p_shape)

-        # buffer convert from cpu to cuda
-        dev_id = int(paddle.get_device().split(":")[1])
-        self.buffer = self.buffer.cuda(dev_id)
+        if convert_gpu:
+            # buffer convert from cpu to cuda
+            dev_id = int(paddle.get_device().split(":")[1])
+            self.buffer = self.buffer.cuda(dev_id)

         self._fill = 0
         for idx, param in enumerate(trainable_params):

@@ -123,7 +128,7 @@ class ParamStorage(InternalStorage):
         self._param_ids.append(id(param))

     @fluid.dygraph.no_grad
-    def _add_param_as_view(self, param, align):
+    def _add_param_as_view(self, param, align, convert_gpu=True):
         assert (
             param.dtype == self.buffer.dtype

@@ -147,9 +152,12 @@ class ParamStorage(InternalStorage):
         with device_guard(dev_id, "cpu"):
             tmp_var = core.VarBase(tensor=self.buffer._slice(self._fill,
                                                              var_end))
-            param_cpu = param.cpu()
-            param.value().get_tensor()._clear()
-            tmp_var.set_value(param_cpu)
+            if convert_gpu:
+                param_cpu = param.cpu()
+                param.value().get_tensor()._clear()
+                tmp_var.set_value(param_cpu)
+            else:
+                tmp_var.set_value(param)

         self._fill = offset
         return p_shape

@@ -186,10 +194,16 @@ class GradStorage(InternalStorage):
     This is a basic class to simplify the handling of gradient InternalStorages
     """

-    def __init__(self, size, dtype, device, destination, parm2align):
+    def __init__(self,
+                 size,
+                 dtype,
+                 device,
+                 destination,
+                 parm2align,
+                 convert_cpu=False):
         if isinstance(size, np.int64):
             size = size.tolist()
-        super().__init__(size, dtype, device)
+        super().__init__(size, dtype, device, convert_cpu)
         self._max_size = size
         self._release = False
python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py (view file @ f74ebd8a)

@@ -40,24 +40,16 @@ paddle.seed(seed)
 def train_mlp(model, offload=False):
-    group = paddle.distributed.new_group([0, 1])
     optimizer = optimizer_setting(model=model, use_pure_fp16=True)

     model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32')
     scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
-    scaler = ShardingScaler(scaler, group)
+    scaler = ShardingScaler(scaler)

     optimizer = ShardingOptimizerStage2(
         params=model.parameters(),
-        optim=optimizer, group=group, offload=offload)
+        optim=optimizer, offload=offload)
     model = ShardingStage2(
         model,
-        optimizer, group=group, buffer_max_size=2**21, accumulate_grads=True)
+        optimizer, buffer_max_size=2**21, accumulate_grads=True)

     train_reader = paddle.batch(
         reader_decorator(linear_size), batch_size=batch_size, drop_last=True)