Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
2bbdc47a
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
2bbdc47a
编写于
12月 27, 2022
作者:
W
wanghuancoder
提交者:
GitHub
12月 27, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
delete old dygraph sharding (#49334)
* delete old dygraph sharding
上级
2ca3d3f7
变更
14
展开全部
隐藏空白更改
内联
并排
Showing
14 changed file
with
31 addition
and
4128 deletion
+31
-4128
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
...optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
+0
-468
python/paddle/distributed/fleet/meta_parallel/sharding/__init__.py
...ddle/distributed/fleet/meta_parallel/sharding/__init__.py
+0
-2
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py
...stributed/fleet/meta_parallel/sharding/sharding_stage2.py
+0
-619
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py
...stributed/fleet/meta_parallel/sharding/sharding_stage3.py
+0
-1055
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
...istributed/fleet/meta_parallel/sharding/sharding_utils.py
+0
-252
python/paddle/distributed/fleet/utils/internal_storage.py
python/paddle/distributed/fleet/utils/internal_storage.py
+0
-348
python/paddle/distributed/sharding/group_sharded.py
python/paddle/distributed/sharding/group_sharded.py
+31
-80
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_optimizer_stage2.py
...sts/collective/fleet/dygraph_sharding_optimizer_stage2.py
+0
-144
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage2.py
...sts/unittests/collective/fleet/dygraph_sharding_stage2.py
+0
-242
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage2_offload.py
...tests/collective/fleet/dygraph_sharding_stage2_offload.py
+0
-122
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage3.py
...sts/unittests/collective/fleet/dygraph_sharding_stage3.py
+0
-319
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage3_offload.py
...tests/collective/fleet/dygraph_sharding_stage3_offload.py
+0
-219
python/paddle/fluid/tests/unittests/collective/multinode/mn_dygraph_sharding_stage2.py
...ttests/collective/multinode/mn_dygraph_sharding_stage2.py
+0
-251
python/paddle/fluid/tests/unittests/collective/multinode/test_multinode_dygraph_sharding.py
...s/collective/multinode/test_multinode_dygraph_sharding.py
+0
-7
未找到文件。
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The file has been adapted from fairscale file:
# https://github.com/facebookresearch/fairscale/blob/main/fairscale/optim/oss.py
# Git commit hash: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e
# We retain the following license from the original files:
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import
logging
from
collections
import
OrderedDict
import
numpy
as
np
import
paddle
import
paddle.distributed
as
dist
from
paddle.distributed.collective
import
_get_global_group
,
new_group
from
paddle.fluid.clip
import
ClipGradByGlobalNorm
from
paddle.framework
import
core
from
paddle.optimizer
import
Optimizer
from
...meta_parallel.sharding.sharding_utils
import
(
ShardingClipGrad
,
Type
,
device_guard
,
)
from
...utils.internal_storage
import
GradStorage
,
ParamStorage
# CUDA alignment 256 bytes, cpu alignment 4096 bytes
alignment
=
{
"gpu"
:
256
,
"cpu"
:
4096
}
align
=
{
Type
.
fp16
.
value
:
2
,
Type
.
fp32
.
value
:
4
,
}
class
ShardingOptimizerStage2
(
Optimizer
):
"""
A wrapper for Sharding Stage2 Optimizer in Dygraph.
.. warning: ShardingOptimizer encapsulates the optimization strategy and integrates it into the optimizer.
.. ZeRO: 1.https://arxiv.org/pdf/1910.02054.pdf 2.https://arxiv.org/pdf/1910.02054.pdf.
"""
# TODO (Baibaifan)
# Feature Notes:
# 1. Unified memory for parameters and parameters.grad to InternalStorage.
# 2. Support the segmentation of optimizer parameters and partial updating of parameters.
# 3. Dynamically adjust training parameters and models.
# 4. Support offload function.
# 5. Support the establishment of independent communication groups.
# 6. Broadcast_fp16 is not supported now.
def
__init__
(
self
,
params
,
optim
,
group
=
None
,
offload
=
False
,
device
=
"gpu"
,
pertrain_sync_models
=
True
,
**
kw
):
super
().
__init__
(
optim
.
_learning_rate
,
params
,
kw
)
# Segmentation information
self
.
_dtype_rank_params
=
(
OrderedDict
()
)
# {dtype:[param1,param2]} device, rank, params
self
.
_param2rank
=
{}
self
.
__segment_params
=
[]
self
.
_rank_buffer_size
=
{}
# {dtype: {rank: numel+alignment}}
self
.
_param2align
=
{}
# {param.name: align}
# Default information
self
.
_optim_defaults
=
kw
self
.
_optim
=
optim
assert
hasattr
(
self
.
_optim
,
"_master_weights"
),
"Must use optimizer with _master_weights attribute"
self
.
_local_params
=
params
self
.
_default_device
=
device
self
.
_pfp16
=
(
len
(
list
(
filter
(
lambda
x
:
x
.
trainable
and
x
.
dtype
==
Type
.
fp16
.
value
,
self
.
_local_params
,
)
)
)
>
0
)
self
.
group
=
(
new_group
(
_get_global_group
().
ranks
)
if
group
is
None
else
group
)
self
.
world_size
=
self
.
group
.
nranks
self
.
rank
=
self
.
group
.
rank
self
.
_global_root_rank
=
self
.
group
.
ranks
[
0
]
# Synchronous all ranks models
if
pertrain_sync_models
:
self
.
_sync_params_and_buffers
()
self
.
param_storages
=
{}
# {dtype: {rank: InternalStorage}}
if
isinstance
(
self
.
_optim
.
_grad_clip
,
ClipGradByGlobalNorm
):
logging
.
warning
(
"While using ClipGradByGlobalNorm in ShardingOptimizer, the grad clip of original optimizer will be changed."
)
self
.
_optim
.
_grad_clip
=
ShardingClipGrad
(
self
.
_optim
.
_grad_clip
,
paddle
.
get_device
(),
self
.
group
)
if
self
.
_optim
.
_parameter_list
and
isinstance
(
self
.
_optim
.
_parameter_list
[
0
],
dict
):
for
item
in
self
.
_optim
.
_param_groups
:
if
"grad_clip"
in
item
.
keys
():
item
[
"grad_clip"
]
=
ShardingClipGrad
(
self
.
_optim
.
_grad_clip
,
paddle
.
get_device
(),
self
.
group
,
)
if
offload
:
assert
(
self
.
_pfp16
),
"Only support offload strategy while using
\'
Adam
\'
,
\'
AdamW
\'
and
\'
Momentum
\'
optimizer with AMP/Pure FP16"
self
.
offload
=
offload
# Using for offload
self
.
offload_device
=
"cpu"
self
.
offload_buffer_size
=
0
self
.
offload_param2align
=
{}
self
.
offload_params
=
None
self
.
offload_grads
=
None
self
.
_master_params
=
{}
# Update optimizer parameters and adjust parameter storage and use according to rank.
self
.
_update_opt_status
()
@
paddle
.
autograd
.
no_grad
()
def
_sync_params_and_buffers
(
self
):
"""
Sync all model states for all ranks
"""
for
p
in
self
.
_local_params
:
dist
.
broadcast
(
p
,
src
=
self
.
_global_root_rank
,
group
=
self
.
group
,
sync_op
=
True
)
# Multi stream operation will be supported later
dist
.
wait
(
tensor
=
p
,
group
=
self
.
group
,
use_calc_stream
=
True
)
def
_generate_master_params
(
self
,
trainable_params
):
if
self
.
offload
:
for
param
in
trainable_params
:
if
param
.
name
not
in
self
.
_master_params
.
keys
():
self
.
_master_params
[
param
.
name
]
=
core
.
VarBase
(
name
=
param
.
name
,
value
=
param
.
cast
(
dtype
=
Type
.
fp32
.
value
).
numpy
(),
place
=
core
.
CPUPlace
(),
stop_gradient
=
param
.
stop_gradient
,
)
else
:
for
param
in
trainable_params
:
if
param
.
dtype
==
Type
.
fp16
.
value
:
self
.
_optim
.
_master_weights
[
param
.
name
]
=
paddle
.
cast
(
param
,
Type
.
fp32
.
value
)
def
_update_opt_status
(
self
):
"""Update optimizer status and parameter storage information, and special functions to be developed."""
# func 1
self
.
_integration_params
()
# fun 2 TODO
# Segement helpers
def
_segment_params
(
self
):
"""
Divide all optimizer parameters equally into rank.
"""
if
len
(
self
.
__segment_params
)
==
0
:
self
.
__segment_params
,
param_lists
=
[
[]
for
_
in
range
(
self
.
world_size
)
],
[[]
for
_
in
range
(
self
.
world_size
)]
sizes
=
[
0
]
*
self
.
world_size
for
param
in
self
.
_local_params
:
# Add this param to rank with smallest size.
rank
=
sizes
.
index
(
min
(
sizes
))
param_lists
[
rank
].
append
(
param
)
# Statistical real numels
sizes
[
rank
]
+=
np
.
prod
(
param
.
shape
)
if
param
.
trainable
else
0
for
rank
,
params
in
enumerate
(
param_lists
):
self
.
__segment_params
[
rank
].
extend
(
params
)
return
self
.
__segment_params
@
property
def
local_params
(
self
):
return
self
.
_local_params
@
property
def
param2rank
(
self
):
"""Map the params to the rank which owns them"""
if
len
(
self
.
_param2rank
)
==
0
:
for
rank
,
params
in
enumerate
(
self
.
_segment_params
()):
for
param
in
params
:
self
.
_param2rank
[
param
.
name
]
=
rank
return
self
.
_param2rank
@
property
def
dtype_rank_params
(
self
):
"""
Divide the parameters into groups according to rank and dtype.
"""
if
len
(
self
.
_dtype_rank_params
)
==
0
:
# Assign the parameters of each rank according to the type
for
param
in
self
.
_local_params
:
if
param
.
dtype
not
in
self
.
_dtype_rank_params
.
keys
():
self
.
_dtype_rank_params
[
param
.
dtype
]
=
[
[]
for
_
in
range
(
self
.
world_size
)
]
self
.
_dtype_rank_params
[
param
.
dtype
][
self
.
param2rank
[
param
.
name
]
].
append
(
param
)
# Sort per rank params by size
for
dtype
in
self
.
_dtype_rank_params
.
keys
():
for
rank_params
in
self
.
_dtype_rank_params
[
dtype
]:
rank_params
.
sort
(
key
=
lambda
x
:
np
.
prod
(
x
.
shape
))
return
self
.
_dtype_rank_params
@
property
def
rank_buffer_size
(
self
):
"""
Count the memory size of the parameters corresponding to rank under the corresponding dtype.
"""
# CUDA alignment 256 bytes
if
len
(
self
.
_rank_buffer_size
)
==
0
:
for
dtype
in
self
.
dtype_rank_params
.
keys
():
if
dtype
not
in
self
.
_rank_buffer_size
.
keys
():
self
.
_rank_buffer_size
[
dtype
]
=
{}
for
dst_rank
,
per_rank_params
in
enumerate
(
self
.
dtype_rank_params
[
dtype
]
):
if
dst_rank
not
in
self
.
_rank_buffer_size
[
dtype
].
keys
():
self
.
_rank_buffer_size
[
dtype
][
dst_rank
]
=
0
for
param
in
per_rank_params
:
if
not
param
.
trainable
:
continue
size
=
np
.
prod
(
param
.
shape
)
*
align
[
dtype
]
remaining
=
size
%
alignment
[
self
.
_default_device
]
ali
=
(
0
if
remaining
==
0
else
alignment
[
self
.
_default_device
]
-
remaining
)
align_
=
ali
//
align
[
dtype
]
self
.
_rank_buffer_size
[
dtype
][
dst_rank
]
+=
(
np
.
prod
(
param
.
shape
)
+
align_
)
self
.
_param2align
[
param
.
name
]
=
align_
return
self
.
_rank_buffer_size
def
_integration_params
(
self
):
"""
Integrate the parameters into a continuous memory according to rank, and support the update of training parameters.
"""
for
dtype
,
per_rank_params
in
self
.
dtype_rank_params
.
items
():
if
dtype
not
in
self
.
param_storages
.
keys
():
self
.
param_storages
[
dtype
]
=
{}
for
dst_rank
,
params
in
enumerate
(
per_rank_params
):
if
len
(
params
)
>
0
:
# Merge all the trainable params in a single InternalStorage
trainable_params
=
list
(
filter
(
lambda
x
:
x
.
trainable
,
params
)
)
if
self
.
_pfp16
and
dst_rank
==
self
.
rank
:
self
.
_generate_master_params
(
trainable_params
)
if
trainable_params
:
param_storage
=
ParamStorage
(
size
=
self
.
rank_buffer_size
[
dtype
][
dst_rank
],
dtype
=
dtype
,
device
=
self
.
_default_device
,
)
param_storage
.
add_rank_params
(
trainable_params
,
self
.
_param2align
)
self
.
param_storages
[
dtype
][
dst_rank
]
=
param_storage
# Clear the InternalStorage keys which are not in use anymore
dtype_in_use
=
list
(
self
.
dtype_rank_params
.
keys
())
dtype_to_pop
=
list
(
filter
(
lambda
x
:
x
not
in
dtype_in_use
,
self
.
param_storages
.
keys
())
)
for
d
in
dtype_to_pop
:
self
.
param_storages
.
pop
(
d
)
if
self
.
offload
:
self
.
_optim
.
_master_weights
=
self
.
_master_params
cpu_master_params
=
[
p
for
p
in
self
.
_master_params
.
values
()]
for
param
in
cpu_master_params
:
size
=
np
.
prod
(
param
.
shape
)
*
align
[
Type
.
fp32
.
value
]
remaining
=
size
%
alignment
[
self
.
offload_device
]
ali
=
(
0
if
remaining
==
0
else
alignment
[
self
.
offload_device
]
-
remaining
)
align_
=
ali
//
align
[
Type
.
fp32
.
value
]
self
.
offload_buffer_size
+=
np
.
prod
(
param
.
shape
)
+
align_
self
.
offload_param2align
[
param
.
name
]
=
align_
if
cpu_master_params
:
with
device_guard
(
self
.
rank
,
self
.
offload_device
):
self
.
offload_params
=
ParamStorage
(
size
=
self
.
offload_buffer_size
,
dtype
=
Type
.
fp32
.
value
,
device
=
self
.
offload_device
,
)
self
.
offload_params
.
add_rank_params
(
cpu_master_params
,
self
.
offload_param2align
,
False
)
self
.
offload_params
.
buffer
.
stop_gradient
=
False
self
.
offload_grads
=
GradStorage
(
size
=
self
.
offload_buffer_size
,
dtype
=
Type
.
fp32
.
value
,
device
=
self
.
offload_device
,
destination
=
self
.
rank
,
parm2align
=
self
.
offload_param2align
,
convert_cpu
=
True
,
)
for
p
in
cpu_master_params
:
self
.
offload_grads
.
add_grad
(
p
,
self
.
offload_param2align
[
p
.
name
]
)
self
.
_optim
.
_master_weights
[
self
.
offload_params
.
buffer
.
name
]
=
self
.
offload_params
.
buffer
def
_offload_acc_grad
(
self
,
param_name
,
grad_fp32_cpu
):
"""accumulate grads with offload strategy"""
with
device_guard
(
self
.
rank
,
self
.
offload_device
):
if
param_name
in
self
.
_master_params
.
keys
():
if
self
.
_master_params
[
param_name
].
grad
is
None
:
self
.
_master_params
[
param_name
].
_copy_gradient_from
(
grad_fp32_cpu
)
else
:
self
.
_master_params
[
param_name
].
grad
.
add_
(
grad_fp32_cpu
)
self
.
offload_params
.
buffer
.
_copy_gradient_from
(
self
.
offload_grads
.
buffer
)
def
_offload_scale_grad
(
self
,
scale_size
):
"""scale grads with offload strategy"""
with
device_guard
(
self
.
rank
,
self
.
offload_device
):
self
.
offload_grads
.
buffer
.
scale_
(
scale
=
scale_size
)
def
_offload_clear_grad
(
self
):
"""clear grads with offload strategy"""
with
device_guard
(
self
.
rank
,
self
.
offload_device
):
self
.
offload_grads
.
buffer
.
zero_
()
def
step
(
self
):
"""
A wrapper for Optimizer's step function to finish the update operation of the optimizer.
"""
if
self
.
offload
:
params_list
=
[
self
.
offload_params
.
buffer
]
# TODO(Baibaifan): Offload will support param_groups later
if
not
isinstance
(
self
.
_optim
.
_param_groups
[
0
],
dict
):
self
.
_optim
.
_parameter_list
=
params_list
self
.
_optim
.
_param_groups
=
params_list
# Run the optimizer of the current rank step
if
self
.
offload
:
with
device_guard
(
device
=
self
.
offload_device
):
self
.
_optim
.
step
()
dev_id
=
int
(
paddle
.
get_device
().
split
(
":"
)[
1
])
for
param
in
self
.
_local_params
:
if
param
.
name
in
self
.
_master_params
.
keys
():
param
.
set_value
(
self
.
_master_params
[
param
.
name
]
.
cuda
(
dev_id
)
.
cast
(
dtype
=
param
.
dtype
)
)
else
:
self
.
_optim
.
step
()
# Synchronize all the updated shards in between the ranks
self
.
_broadcast_params
()
def
minimize
(
self
):
raise
RuntimeError
(
"optimizer.minimize() not support now, please use optimizer.step()"
)
def
set_state_dict
(
self
,
state_dict
):
self
.
_optim
.
set_state_dict
(
state_dict
)
def
state_dict
(
self
):
return
self
.
_optim
.
state_dict
()
def
_clear_cache
(
self
):
self
.
__segment_params
.
clear
()
self
.
_dtype_rank_params
.
clear
()
self
.
_param2rank
.
clear
()
@
paddle
.
autograd
.
no_grad
()
def
_broadcast_params
(
self
):
"""Broadcast the parameters of the current rank to each rank"""
assert
self
.
_default_device
==
"gpu"
,
"Only supported gpu"
# Exchange all the shards with the other ranks
for
dtype_per_rank
in
self
.
param_storages
.
values
():
for
dst_rank
,
internal_storage
in
dtype_per_rank
.
items
():
dist
.
broadcast
(
tensor
=
internal_storage
.
buffer
,
src
=
self
.
group
.
ranks
[
dst_rank
],
group
=
self
.
group
,
sync_op
=
True
,
)
# Multi stream operation will be supported later
dist
.
wait
(
tensor
=
internal_storage
.
buffer
,
group
=
self
.
group
,
use_calc_stream
=
True
,
)
python/paddle/distributed/fleet/meta_parallel/sharding/__init__.py
浏览文件 @
2bbdc47a
...
...
@@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.sharding_utils
import
Type
,
device_guard
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
此差异已折叠。
点击以展开。
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
此差异已折叠。
点击以展开。
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
contextlib
from
enum
import
Enum
from
types
import
MethodType
import
numpy
as
np
import
paddle
from
paddle
import
_legacy_C_ops
from
paddle.fluid
import
core
,
layers
from
paddle.fluid.dygraph
import
base
as
imperative_base
from
paddle.fluid.dygraph
import
to_variable
from
paddle.fluid.framework
import
dygraph_only
class
Taskflow
:
"""
Task flows, one way linked list for task acquisition.
"""
def
__init__
(
self
,
task
,
callback
):
self
.
task
=
task
self
.
callback
=
callback
class
Type
(
Enum
):
"""
Type of trainable parameters
"""
fp16
=
paddle
.
float16
bf16
=
paddle
.
bfloat16
fp32
=
paddle
.
float32
class
ShardingClipGrad
:
def
__init__
(
self
,
clip
,
device
,
group
):
self
.
_clip
=
clip
self
.
_device
=
device
self
.
_group
=
group
@
imperative_base
.
no_grad
def
_dygraph_clip
(
self
,
params_grads
):
sum_square_fp32
,
sum_square_fp16
=
[],
[]
unslice_params_fp32
,
unslice_params_fp16
=
[],
[]
for
p
,
g
in
params_grads
:
p_slice
=
True
# using for slice parameter in sharding stage3
if
g
is
None
or
getattr
(
p
,
'need_clip'
,
True
)
is
False
:
continue
if
hasattr
(
p
,
"unslice"
):
p_slice
=
False
merge_grad
=
g
if
g
.
type
==
core
.
VarDesc
.
VarType
.
SELECTED_ROWS
:
merge_grad
=
layers
.
get_tensor_from_selected_rows
(
layers
.
merge_selected_rows
(
g
)
)
square
=
paddle
.
square
(
merge_grad
)
sum_square
=
paddle
.
sum
(
square
)
if
p
.
dtype
==
paddle
.
float16
:
if
p_slice
:
sum_square_fp16
.
append
(
sum_square
)
else
:
unslice_params_fp16
.
append
(
sum_square
)
elif
p
.
dtype
==
paddle
.
float32
:
if
p_slice
:
sum_square_fp32
.
append
(
sum_square
)
else
:
unslice_params_fp32
.
append
(
sum_square
)
# global norm of non-distributed FP16 params_and_grads
if
len
(
sum_square_fp16
)
==
0
:
global_norm_fp16
=
paddle
.
to_tensor
([
0.0
],
dtype
=
paddle
.
float32
)
else
:
global_norm_fp16
=
layers
.
concat
(
sum_square_fp16
)
global_norm_fp16
=
paddle
.
sum
(
global_norm_fp16
)
global_norm_fp16
=
paddle
.
cast
(
global_norm_fp16
,
dtype
=
paddle
.
float32
)
# global norm of non-distributed FP16 params_and_grads for unslice parameter
if
len
(
unslice_params_fp16
)
==
0
:
global_unslice_fp16
=
paddle
.
to_tensor
([
0.0
],
dtype
=
paddle
.
float32
)
else
:
global_unslice_fp16
=
layers
.
concat
(
unslice_params_fp16
)
global_unslice_fp16
=
paddle
.
sum
(
global_unslice_fp16
)
global_unslice_fp16
=
paddle
.
cast
(
global_unslice_fp16
,
dtype
=
paddle
.
float32
)
# global norm of non-distributed FP32 params_and_grads
global_norm_fp32
=
(
layers
.
concat
(
sum_square_fp32
)
if
len
(
sum_square_fp32
)
!=
0
else
paddle
.
to_tensor
([
0.0
],
dtype
=
paddle
.
float32
)
)
global_norm_fp32
=
paddle
.
sum
(
global_norm_fp32
)
# global norm of non-distributed FP32 params_and_grads for unslice parameter
global_unslice_fp32
=
(
layers
.
concat
(
unslice_params_fp32
)
if
len
(
unslice_params_fp32
)
!=
0
else
paddle
.
to_tensor
([
0.0
],
dtype
=
paddle
.
float32
)
)
global_unslice_fp32
=
paddle
.
sum
(
global_unslice_fp32
)
global_unslice_var
=
global_unslice_fp16
+
global_unslice_fp32
global_norm_var
=
(
global_norm_fp16
+
global_norm_fp32
+
1.0
/
self
.
_group
.
nranks
*
global_unslice_var
)
# add all reduce to get global norm of distributed params_and_grads
dev_id
=
int
(
self
.
_device
.
split
(
":"
)[
1
])
with
device_guard
(
dev_id
,
"gpu"
):
paddle
.
distributed
.
all_reduce
(
global_norm_var
,
group
=
self
.
_group
)
global_norm_var
=
paddle
.
sqrt
(
global_norm_var
)
max_global_norm
=
layers
.
fill_constant
(
shape
=
[
1
],
dtype
=
global_norm_var
.
dtype
,
value
=
self
.
clip_norm
)
clip_var
=
paddle
.
divide
(
x
=
max_global_norm
,
y
=
paddle
.
maximum
(
x
=
global_norm_var
,
y
=
max_global_norm
),
)
clip_var_fp16
=
paddle
.
cast
(
clip_var
,
paddle
.
float16
)
for
p
,
g
in
params_grads
:
if
getattr
(
p
,
'need_clip'
,
True
)
is
False
or
g
is
None
:
continue
origin_state
=
g
.
stop_gradient
g
.
stop_gradient
=
True
if
p
.
dtype
==
paddle
.
float16
:
g
.
scale_
(
clip_var_fp16
)
else
:
g
.
scale_
(
clip_var
)
g
.
stop_gradient
=
origin_state
p
.
_reset_grad_inplace_version
(
True
)
return
params_grads
def
__getattr__
(
self
,
item
):
return
getattr
(
self
.
_clip
,
item
)
def
__call__
(
self
,
params_grads
):
return
self
.
_dygraph_clip
(
params_grads
)
@
contextlib
.
contextmanager
def
device_guard
(
dev_id
=
0
,
device
=
"cpu"
):
origin_device
=
paddle
.
device
.
get_device
()
if
device
==
"cpu"
:
paddle
.
set_device
(
device
)
elif
device
==
"gpu"
:
paddle
.
set_device
(
"gpu:{}"
.
format
(
dev_id
))
try
:
yield
finally
:
paddle
.
set_device
(
origin_device
)
@
dygraph_only
def
ShardingScaler
(
scaler
):
def
unscale_method
(
self
,
optimizer
):
if
not
self
.
_enable
:
return
param_grads
=
[]
param_grads_fp16
=
[]
param_grads_fp32
=
[]
if
hasattr
(
optimizer
,
"update_slice"
):
optimizer
.
update_slice
()
optimizer
.
update_scaler
=
True
if
getattr
(
optimizer
.
_optim
,
'_param_groups'
,
None
)
and
isinstance
(
optimizer
.
_optim
.
_param_groups
[
0
],
dict
):
for
group
in
optimizer
.
_optim
.
_param_groups
:
for
param
in
group
[
'params'
]:
if
param
.
_grad_ivar
()
is
not
None
:
param_grads
.
append
(
param
.
_grad_ivar
())
if
param
.
_grad_ivar
().
dtype
in
[
core
.
VarDesc
.
VarType
.
FP16
,
paddle
.
float16
,
]:
param_grads_fp16
.
append
(
param
.
_grad_ivar
())
else
:
param_grads_fp32
.
append
(
param
.
_grad_ivar
())
else
:
for
param
in
optimizer
.
_optim
.
_parameter_list
:
if
param
.
grad
is
not
None
:
param_grads
.
append
(
param
.
grad
)
if
param
.
grad
.
dtype
in
[
core
.
VarDesc
.
VarType
.
FP16
,
paddle
.
float16
,
]:
param_grads_fp16
.
append
(
param
.
grad
)
else
:
param_grads_fp32
.
append
(
param
.
grad
)
temp_found_inf_fp16
=
to_variable
(
np
.
array
([
0
]).
astype
(
np
.
bool_
))
temp_found_inf_fp32
=
to_variable
(
np
.
array
([
0
]).
astype
(
np
.
bool_
))
device
=
"cpu"
if
optimizer
.
offload
else
"gpu"
dev_id
=
(
0
if
device
==
"cpu"
else
int
(
paddle
.
get_device
().
split
(
":"
)[
1
])
)
with
device_guard
(
dev_id
,
device
):
if
len
(
param_grads_fp16
):
_legacy_C_ops
.
check_finite_and_unscale
(
param_grads_fp16
,
self
.
_scale
,
param_grads_fp16
,
temp_found_inf_fp16
,
)
if
len
(
param_grads_fp32
):
_legacy_C_ops
.
check_finite_and_unscale
(
param_grads_fp32
,
self
.
_scale
,
param_grads_fp32
,
temp_found_inf_fp32
,
)
self
.
_found_inf
=
1
if
temp_found_inf_fp16
or
temp_found_inf_fp32
else
0
is_found_inf
=
paddle
.
to_tensor
([
self
.
_found_inf
],
dtype
=
"int32"
)
paddle
.
distributed
.
all_reduce
(
is_found_inf
,
op
=
paddle
.
distributed
.
ReduceOp
.
MAX
,
group
=
optimizer
.
group
,
)
self
.
_found_inf
=
is_found_inf
.
numpy
()[
0
]
scaler
.
_unscale
=
MethodType
(
unscale_method
,
scaler
)
return
scaler
python/paddle/distributed/fleet/utils/internal_storage.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The file has been adapted from fairscale file:
# https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/misc/param_bucket.py
# Git commit hash: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e
# We retain the following license from the original files:
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import
numpy
as
np
import
paddle
from
paddle
import
framework
# (TODO: GhostScreaming) It will be removed later.
from
paddle.fluid
import
core
from
..meta_parallel.sharding.sharding_utils
import
Type
,
device_guard
class
InternalStorage
:
"""
This is a basic class, which is responsible for consolidating the basic storage tensor.
"""
# Support integration parameter tensor
def
__init__
(
self
,
size
,
dtype
,
device
,
convert_cpu
=
False
):
self
.
_params
=
[]
self
.
_param_ids
=
[]
self
.
_fill
=
0
self
.
_device
=
device
self
.
_dtype
=
dtype
# The actual flat tensor
size
=
[
size
]
if
isinstance
(
size
,
int
)
else
size
if
convert_cpu
:
value
=
(
np
.
zeros
(
size
,
dtype
=
np
.
float16
)
if
Type
.
fp16
.
value
==
dtype
else
np
.
zeros
(
size
,
dtype
=
np
.
float32
)
)
self
.
buffer
=
core
.
VarBase
(
value
=
value
,
place
=
core
.
CPUPlace
())
else
:
self
.
buffer
=
paddle
.
zeros
(
size
,
dtype
=
dtype
)
def
to
(
self
,
device
,
dtype
=
None
,
keep_alignment
=
True
):
"""
Move the underlying buffer
"""
assert
(
self
.
buffer
is
not
None
),
"Cannot move a collapsed bucket, please rebuild it"
assert
(
dtype
==
Type
.
fp32
.
value
or
Type
.
fp16
.
value
),
"Conversion type is not supported now"
dev_id
=
(
0
if
paddle
.
get_device
()
==
"cpu"
else
int
(
paddle
.
get_device
().
split
(
":"
)[
1
])
)
if
self
.
_device
!=
device
:
tmp_buffer
=
(
self
.
buffer
.
cuda
(
dev_id
)
if
device
==
"gpu"
else
self
.
buffer
.
cpu
()
)
for
param
in
self
.
_params
:
param
.
clear_gradient
(
False
)
param
.
_gradient_set_empty
(
False
)
self
.
buffer
.
value
().
get_tensor
().
_clear
()
self
.
buffer
=
tmp_buffer
self
.
_device
=
device
if
dtype
is
not
None
:
self
.
buffer
=
self
.
buffer
.
cast
(
dtype
=
dtype
)
self
.
_dtype
=
dtype
class
ParamStorage
(
InternalStorage
):
"""
This is a basic class to simplify the handling of parameter InternalStorages.
"""
def
__init__
(
self
,
size
,
dtype
,
device
):
super
().
__init__
(
size
,
dtype
,
device
,
convert_cpu
=
True
)
self
.
param2align
=
None
def
to
(
self
,
device
,
dtype
=
None
,
keep_alignment
=
True
):
"""
Move the underlying buffer
"""
super
().
to
(
device
,
dtype
)
if
keep_alignment
:
self
.
_array_params
()
@
framework
.
no_grad
()
def
add_rank_params
(
self
,
trainable_params
,
param2align
,
convert_gpu
=
True
):
"""
Add new parameters to the InternalStorage. Params becomes a view of this InternalStorage buffer.
"""
assert
all
(
[
id
(
param
)
not
in
self
.
_param_ids
for
param
in
trainable_params
]
),
"The same param cannot be checked in twice"
assert
self
.
buffer
is
not
None
self
.
param2align
=
param2align
cpu_param_shape
=
list
()
for
param
in
trainable_params
:
p_shape
=
self
.
_add_param_as_view
(
param
,
param2align
[
param
.
name
],
convert_gpu
)
cpu_param_shape
.
append
(
p_shape
)
if
convert_gpu
:
# buffer convert from cpu to cuda
dev_id
=
int
(
paddle
.
get_device
().
split
(
":"
)[
1
])
self
.
buffer
=
self
.
buffer
.
cuda
(
dev_id
)
self
.
_fill
=
0
for
idx
,
param
in
enumerate
(
trainable_params
):
self
.
_convert_buffer
(
param
,
cpu_param_shape
[
idx
],
param2align
[
param
.
name
]
)
self
.
_params
.
append
(
param
)
self
.
_param_ids
.
append
(
id
(
param
))
@
framework
.
no_grad
()
def
_add_param_as_view
(
self
,
param
,
align
,
convert_gpu
=
True
):
assert
(
param
.
dtype
==
self
.
buffer
.
dtype
),
"Different types for the InternalStorage and the param, cannot proceed: {} - {}"
.
format
(
param
.
dtype
,
self
.
buffer
.
dtype
)
var_end
=
self
.
_fill
+
np
.
prod
(
param
.
shape
)
offset
=
var_end
+
align
assert
offset
<=
np
.
prod
(
self
.
buffer
.
shape
)
p_shape
=
param
.
shape
origin_state
=
param
.
stop_gradient
param
.
stop_gradient
=
True
param
.
flatten_
()
param
.
stop_gradient
=
origin_state
# Copy the current param value
dev_id
=
(
0
if
paddle
.
get_device
()
==
"cpu"
else
int
(
paddle
.
get_device
().
split
(
":"
)[
1
])
)
with
device_guard
(
dev_id
,
"cpu"
):
tmp_var
=
core
.
VarBase
(
tensor
=
self
.
buffer
.
_slice
(
self
.
_fill
,
var_end
)
)
if
convert_gpu
:
param_cpu
=
param
.
cpu
()
param
.
value
().
get_tensor
().
_clear
()
tmp_var
.
set_value
(
param_cpu
)
else
:
tmp_var
.
set_value
(
param
)
self
.
_fill
=
offset
return
p_shape
@
framework
.
no_grad
()
def
_convert_buffer
(
self
,
param
,
p_shape
,
align
):
var_end
=
self
.
_fill
+
np
.
prod
(
p_shape
)
offset
=
var_end
+
align
assert
offset
<=
np
.
prod
(
self
.
buffer
.
shape
)
# Convert the param value
tmp_tensor
=
self
.
buffer
.
_slice
(
self
.
_fill
,
var_end
)
param
.
value
().
get_tensor
().
_share_data_with
(
tmp_tensor
)
param
.
value
().
get_tensor
().
_set_dims
(
p_shape
)
self
.
_fill
=
offset
@
framework
.
no_grad
()
def
_array_params
(
self
):
"""
Given the parameters which have been registered previously, rebuild the whole InternalStorage.
"""
assert
len
(
self
.
_params
)
>
0
assert
self
.
param2align
is
not
None
self
.
_fill
=
0
for
p
in
self
.
_params
:
self
.
_convert_buffer
(
p
,
p
.
shape
,
self
.
param2align
[
p
.
name
])
# modify
class
GradStorage
(
InternalStorage
):
"""
This is a basic class to simplify the handling of gradient InternalStorages
"""
def
__init__
(
self
,
size
,
dtype
,
device
,
destination
,
parm2align
,
convert_cpu
=
False
):
if
isinstance
(
size
,
np
.
int64
):
size
=
size
.
tolist
()
super
().
__init__
(
size
,
dtype
,
device
,
convert_cpu
)
self
.
_max_size
=
size
self
.
_release
=
False
self
.
params_checked_in
=
0
self
.
destination
=
destination
self
.
_parm2align
=
parm2align
self
.
sent
=
False
def
reset_checked_in
(
self
):
"""Reset the counter of the parameter grads which have been checked in"""
self
.
params_checked_in
=
0
self
.
sent
=
False
@
property
def
all_checked_in
(
self
):
"""Judge all the expected gradient check-in happened"""
return
len
(
self
.
_params
)
==
self
.
params_checked_in
def
can_add_grad_view
(
self
,
param
,
align
):
"""Is there enough InternalStorage to add this parameter gradient, and whether this param have already checked in."""
return
(
self
.
_fill
+
np
.
prod
(
param
.
shape
)
+
align
<=
self
.
_max_size
and
id
(
param
)
not
in
self
.
_param_ids
)
def
to
(
self
,
device
,
dtype
=
None
,
keep_alignment
=
True
):
"""
Move the underlying buffer
"""
if
self
.
_release
:
self
.
rebuild
()
super
().
to
(
device
,
dtype
)
if
keep_alignment
:
self
.
_array_grads
()
@
framework
.
no_grad
()
def
add_grad
(
self
,
param
,
align
):
"""
Add a new parameter gradient to the InternalStorage. Param.grad becomes a view of this InternalStorage buffer.
"""
assert
(
id
(
param
)
not
in
self
.
_param_ids
),
"The same gradients cannot be checked in twice"
self
.
_add_grad_as_view
(
param
,
align
)
self
.
_params
.
append
(
param
)
self
.
_param_ids
.
append
(
id
(
param
))
@
framework
.
no_grad
()
def
manumal_relase
(
self
):
"""
Release the buffer from InternalStorage. The InternalStorage will need to be rebuilt before use.
"""
if
not
self
.
_release
:
for
p
in
self
.
_params
:
if
p
.
grad
is
not
None
:
p
.
clear_gradient
(
False
)
p
.
_gradient_set_empty
(
False
)
self
.
buffer
=
None
self
.
_fill
=
0
self
.
params_checked_in
=
0
self
.
_release
=
True
@
framework
.
no_grad
()
def
rebuild
(
self
):
"""
Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage.
"""
if
self
.
_release
:
self
.
buffer
=
paddle
.
zeros
([
self
.
_max_size
],
dtype
=
self
.
_dtype
)
for
p
in
self
.
_params
:
self
.
_add_grad_as_view
(
p
,
self
.
_parm2align
[
p
.
name
])
self
.
_release
=
False
@
framework
.
no_grad
()
def
_array_grads
(
self
):
"""
Given the parameters gradients which have been registered previously, rebuild the whole InternalStorage.
"""
if
len
(
self
.
_params
)
>
0
:
self
.
_fill
=
0
for
p
in
self
.
_params
:
self
.
_add_grad_as_view
(
p
,
self
.
_parm2align
[
p
.
name
])
@
framework
.
no_grad
()
def
_add_grad_as_view
(
self
,
param
,
align
):
assert
(
np
.
prod
(
self
.
buffer
.
shape
)
>
0
),
"Cannot add a gradient to a released InternalStorage, please rebuild"
assert
param
.
dtype
==
self
.
buffer
.
dtype
grad_end
=
self
.
_fill
+
np
.
prod
(
param
.
shape
)
offset
=
grad_end
+
align
assert
offset
<=
np
.
prod
(
self
.
buffer
.
shape
)
# Copy the current grad value to InternalStorage
dev_id
=
(
0
if
paddle
.
get_device
()
==
"cpu"
else
int
(
paddle
.
get_device
().
split
(
":"
)[
1
])
)
if
self
.
_device
==
"cpu"
:
with
device_guard
(
dev_id
,
self
.
_device
):
tmp_var
=
core
.
VarBase
(
self
.
buffer
.
_slice
(
self
.
_fill
,
grad_end
))
param
.
_copy_gradient_from
(
tmp_var
)
tmp_var
.
value
().
get_tensor
().
_clear
()
elif
self
.
_device
==
"gpu"
:
tmp_var
=
core
.
VarBase
(
self
.
buffer
.
_slice
(
self
.
_fill
,
grad_end
))
param
.
_copy_gradient_from
(
tmp_var
)
tmp_var
.
value
().
get_tensor
().
_clear
()
self
.
_fill
=
offset
python/paddle/distributed/sharding/group_sharded.py
浏览文件 @
2bbdc47a
...
...
@@ -16,13 +16,6 @@ import logging
import
os
import
paddle
# Old version
from
paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2
import
(
ShardingOptimizerStage2
,
)
# New version
from
paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2
import
(
GroupShardedOptimizerStage2
,
)
...
...
@@ -35,17 +28,7 @@ from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import
from
paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils
import
(
GroupShardedScaler
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2
import
(
ShardingStage2
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3
import
(
ShardingStage3
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_utils
import
(
ShardingScaler
,
)
from
paddle.distributed.utils.log_utils
import
get_logger
from
paddle.fluid.framework
import
in_dygraph_mode
from
paddle.optimizer
import
Optimizer
logger_
=
get_logger
(
logging
.
WARNING
)
...
...
@@ -148,71 +131,39 @@ def group_sharded_parallel(
logger_
.
info
(
"*"
*
30
)
logger_
.
info
(
"Sharded level os uses sharded level os_g achieved now."
)
logger_
.
info
(
"*"
*
30
)
if
in_dygraph_mode
():
optimizer
=
GroupShardedOptimizerStage2
(
params
=
optimizer
.
_parameter_list
,
optim
=
optimizer
,
group
=
group
,
offload
=
offload
,
dp_group
=
dp_group
,
device
=
device
,
)
model
=
GroupShardedStage2
(
model
,
optimizer
,
group
=
group
,
sync_buffers
=
sync_buffers
,
buffer_max_size
=
buffer_max_size
,
dp_group
=
dp_group
,
device
=
device
,
)
else
:
optimizer
=
ShardingOptimizerStage2
(
params
=
model
.
parameters
(),
optim
=
optimizer
,
group
=
group
,
offload
=
offload
,
device
=
device
,
)
model
=
ShardingStage2
(
model
,
optimizer
,
group
=
group
,
sync_buffers
=
sync_buffers
,
buffer_max_size
=
buffer_max_size
,
device
=
device
,
)
optimizer
=
GroupShardedOptimizerStage2
(
params
=
optimizer
.
_parameter_list
,
optim
=
optimizer
,
group
=
group
,
offload
=
offload
,
dp_group
=
dp_group
,
device
=
device
,
)
model
=
GroupShardedStage2
(
model
,
optimizer
,
group
=
group
,
sync_buffers
=
sync_buffers
,
buffer_max_size
=
buffer_max_size
,
dp_group
=
dp_group
,
device
=
device
,
)
elif
level
==
'p_g_os'
:
if
in_dygraph_mode
():
model
=
GroupShardedStage3
(
model
,
optimizer
=
optimizer
,
group
=
group
,
sync_buffers
=
sync_buffers
,
segment_size
=
segment_size
,
offload
=
offload
,
sync_comm
=
sync_comm
,
dp_group
=
dp_group
,
device
=
device
,
)
else
:
model
=
ShardingStage3
(
model
,
optimizer
=
optimizer
,
group
=
group
,
sync_buffers
=
sync_buffers
,
segment_size
=
segment_size
,
offload
=
offload
,
sync_comm
=
sync_comm
,
device
=
device
,
)
model
=
GroupShardedStage3
(
model
,
optimizer
=
optimizer
,
group
=
group
,
sync_buffers
=
sync_buffers
,
segment_size
=
segment_size
,
offload
=
offload
,
sync_comm
=
sync_comm
,
dp_group
=
dp_group
,
device
=
device
,
)
else
:
raise
ValueError
(
"Please enter the correct level."
)
if
isinstance
(
scaler
,
paddle
.
amp
.
GradScaler
):
if
in_dygraph_mode
():
scaler
=
GroupShardedScaler
(
scaler
)
else
:
scaler
=
ShardingScaler
(
scaler
)
scaler
=
GroupShardedScaler
(
scaler
)
logger_
.
info
(
"*"
*
30
)
logger_
.
info
(
"If there is a communication hang using group sharded, please check whether the communication operations of each process are unified."
...
...
@@ -275,9 +226,9 @@ def save_group_sharded_model(model, output, optimizer=None):
),
"Saving directory ({}) should be a directory, not a file"
.
format
(
output
)
os
.
makedirs
(
output
,
exist_ok
=
True
)
output_model
=
os
.
path
.
join
(
output
,
"model.pdmodel"
)
if
isinstance
(
model
,
(
ShardingStage2
,
GroupShardedStage2
)
):
if
isinstance
(
model
,
GroupShardedStage2
):
paddle
.
save
(
model
.
_layer
.
state_dict
(),
output_model
)
elif
isinstance
(
model
,
(
ShardingStage3
,
GroupShardedStage3
)
):
elif
isinstance
(
model
,
GroupShardedStage3
):
convert2cpu
=
True
if
model
.
_offload
else
False
model
.
get_all_parameters
(
convert2cpu
=
convert2cpu
)
paddle
.
save
(
model
.
_layer
.
state_dict
(),
output_model
)
...
...
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_optimizer_stage2.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
import
paddle
import
paddle.fluid
as
fluid
from
paddle.distributed
import
fleet
from
paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2
import
(
ShardingOptimizerStage2
,
)
from
paddle.distributed.fleet.utils.internal_storage
import
GradStorage
from
paddle.nn
import
Linear
base_lr
=
0.1
momentum_rate
=
0.9
l2_decay
=
1e-4
epoch
=
100
batch_size
=
32
class_dim
=
102
class
MLP
(
fluid
.
Layer
):
def
__init__
(
self
,
param_attr
=
None
,
bias_attr
=
None
):
super
().
__init__
()
self
.
_linear1
=
Linear
(
10
,
10
)
self
.
_linear2
=
Linear
(
10
,
10
)
def
forward
(
self
,
inputs
):
y
=
self
.
_linear1
(
inputs
)
y
=
self
.
_linear2
(
y
)
return
y
def
reader_decorator
():
def
__reader__
():
for
_
in
range
(
100
):
img
=
np
.
random
.
rand
(
10
).
astype
(
'float32'
)
label
=
np
.
ones
(
1
).
astype
(
'int64'
)
yield
img
,
label
return
__reader__
def
optimizer_setting
(
parameter_list
=
None
):
optimizer
=
paddle
.
optimizer
.
Momentum
(
learning_rate
=
base_lr
,
momentum
=
momentum_rate
,
weight_decay
=
paddle
.
regularizer
.
L2Decay
(
l2_decay
),
parameters
=
parameter_list
,
)
return
optimizer
def
train_mlp
():
fleet
.
init
(
is_collective
=
True
)
group
=
paddle
.
distributed
.
new_group
([
0
,
1
])
mlp
=
MLP
()
optimizer
=
optimizer_setting
(
parameter_list
=
mlp
.
parameters
())
oss_optimizer
=
ShardingOptimizerStage2
(
params
=
mlp
.
parameters
(),
optim
=
optimizer
,
group
=
group
)
# cover grad_storage code
trainable_param2align
=
dict
()
for
p
in
mlp
.
parameters
():
trainable_param2align
[
p
.
name
]
=
0
grad_storage
=
GradStorage
(
10000
,
dtype
=
paddle
.
float32
,
device
=
"gpu"
,
destination
=
0
,
parm2align
=
trainable_param2align
,
)
for
p
in
mlp
.
parameters
():
grad_storage
.
can_add_grad_view
(
p
,
trainable_param2align
[
p
.
name
])
grad_storage
.
add_grad
(
p
,
trainable_param2align
[
p
.
name
])
grad_storage
.
manumal_relase
()
grad_storage
.
rebuild
()
grad_storage
.
reset_checked_in
()
train_reader
=
paddle
.
batch
(
reader_decorator
(),
batch_size
=
batch_size
,
drop_last
=
True
)
train_loader
=
paddle
.
io
.
DataLoader
.
from_generator
(
capacity
=
32
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
,
use_multiprocess
=
True
,
)
train_loader
.
set_sample_list_generator
(
train_reader
)
for
eop
in
range
(
epoch
):
mlp
.
train
()
for
batch_id
,
data
in
enumerate
(
train_loader
()):
img
,
label
=
data
label
.
stop_gradient
=
True
img
.
stop_gradient
=
True
out
=
mlp
(
img
)
loss
=
paddle
.
nn
.
functional
.
cross_entropy
(
input
=
out
,
label
=
label
)
avg_loss
=
paddle
.
mean
(
x
=
loss
)
acc_top1
=
paddle
.
metric
.
accuracy
(
input
=
out
,
label
=
label
,
k
=
1
)
acc_top5
=
paddle
.
metric
.
accuracy
(
input
=
out
,
label
=
label
,
k
=
5
)
dy_out
=
avg_loss
.
numpy
()
avg_loss
.
backward
()
oss_optimizer
.
step
()
# oss_optimizer clear cache
oss_optimizer
.
_clear_cache
()
# check optimizer.minimize() error
try
:
oss_optimizer
.
minimize
()
except
:
print
(
"====== Find sharding_stage2_optimizer.minimize() error ======"
)
return
if
__name__
==
'__main__'
:
train_mlp
()
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage2.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
shutil
import
tempfile
import
numpy
as
np
import
paddle
import
paddle.fluid
as
fluid
from
paddle.distributed
import
fleet
from
paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2
import
(
ShardingOptimizerStage2
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2
import
(
ShardingStage2
,
)
from
paddle.nn
import
Linear
seed
=
2022
epoch
=
2
linear_size
=
1000
strategy
=
fleet
.
DistributedStrategy
()
strategy
.
hybrid_configs
=
{
"dp_degree"
:
2
,
"mp_degree"
:
1
,
"pp_degree"
:
1
,
"sharding_degree"
:
1
,
}
np
.
random
.
seed
(
seed
)
paddle
.
seed
(
seed
)
class
MLP
(
fluid
.
Layer
):
def
__init__
(
self
,
linear_size
=
1000
,
param_attr
=
None
,
bias_attr
=
None
):
super
().
__init__
()
self
.
_linear1
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear2
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear3
=
Linear
(
linear_size
,
10
)
def
forward
(
self
,
inputs
):
y
=
self
.
_linear1
(
inputs
)
y
=
self
.
_linear2
(
y
)
y
=
self
.
_linear3
(
y
)
return
y
def
reader_decorator
(
linear_size
=
1000
):
def
__reader__
():
for
_
in
range
(
100
):
img
=
np
.
random
.
rand
(
linear_size
).
astype
(
'float32'
)
label
=
np
.
ones
(
1
).
astype
(
'int64'
)
yield
img
,
label
return
__reader__
def
optimizer_setting
(
model
,
use_pure_fp16
,
opt_group
=
False
):
clip
=
paddle
.
nn
.
ClipGradByGlobalNorm
(
clip_norm
=
1.0
)
optimizer
=
paddle
.
optimizer
.
AdamW
(
parameters
=
[{
"params"
:
model
.
parameters
()}]
if
opt_group
else
model
.
parameters
(),
learning_rate
=
0.001
,
weight_decay
=
0.00001
,
grad_clip
=
clip
,
multi_precision
=
use_pure_fp16
,
)
return
optimizer
def
train_mlp
(
model
,
sharding_stage
,
batch_size
=
100
,
use_pure_fp16
=
False
,
accumulate_grad
=
False
,
opt_group
=
False
,
save_model
=
False
,
):
if
sharding_stage
==
"dp"
:
hcg
=
fleet
.
get_hybrid_communicate_group
()
group
=
hcg
.
get_check_parallel_group
()
else
:
group
=
paddle
.
distributed
.
new_group
([
0
,
1
])
if
opt_group
:
optimizer
=
optimizer_setting
(
model
=
model
,
use_pure_fp16
=
use_pure_fp16
,
opt_group
=
opt_group
)
else
:
optimizer
=
optimizer_setting
(
model
=
model
,
use_pure_fp16
=
use_pure_fp16
)
if
sharding_stage
==
2
:
optimizer
=
ShardingOptimizerStage2
(
params
=
model
.
parameters
(),
optim
=
optimizer
,
group
=
group
)
model
=
ShardingStage2
(
model
,
optimizer
,
group
=
group
,
buffer_max_size
=
2
**
21
)
else
:
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
)
model
=
fleet
.
distributed_model
(
model
)
train_reader
=
paddle
.
batch
(
reader_decorator
(),
batch_size
=
batch_size
,
drop_last
=
True
)
train_loader
=
paddle
.
io
.
DataLoader
.
from_generator
(
capacity
=
32
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
,
use_multiprocess
=
True
,
)
train_loader
.
set_sample_list_generator
(
train_reader
)
if
sharding_stage
==
2
:
model
.
to
(
device
=
"gpu"
)
for
eop
in
range
(
epoch
):
model
.
train
()
for
batch_id
,
data
in
enumerate
(
train_loader
()):
img
,
label
=
data
label
.
stop_gradient
=
True
img
.
stop_gradient
=
True
out
=
model
(
img
)
loss
=
paddle
.
nn
.
functional
.
cross_entropy
(
input
=
out
,
label
=
label
)
avg_loss
=
paddle
.
mean
(
x
=
loss
.
cast
(
dtype
=
paddle
.
float32
))
if
batch_size
==
20
:
avg_loss
=
avg_loss
/
5
avg_loss
.
backward
()
if
not
accumulate_grad
:
optimizer
.
step
()
optimizer
.
clear_grad
()
if
accumulate_grad
:
optimizer
.
step
()
optimizer
.
clear_grad
()
if
save_model
:
return
model
,
optimizer
return
model
.
parameters
()
def
test_dp_stage2
():
mlp
=
MLP
()
state_dict
=
mlp
.
state_dict
()
mlp1
=
MLP
()
mlp2
=
MLP
()
mlp3
=
MLP
()
mlp4
=
MLP
()
mlp5
=
MLP
()
mlp6
=
MLP
()
mlp1
.
set_state_dict
(
state_dict
)
mlp2
.
set_state_dict
(
state_dict
)
mlp3
.
set_state_dict
(
state_dict
)
mlp4
.
set_state_dict
(
state_dict
)
mlp5
.
set_state_dict
(
state_dict
)
mlp6
.
set_state_dict
(
state_dict
)
# DP VS stage2
dp_params
=
train_mlp
(
mlp1
,
sharding_stage
=
"dp"
,
use_pure_fp16
=
False
,
opt_group
=
False
)
stage2_params
=
train_mlp
(
mlp2
,
sharding_stage
=
2
,
use_pure_fp16
=
False
,
opt_group
=
False
)
for
i
in
range
(
len
(
dp_params
)):
np
.
testing
.
assert_allclose
(
dp_params
[
i
].
numpy
(),
stage2_params
[
i
].
numpy
(),
rtol
=
1e-6
)
# stage2 accumulate grad
stage2_params
=
train_mlp
(
mlp3
,
sharding_stage
=
2
,
accumulate_grad
=
True
)
stage2_accumulate_grad
=
train_mlp
(
mlp4
,
sharding_stage
=
2
,
batch_size
=
20
,
accumulate_grad
=
True
)
for
i
in
range
(
len
(
stage2_params
)):
np
.
testing
.
assert_allclose
(
stage2_params
[
i
].
numpy
(),
stage2_accumulate_grad
[
i
].
numpy
(),
rtol
=
1e-5
,
atol
=
1e-5
,
)
# stage2 param list VS param group
stage2_params
=
train_mlp
(
mlp5
,
sharding_stage
=
2
,
use_pure_fp16
=
False
,
opt_group
=
True
)
for
i
in
range
(
len
(
dp_params
)):
np
.
testing
.
assert_allclose
(
dp_params
[
i
].
numpy
(),
stage2_params
[
i
].
numpy
(),
rtol
=
1e-6
)
# save/load model
output_dir
=
tempfile
.
mkdtemp
()
model_file
=
os
.
path
.
join
(
output_dir
,
"model.pdmodel"
)
optimizer_file
=
os
.
path
.
join
(
output_dir
,
"model.pdopt"
)
model_stage2
,
optimizer_stage2
=
train_mlp
(
mlp6
,
sharding_stage
=
2
,
use_pure_fp16
=
False
,
opt_group
=
False
,
save_model
=
True
,
)
paddle
.
save
(
model_stage2
.
state_dict
(),
model_file
)
paddle
.
save
(
optimizer_stage2
.
state_dict
(),
optimizer_file
)
m_state_dict
=
paddle
.
load
(
model_file
)
opt_state_dict
=
paddle
.
load
(
optimizer_file
)
model_stage2
.
set_state_dict
(
m_state_dict
)
optimizer_stage2
.
set_state_dict
(
opt_state_dict
)
shutil
.
rmtree
(
output_dir
)
return
if
__name__
==
'__main__'
:
fleet
.
init
(
is_collective
=
True
,
strategy
=
strategy
)
test_dp_stage2
()
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage2_offload.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
from
dygraph_sharding_stage2
import
MLP
,
optimizer_setting
,
reader_decorator
import
paddle
from
paddle.distributed
import
fleet
from
paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2
import
(
ShardingOptimizerStage2
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2
import
(
ShardingStage2
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_utils
import
(
ShardingScaler
,
)
seed
=
2021
epoch
=
2
batch_size
=
32
linear_size
=
1000
strategy
=
fleet
.
DistributedStrategy
()
strategy
.
hybrid_configs
=
{
"dp_degree"
:
2
,
"mp_degree"
:
1
,
"pp_degree"
:
1
,
"sharding_degree"
:
1
,
}
np
.
random
.
seed
(
seed
)
paddle
.
seed
(
seed
)
def
train_mlp
(
model
,
offload
=
False
):
optimizer
=
optimizer_setting
(
model
=
model
,
use_pure_fp16
=
True
)
model
=
paddle
.
amp
.
decorate
(
models
=
model
,
level
=
'O2'
,
save_dtype
=
'float32'
)
scaler
=
paddle
.
amp
.
GradScaler
(
init_loss_scaling
=
1024
)
scaler
=
ShardingScaler
(
scaler
)
optimizer
=
ShardingOptimizerStage2
(
params
=
model
.
parameters
(),
optim
=
optimizer
,
offload
=
offload
)
model
=
ShardingStage2
(
model
,
optimizer
,
buffer_max_size
=
2
**
21
)
train_reader
=
paddle
.
batch
(
reader_decorator
(
linear_size
),
batch_size
=
batch_size
,
drop_last
=
True
)
train_loader
=
paddle
.
io
.
DataLoader
.
from_generator
(
capacity
=
32
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
,
use_multiprocess
=
True
,
)
train_loader
.
set_sample_list_generator
(
train_reader
)
for
eop
in
range
(
epoch
):
model
.
train
()
for
batch_id
,
data
in
enumerate
(
train_loader
()):
img
,
label
=
data
label
.
stop_gradient
=
True
img
.
stop_gradient
=
True
with
paddle
.
amp
.
auto_cast
(
True
,
level
=
'O2'
):
out
=
model
(
img
)
loss
=
paddle
.
nn
.
functional
.
cross_entropy
(
input
=
out
,
label
=
label
)
avg_loss
=
paddle
.
mean
(
x
=
loss
.
cast
(
dtype
=
paddle
.
float32
))
scaler
.
scale
(
avg_loss
).
backward
()
scaler
.
step
(
optimizer
)
scaler
.
update
()
optimizer
.
clear_grad
()
for
dtype
in
optimizer
.
param_storages
:
for
dst_rank
,
param_storage
in
optimizer
.
param_storages
[
dtype
].
items
():
param_storage
.
to
(
device
=
"gpu"
,
dtype
=
dtype
)
return
model
.
parameters
()
def
test_sharding_stage2_offload
():
mlp
=
MLP
(
linear_size
)
mlp_offload
=
MLP
(
linear_size
)
mlp_offload
.
set_state_dict
(
mlp
.
state_dict
())
mlp_params
=
train_mlp
(
mlp
,
offload
=
False
)
mlp_offload_params
=
train_mlp
(
mlp_offload
,
offload
=
True
)
for
i
in
range
(
len
(
mlp_params
)):
np
.
testing
.
assert_allclose
(
mlp_params
[
i
].
numpy
(),
mlp_offload_params
[
i
].
numpy
(),
rtol
=
5e-3
,
atol
=
5e-3
,
)
return
if
__name__
==
'__main__'
:
fleet
.
init
(
is_collective
=
True
,
strategy
=
strategy
)
test_sharding_stage2_offload
()
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage3.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
shutil
import
tempfile
import
numpy
as
np
import
paddle
import
paddle.fluid
as
fluid
from
paddle.distributed
import
fleet
from
paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2
import
(
ShardingOptimizerStage2
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2
import
(
ShardingStage2
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3
import
(
ShardingStage3
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_utils
import
(
ShardingScaler
,
)
from
paddle.nn
import
Linear
epoch
=
10
paddle
.
seed
(
2021
)
np
.
random
.
seed
(
2021
)
base_lr
=
0.1
momentum_rate
=
0.9
l2_decay
=
1e-4
class
MLP
(
fluid
.
Layer
):
def
__init__
(
self
,
linear_size
=
1000
,
param_attr
=
None
,
bias_attr
=
None
):
super
().
__init__
()
self
.
_linear1
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear2
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear3
=
Linear
(
linear_size
,
10
)
def
forward
(
self
,
inputs
):
y
=
self
.
_linear1
(
inputs
)
y
=
self
.
_linear2
(
y
)
y
=
self
.
_linear3
(
y
)
return
y
def
reader_decorator
(
linear_size
=
1000
):
def
__reader__
():
for
_
in
range
(
100
):
img
=
np
.
random
.
rand
(
linear_size
).
astype
(
'float32'
)
label
=
np
.
ones
(
1
).
astype
(
'int64'
)
yield
img
,
label
return
__reader__
def
optimizer_setting
(
model
,
use_pure_fp16
,
opt_group
=
False
):
clip
=
paddle
.
nn
.
ClipGradByGlobalNorm
(
clip_norm
=
1.0
)
optimizer
=
paddle
.
optimizer
.
Momentum
(
parameters
=
[{
"params"
:
list
(
model
.
parameters
())}]
if
opt_group
else
list
(
model
.
parameters
()),
learning_rate
=
0.001
,
weight_decay
=
0.00001
,
grad_clip
=
clip
,
multi_precision
=
use_pure_fp16
,
)
return
optimizer
def
train_mlp
(
model
,
sharding_stage
,
use_pure_fp16
=
False
,
accumulate_grad
=
False
,
batch_size
=
100
,
opt_group
=
False
,
sync_comm
=
False
,
test_minimize
=
False
,
save_model
=
False
,
):
group
=
paddle
.
distributed
.
new_group
([
0
,
1
])
if
opt_group
:
optimizer
=
optimizer_setting
(
model
=
model
,
use_pure_fp16
=
use_pure_fp16
,
opt_group
=
opt_group
)
else
:
optimizer
=
optimizer_setting
(
model
=
model
,
use_pure_fp16
=
use_pure_fp16
)
if
use_pure_fp16
:
model
=
paddle
.
amp
.
decorate
(
models
=
model
,
level
=
'O2'
,
save_dtype
=
'float32'
)
scaler
=
paddle
.
amp
.
GradScaler
(
init_loss_scaling
=
32768
)
scaler
=
ShardingScaler
(
scaler
)
if
sharding_stage
==
2
:
optimizer
=
ShardingOptimizerStage2
(
params
=
model
.
parameters
(),
optim
=
optimizer
,
group
=
group
)
model
=
ShardingStage2
(
model
,
optimizer
,
group
=
group
,
buffer_max_size
=
2
**
21
)
elif
sharding_stage
==
3
:
model
=
ShardingStage3
(
model
,
optimizer
=
optimizer
,
group
=
group
,
sync_comm
=
sync_comm
)
# check optimizer.minimize() error
if
test_minimize
:
try
:
optimizer
.
minimize
()
except
:
print
(
"====== Find sharding_stage3_optimizer.minimize() error ======"
)
return
train_reader
=
paddle
.
batch
(
reader_decorator
(),
batch_size
=
batch_size
,
drop_last
=
True
)
train_loader
=
paddle
.
io
.
DataLoader
.
from_generator
(
capacity
=
32
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
,
use_multiprocess
=
True
,
)
train_loader
.
set_sample_list_generator
(
train_reader
)
for
eop
in
range
(
epoch
):
model
.
train
()
for
batch_id
,
data
in
enumerate
(
train_loader
()):
img
,
label
=
data
label
.
stop_gradient
=
True
img
.
stop_gradient
=
True
with
paddle
.
amp
.
auto_cast
(
True
,
level
=
'O2'
):
out
=
model
(
img
)
loss
=
paddle
.
nn
.
functional
.
cross_entropy
(
input
=
out
,
label
=
label
)
avg_loss
=
paddle
.
mean
(
x
=
loss
.
cast
(
dtype
=
paddle
.
float32
))
if
batch_size
==
20
:
avg_loss
=
avg_loss
/
5
if
not
use_pure_fp16
:
avg_loss
.
backward
()
else
:
scaler
.
scale
(
avg_loss
).
backward
()
if
not
accumulate_grad
:
if
not
use_pure_fp16
:
optimizer
.
step
()
else
:
scaler
.
step
(
optimizer
)
scaler
.
update
()
optimizer
.
clear_grad
()
if
accumulate_grad
:
if
not
use_pure_fp16
:
optimizer
.
step
()
else
:
scaler
.
step
(
optimizer
)
scaler
.
update
()
optimizer
.
clear_grad
()
if
sharding_stage
==
3
:
model
.
get_all_parameters
()
if
save_model
:
return
model
,
optimizer
return
model
.
parameters
()
def
test_stage2_stage3
():
mlp
,
mlp1
,
mlp2
,
mlp3
,
mlp4
,
mlp5
,
mlp6
,
mlp7
,
mlp8
,
mlp9
,
mlp10
=
(
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
)
state_dict
=
mlp
.
state_dict
()
mlp1
.
set_state_dict
(
state_dict
)
mlp2
.
set_state_dict
(
state_dict
)
mlp3
.
set_state_dict
(
state_dict
)
mlp4
.
set_state_dict
(
state_dict
)
mlp5
.
set_state_dict
(
state_dict
)
mlp6
.
set_state_dict
(
state_dict
)
mlp7
.
set_state_dict
(
state_dict
)
mlp8
.
set_state_dict
(
state_dict
)
mlp9
.
set_state_dict
(
state_dict
)
mlp10
.
set_state_dict
(
state_dict
)
# fp32
stage2_params
=
train_mlp
(
mlp1
,
sharding_stage
=
2
,
use_pure_fp16
=
False
,
opt_group
=
False
)
stage3_params
=
train_mlp
(
mlp2
,
sharding_stage
=
3
,
use_pure_fp16
=
False
,
opt_group
=
False
)
for
i
in
range
(
len
(
stage2_params
)):
np
.
testing
.
assert_allclose
(
stage2_params
[
i
].
numpy
(),
stage3_params
[
i
].
numpy
(),
rtol
=
1e-6
,
atol
=
1e-6
,
)
# fp32 accumulate grad
stage3_params
=
train_mlp
(
mlp3
,
sharding_stage
=
3
,
use_pure_fp16
=
False
,
accumulate_grad
=
True
,
opt_group
=
True
,
)
stage3_params_add
=
train_mlp
(
mlp4
,
sharding_stage
=
3
,
use_pure_fp16
=
False
,
accumulate_grad
=
True
,
batch_size
=
20
,
opt_group
=
True
,
)
for
i
in
range
(
len
(
stage3_params
)):
np
.
testing
.
assert_allclose
(
stage3_params
[
i
].
numpy
(),
stage3_params_add
[
i
].
numpy
(),
rtol
=
1e-6
,
atol
=
1e-4
,
)
# fp16
stage2_params
=
train_mlp
(
mlp5
,
sharding_stage
=
2
,
use_pure_fp16
=
True
,
opt_group
=
False
)
stage3_params
=
train_mlp
(
mlp6
,
sharding_stage
=
3
,
use_pure_fp16
=
True
,
opt_group
=
False
)
for
i
in
range
(
len
(
stage2_params
)):
np
.
testing
.
assert_allclose
(
stage2_params
[
i
].
numpy
(),
stage3_params
[
i
].
numpy
(),
rtol
=
1e-4
,
atol
=
1e-3
,
)
# fp16 sync_comm
stage3_params
=
train_mlp
(
mlp7
,
sharding_stage
=
3
,
use_pure_fp16
=
True
,
opt_group
=
False
)
stage3_params_re
=
train_mlp
(
mlp8
,
sharding_stage
=
3
,
use_pure_fp16
=
True
,
opt_group
=
False
,
sync_comm
=
True
,
)
for
i
in
range
(
len
(
stage3_params
)):
np
.
testing
.
assert_allclose
(
stage3_params
[
i
].
numpy
(),
stage3_params_re
[
i
].
numpy
(),
rtol
=
1e-6
)
# save/load model
output_dir
=
tempfile
.
mkdtemp
()
model_file
=
os
.
path
.
join
(
output_dir
,
"model.pdmodel"
)
optimizer_file
=
os
.
path
.
join
(
output_dir
,
"model.pdopt"
)
model_stage3
,
optimizer_stage3
=
train_mlp
(
mlp9
,
sharding_stage
=
3
,
use_pure_fp16
=
False
,
opt_group
=
False
,
save_model
=
True
,
)
paddle
.
save
(
model_stage3
.
state_dict
(),
model_file
)
paddle
.
save
(
optimizer_stage3
.
state_dict
(),
optimizer_file
)
m_state_dict
=
paddle
.
load
(
model_file
)
opt_state_dict
=
paddle
.
load
(
optimizer_file
)
model_stage3
.
set_state_dict
(
m_state_dict
)
optimizer_stage3
.
set_state_dict
(
opt_state_dict
)
shutil
.
rmtree
(
output_dir
)
# check optimizer.minimize() error
train_mlp
(
mlp10
,
sharding_stage
=
3
,
use_pure_fp16
=
False
,
opt_group
=
False
,
test_minimize
=
True
,
)
if
__name__
==
'__main__'
:
fleet
.
init
(
is_collective
=
True
)
test_stage2_stage3
()
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage3_offload.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
import
paddle
import
paddle.fluid
as
fluid
from
paddle.distributed
import
fleet
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3
import
(
ShardingStage3
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_utils
import
(
ShardingScaler
,
)
from
paddle.nn
import
Linear
epoch
=
10
paddle
.
seed
(
2022
)
np
.
random
.
seed
(
2022
)
base_lr
=
0.1
momentum_rate
=
0.9
l2_decay
=
1e-4
class
MLP
(
fluid
.
Layer
):
def
__init__
(
self
,
linear_size
=
1000
,
param_attr
=
None
,
bias_attr
=
None
):
super
().
__init__
()
self
.
_linear1
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear2
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear3
=
Linear
(
linear_size
,
10
)
def
forward
(
self
,
inputs
):
y
=
self
.
_linear1
(
inputs
)
y
=
self
.
_linear2
(
y
)
y
=
self
.
_linear3
(
y
)
return
y
def
reader_decorator
(
linear_size
=
1000
):
def
__reader__
():
for
_
in
range
(
100
):
img
=
np
.
random
.
rand
(
linear_size
).
astype
(
'float32'
)
label
=
np
.
ones
(
1
).
astype
(
'int64'
)
yield
img
,
label
return
__reader__
def
optimizer_setting
(
model
,
use_pure_fp16
,
opt_group
=
False
):
clip
=
paddle
.
nn
.
ClipGradByGlobalNorm
(
clip_norm
=
1.0
)
optimizer
=
paddle
.
optimizer
.
AdamW
(
parameters
=
[{
"params"
:
model
.
parameters
()}]
if
opt_group
else
model
.
parameters
(),
learning_rate
=
0.001
,
weight_decay
=
0.00001
,
grad_clip
=
clip
,
multi_precision
=
use_pure_fp16
,
)
return
optimizer
def
train_mlp
(
model
,
use_pure_fp16
=
False
,
accumulate_grad
=
False
,
offload
=
False
,
batch_size
=
100
,
convert2cpu
=
False
,
):
group
=
paddle
.
distributed
.
new_group
([
0
,
1
])
optimizer
=
optimizer_setting
(
model
=
model
,
use_pure_fp16
=
use_pure_fp16
)
if
use_pure_fp16
:
model
=
paddle
.
amp
.
decorate
(
models
=
model
,
level
=
'O2'
,
save_dtype
=
'float32'
)
scaler
=
paddle
.
amp
.
GradScaler
(
init_loss_scaling
=
32768
)
scaler
=
ShardingScaler
(
scaler
)
model
=
ShardingStage3
(
model
,
optimizer
=
optimizer
,
group
=
group
,
offload
=
offload
)
train_reader
=
paddle
.
batch
(
reader_decorator
(),
batch_size
=
batch_size
,
drop_last
=
True
)
train_loader
=
paddle
.
io
.
DataLoader
.
from_generator
(
capacity
=
32
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
,
use_multiprocess
=
True
,
)
train_loader
.
set_sample_list_generator
(
train_reader
)
for
eop
in
range
(
epoch
):
model
.
train
()
for
batch_id
,
data
in
enumerate
(
train_loader
()):
img
,
label
=
data
label
.
stop_gradient
=
True
img
.
stop_gradient
=
True
with
paddle
.
amp
.
auto_cast
(
True
,
level
=
'O2'
):
out
=
model
(
img
)
loss
=
paddle
.
nn
.
functional
.
cross_entropy
(
input
=
out
,
label
=
label
)
avg_loss
=
paddle
.
mean
(
x
=
loss
.
cast
(
dtype
=
paddle
.
float32
))
if
accumulate_grad
:
avg_loss
=
avg_loss
/
5
if
not
use_pure_fp16
:
avg_loss
.
backward
()
else
:
scaler
.
scale
(
avg_loss
).
backward
()
if
not
accumulate_grad
:
if
not
use_pure_fp16
:
optimizer
.
step
()
else
:
scaler
.
step
(
optimizer
)
scaler
.
update
()
optimizer
.
clear_grad
()
if
accumulate_grad
:
if
not
use_pure_fp16
:
optimizer
.
step
()
else
:
scaler
.
step
(
optimizer
)
scaler
.
update
()
optimizer
.
clear_grad
()
if
not
convert2cpu
:
model
.
get_all_parameters
()
else
:
model
.
get_all_parameters
(
convert2cpu
)
return
model
.
parameters
()
def
test_stage3_offload
():
mlp
,
mlp1
,
mlp2
,
mlp3
,
mlp4
,
mlp5
,
mlp6
=
(
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
)
state_dict
=
mlp
.
state_dict
()
mlp1
.
set_state_dict
(
state_dict
)
mlp2
.
set_state_dict
(
state_dict
)
mlp3
.
set_state_dict
(
state_dict
)
mlp4
.
set_state_dict
(
state_dict
)
mlp5
.
set_state_dict
(
state_dict
)
mlp6
.
set_state_dict
(
state_dict
)
# fp32 offload
stage3_params
=
train_mlp
(
mlp1
,
use_pure_fp16
=
False
)
stage3_params_offload
=
train_mlp
(
mlp2
,
use_pure_fp16
=
False
,
offload
=
True
)
for
i
in
range
(
len
(
stage3_params
)):
np
.
testing
.
assert_allclose
(
stage3_params
[
i
].
numpy
(),
stage3_params_offload
[
i
].
numpy
(),
rtol
=
1e-6
,
atol
=
1e-8
,
)
# fp16 offload
stage3_params
=
train_mlp
(
mlp3
,
use_pure_fp16
=
True
)
stage3_params_offload
=
train_mlp
(
mlp4
,
use_pure_fp16
=
True
,
offload
=
True
)
for
i
in
range
(
len
(
stage3_params
)):
np
.
testing
.
assert_allclose
(
stage3_params
[
i
].
numpy
(),
stage3_params_offload
[
i
].
numpy
(),
rtol
=
1e-2
,
atol
=
1e-2
,
)
# fp32 accumulate grad offload
stage3_params
=
train_mlp
(
mlp5
,
use_pure_fp16
=
False
,
batch_size
=
20
,
accumulate_grad
=
True
)
stage3_params_offload
=
train_mlp
(
mlp6
,
use_pure_fp16
=
False
,
accumulate_grad
=
True
,
offload
=
True
,
batch_size
=
20
,
convert2cpu
=
True
,
)
for
i
in
range
(
len
(
stage3_params
)):
np
.
testing
.
assert_allclose
(
stage3_params
[
i
].
numpy
(),
stage3_params_offload
[
i
].
numpy
(),
rtol
=
1e-6
,
atol
=
1e-8
,
)
return
if
__name__
==
'__main__'
:
fleet
.
init
(
is_collective
=
True
)
test_stage3_offload
()
python/paddle/fluid/tests/unittests/collective/multinode/mn_dygraph_sharding_stage2.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
shutil
import
tempfile
import
numpy
as
np
import
paddle
import
paddle.fluid
as
fluid
from
paddle.distributed
import
fleet
from
paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2
import
(
ShardingOptimizerStage2
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2
import
(
ShardingStage2
,
)
from
paddle.nn
import
Linear
seed
=
2022
epoch
=
2
linear_size
=
1000
strategy
=
fleet
.
DistributedStrategy
()
strategy
.
hybrid_configs
=
{
"dp_degree"
:
16
,
"mp_degree"
:
1
,
"pp_degree"
:
1
,
"sharding_degree"
:
1
,
}
np
.
random
.
seed
(
seed
)
paddle
.
seed
(
seed
)
class
MLP
(
fluid
.
Layer
):
def
__init__
(
self
,
linear_size
=
1000
,
param_attr
=
None
,
bias_attr
=
None
):
super
().
__init__
()
self
.
_linear1
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear2
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear3
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear4
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear5
=
Linear
(
linear_size
,
10
)
def
forward
(
self
,
inputs
):
y
=
self
.
_linear1
(
inputs
)
y
=
self
.
_linear2
(
y
)
y
=
self
.
_linear3
(
y
)
y
=
self
.
_linear4
(
y
)
y
=
self
.
_linear5
(
y
)
return
y
def
reader_decorator
(
linear_size
=
1000
):
def
__reader__
():
for
_
in
range
(
100
):
img
=
np
.
random
.
rand
(
linear_size
).
astype
(
'float32'
)
label
=
np
.
ones
(
1
).
astype
(
'int64'
)
yield
img
,
label
return
__reader__
def
optimizer_setting
(
model
,
use_pure_fp16
,
opt_group
=
False
):
clip
=
paddle
.
nn
.
ClipGradByGlobalNorm
(
clip_norm
=
1.0
)
optimizer
=
paddle
.
optimizer
.
AdamW
(
parameters
=
[{
"params"
:
model
.
parameters
()}]
if
opt_group
else
model
.
parameters
(),
learning_rate
=
0.001
,
weight_decay
=
0.00001
,
grad_clip
=
clip
,
multi_precision
=
use_pure_fp16
,
)
return
optimizer
def
train_mlp
(
model
,
sharding_stage
,
batch_size
=
100
,
use_pure_fp16
=
False
,
accumulate_grad
=
False
,
opt_group
=
False
,
save_model
=
False
,
):
if
sharding_stage
==
"dp"
:
hcg
=
fleet
.
get_hybrid_communicate_group
()
group
=
hcg
.
get_check_parallel_group
()
else
:
group
=
paddle
.
distributed
.
new_group
(
[
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
]
)
if
opt_group
:
optimizer
=
optimizer_setting
(
model
=
model
,
use_pure_fp16
=
use_pure_fp16
,
opt_group
=
opt_group
)
else
:
optimizer
=
optimizer_setting
(
model
=
model
,
use_pure_fp16
=
use_pure_fp16
)
if
sharding_stage
==
2
:
optimizer
=
ShardingOptimizerStage2
(
params
=
model
.
parameters
(),
optim
=
optimizer
,
group
=
group
)
model
=
ShardingStage2
(
model
,
optimizer
,
group
=
group
,
buffer_max_size
=
2
**
21
)
else
:
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
)
model
=
fleet
.
distributed_model
(
model
)
train_reader
=
paddle
.
batch
(
reader_decorator
(),
batch_size
=
batch_size
,
drop_last
=
True
)
train_loader
=
paddle
.
io
.
DataLoader
.
from_generator
(
capacity
=
32
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
,
use_multiprocess
=
True
,
)
train_loader
.
set_sample_list_generator
(
train_reader
)
if
sharding_stage
==
2
:
model
.
to
(
device
=
"gpu"
)
for
eop
in
range
(
epoch
):
model
.
train
()
for
batch_id
,
data
in
enumerate
(
train_loader
()):
img
,
label
=
data
label
.
stop_gradient
=
True
img
.
stop_gradient
=
True
out
=
model
(
img
)
loss
=
paddle
.
nn
.
functional
.
cross_entropy
(
input
=
out
,
label
=
label
)
avg_loss
=
paddle
.
mean
(
x
=
loss
.
cast
(
dtype
=
paddle
.
float32
))
if
batch_size
==
20
:
avg_loss
=
avg_loss
/
5
avg_loss
.
backward
()
if
not
accumulate_grad
:
optimizer
.
step
()
optimizer
.
clear_grad
()
if
accumulate_grad
:
optimizer
.
step
()
optimizer
.
clear_grad
()
if
save_model
:
return
model
,
optimizer
return
model
.
parameters
()
def
test_dp_stage2
():
mlp
=
MLP
()
state_dict
=
mlp
.
state_dict
()
mlp1
=
MLP
()
mlp2
=
MLP
()
mlp3
=
MLP
()
mlp4
=
MLP
()
mlp5
=
MLP
()
mlp6
=
MLP
()
mlp1
.
set_state_dict
(
state_dict
)
mlp2
.
set_state_dict
(
state_dict
)
mlp3
.
set_state_dict
(
state_dict
)
mlp4
.
set_state_dict
(
state_dict
)
mlp5
.
set_state_dict
(
state_dict
)
mlp6
.
set_state_dict
(
state_dict
)
# DP VS stage2
dp_params
=
train_mlp
(
mlp1
,
sharding_stage
=
"dp"
,
use_pure_fp16
=
False
,
opt_group
=
False
)
stage2_params
=
train_mlp
(
mlp2
,
sharding_stage
=
2
,
use_pure_fp16
=
False
,
opt_group
=
False
)
for
i
in
range
(
len
(
dp_params
)):
np
.
testing
.
assert_allclose
(
dp_params
[
i
].
numpy
(),
stage2_params
[
i
].
numpy
(),
rtol
=
1e-6
,
atol
=
5e-4
)
# stage2 accumulate grad
stage2_params
=
train_mlp
(
mlp3
,
sharding_stage
=
2
,
accumulate_grad
=
True
)
stage2_accumulate_grad
=
train_mlp
(
mlp4
,
sharding_stage
=
2
,
batch_size
=
20
,
accumulate_grad
=
True
)
for
i
in
range
(
len
(
stage2_params
)):
np
.
testing
.
assert_allclose
(
stage2_params
[
i
].
numpy
(),
stage2_accumulate_grad
[
i
].
numpy
(),
rtol
=
1e-5
,
atol
=
1e-5
,
)
# stage2 param list VS param group
stage2_params
=
train_mlp
(
mlp5
,
sharding_stage
=
2
,
use_pure_fp16
=
False
,
opt_group
=
True
)
for
i
in
range
(
len
(
dp_params
)):
np
.
testing
.
assert_allclose
(
dp_params
[
i
].
numpy
(),
stage2_params
[
i
].
numpy
(),
rtol
=
1e-6
,
atol
=
5e-4
)
# save/load model
output_dir
=
tempfile
.
mkdtemp
()
try
:
model_file
=
os
.
path
.
join
(
output_dir
,
"model.pdmodel"
)
optimizer_file
=
os
.
path
.
join
(
output_dir
,
"model.pdopt"
)
model_stage2
,
optimizer_stage2
=
train_mlp
(
mlp6
,
sharding_stage
=
2
,
use_pure_fp16
=
False
,
opt_group
=
False
,
save_model
=
True
,
)
paddle
.
save
(
model_stage2
.
state_dict
(),
model_file
)
paddle
.
save
(
optimizer_stage2
.
state_dict
(),
optimizer_file
)
m_state_dict
=
paddle
.
load
(
model_file
)
opt_state_dict
=
paddle
.
load
(
optimizer_file
)
model_stage2
.
set_state_dict
(
m_state_dict
)
optimizer_stage2
.
set_state_dict
(
opt_state_dict
)
except
Exception
as
e
:
shutil
.
rmtree
(
output_dir
)
raise
e
else
:
shutil
.
rmtree
(
output_dir
)
if
__name__
==
'__main__'
:
fleet
.
init
(
is_collective
=
True
,
strategy
=
strategy
)
test_dp_stage2
()
python/paddle/fluid/tests/unittests/collective/multinode/test_multinode_dygraph_sharding.py
浏览文件 @
2bbdc47a
...
...
@@ -25,13 +25,6 @@ class TestDYgrapShardingDP(TestDistBase):
self
.
_trainers
=
16
self
.
_init_env
()
def
test_hybrid_sharding_stage2
(
self
):
self
.
check_with_place
(
"mn_dygraph_sharding_stage2.py"
,
backend
=
"nccl"
,
need_envs
=
os
.
environ
,
)
def
test_hybrid_sharding_stage3
(
self
):
self
.
check_with_place
(
"mn_dygraph_group_sharded_stage3.py"
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录