Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
2bbdc47a
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2305
Star
20932
Fork
5423
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
2bbdc47a
编写于
12月 27, 2022
作者:
W
wanghuancoder
提交者:
GitHub
12月 27, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
delete old dygraph sharding (#49334)
* delete old dygraph sharding
上级
2ca3d3f7
变更
14
展开全部
隐藏空白更改
内联
并排
Showing
14 changed file
with
31 addition
and
4128 deletion
+31
-4128
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
...optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
+0
-468
python/paddle/distributed/fleet/meta_parallel/sharding/__init__.py
...ddle/distributed/fleet/meta_parallel/sharding/__init__.py
+0
-2
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py
...stributed/fleet/meta_parallel/sharding/sharding_stage2.py
+0
-619
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py
...stributed/fleet/meta_parallel/sharding/sharding_stage3.py
+0
-1055
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
...istributed/fleet/meta_parallel/sharding/sharding_utils.py
+0
-252
python/paddle/distributed/fleet/utils/internal_storage.py
python/paddle/distributed/fleet/utils/internal_storage.py
+0
-348
python/paddle/distributed/sharding/group_sharded.py
python/paddle/distributed/sharding/group_sharded.py
+31
-80
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_optimizer_stage2.py
...sts/collective/fleet/dygraph_sharding_optimizer_stage2.py
+0
-144
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage2.py
...sts/unittests/collective/fleet/dygraph_sharding_stage2.py
+0
-242
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage2_offload.py
...tests/collective/fleet/dygraph_sharding_stage2_offload.py
+0
-122
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage3.py
...sts/unittests/collective/fleet/dygraph_sharding_stage3.py
+0
-319
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage3_offload.py
...tests/collective/fleet/dygraph_sharding_stage3_offload.py
+0
-219
python/paddle/fluid/tests/unittests/collective/multinode/mn_dygraph_sharding_stage2.py
...ttests/collective/multinode/mn_dygraph_sharding_stage2.py
+0
-251
python/paddle/fluid/tests/unittests/collective/multinode/test_multinode_dygraph_sharding.py
...s/collective/multinode/test_multinode_dygraph_sharding.py
+0
-7
未找到文件。
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The file has been adapted from fairscale file:
# https://github.com/facebookresearch/fairscale/blob/main/fairscale/optim/oss.py
# Git commit hash: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e
# We retain the following license from the original files:
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import
logging
from
collections
import
OrderedDict
import
numpy
as
np
import
paddle
import
paddle.distributed
as
dist
from
paddle.distributed.collective
import
_get_global_group
,
new_group
from
paddle.fluid.clip
import
ClipGradByGlobalNorm
from
paddle.framework
import
core
from
paddle.optimizer
import
Optimizer
from
...meta_parallel.sharding.sharding_utils
import
(
ShardingClipGrad
,
Type
,
device_guard
,
)
from
...utils.internal_storage
import
GradStorage
,
ParamStorage
# CUDA alignment 256 bytes, cpu alignment 4096 bytes
alignment
=
{
"gpu"
:
256
,
"cpu"
:
4096
}
align
=
{
Type
.
fp16
.
value
:
2
,
Type
.
fp32
.
value
:
4
,
}
class
ShardingOptimizerStage2
(
Optimizer
):
"""
A wrapper for Sharding Stage2 Optimizer in Dygraph.
.. warning: ShardingOptimizer encapsulates the optimization strategy and integrates it into the optimizer.
.. ZeRO: 1.https://arxiv.org/pdf/1910.02054.pdf 2.https://arxiv.org/pdf/1910.02054.pdf.
"""
# TODO (Baibaifan)
# Feature Notes:
# 1. Unified memory for parameters and parameters.grad to InternalStorage.
# 2. Support the segmentation of optimizer parameters and partial updating of parameters.
# 3. Dynamically adjust training parameters and models.
# 4. Support offload function.
# 5. Support the establishment of independent communication groups.
# 6. Broadcast_fp16 is not supported now.
def
__init__
(
self
,
params
,
optim
,
group
=
None
,
offload
=
False
,
device
=
"gpu"
,
pertrain_sync_models
=
True
,
**
kw
):
super
().
__init__
(
optim
.
_learning_rate
,
params
,
kw
)
# Segmentation information
self
.
_dtype_rank_params
=
(
OrderedDict
()
)
# {dtype:[param1,param2]} device, rank, params
self
.
_param2rank
=
{}
self
.
__segment_params
=
[]
self
.
_rank_buffer_size
=
{}
# {dtype: {rank: numel+alignment}}
self
.
_param2align
=
{}
# {param.name: align}
# Default information
self
.
_optim_defaults
=
kw
self
.
_optim
=
optim
assert
hasattr
(
self
.
_optim
,
"_master_weights"
),
"Must use optimizer with _master_weights attribute"
self
.
_local_params
=
params
self
.
_default_device
=
device
self
.
_pfp16
=
(
len
(
list
(
filter
(
lambda
x
:
x
.
trainable
and
x
.
dtype
==
Type
.
fp16
.
value
,
self
.
_local_params
,
)
)
)
>
0
)
self
.
group
=
(
new_group
(
_get_global_group
().
ranks
)
if
group
is
None
else
group
)
self
.
world_size
=
self
.
group
.
nranks
self
.
rank
=
self
.
group
.
rank
self
.
_global_root_rank
=
self
.
group
.
ranks
[
0
]
# Synchronous all ranks models
if
pertrain_sync_models
:
self
.
_sync_params_and_buffers
()
self
.
param_storages
=
{}
# {dtype: {rank: InternalStorage}}
if
isinstance
(
self
.
_optim
.
_grad_clip
,
ClipGradByGlobalNorm
):
logging
.
warning
(
"While using ClipGradByGlobalNorm in ShardingOptimizer, the grad clip of original optimizer will be changed."
)
self
.
_optim
.
_grad_clip
=
ShardingClipGrad
(
self
.
_optim
.
_grad_clip
,
paddle
.
get_device
(),
self
.
group
)
if
self
.
_optim
.
_parameter_list
and
isinstance
(
self
.
_optim
.
_parameter_list
[
0
],
dict
):
for
item
in
self
.
_optim
.
_param_groups
:
if
"grad_clip"
in
item
.
keys
():
item
[
"grad_clip"
]
=
ShardingClipGrad
(
self
.
_optim
.
_grad_clip
,
paddle
.
get_device
(),
self
.
group
,
)
if
offload
:
assert
(
self
.
_pfp16
),
"Only support offload strategy while using
\'
Adam
\'
,
\'
AdamW
\'
and
\'
Momentum
\'
optimizer with AMP/Pure FP16"
self
.
offload
=
offload
# Using for offload
self
.
offload_device
=
"cpu"
self
.
offload_buffer_size
=
0
self
.
offload_param2align
=
{}
self
.
offload_params
=
None
self
.
offload_grads
=
None
self
.
_master_params
=
{}
# Update optimizer parameters and adjust parameter storage and use according to rank.
self
.
_update_opt_status
()
@
paddle
.
autograd
.
no_grad
()
def
_sync_params_and_buffers
(
self
):
"""
Sync all model states for all ranks
"""
for
p
in
self
.
_local_params
:
dist
.
broadcast
(
p
,
src
=
self
.
_global_root_rank
,
group
=
self
.
group
,
sync_op
=
True
)
# Multi stream operation will be supported later
dist
.
wait
(
tensor
=
p
,
group
=
self
.
group
,
use_calc_stream
=
True
)
def
_generate_master_params
(
self
,
trainable_params
):
if
self
.
offload
:
for
param
in
trainable_params
:
if
param
.
name
not
in
self
.
_master_params
.
keys
():
self
.
_master_params
[
param
.
name
]
=
core
.
VarBase
(
name
=
param
.
name
,
value
=
param
.
cast
(
dtype
=
Type
.
fp32
.
value
).
numpy
(),
place
=
core
.
CPUPlace
(),
stop_gradient
=
param
.
stop_gradient
,
)
else
:
for
param
in
trainable_params
:
if
param
.
dtype
==
Type
.
fp16
.
value
:
self
.
_optim
.
_master_weights
[
param
.
name
]
=
paddle
.
cast
(
param
,
Type
.
fp32
.
value
)
def
_update_opt_status
(
self
):
"""Update optimizer status and parameter storage information, and special functions to be developed."""
# func 1
self
.
_integration_params
()
# fun 2 TODO
# Segement helpers
def
_segment_params
(
self
):
"""
Divide all optimizer parameters equally into rank.
"""
if
len
(
self
.
__segment_params
)
==
0
:
self
.
__segment_params
,
param_lists
=
[
[]
for
_
in
range
(
self
.
world_size
)
],
[[]
for
_
in
range
(
self
.
world_size
)]
sizes
=
[
0
]
*
self
.
world_size
for
param
in
self
.
_local_params
:
# Add this param to rank with smallest size.
rank
=
sizes
.
index
(
min
(
sizes
))
param_lists
[
rank
].
append
(
param
)
# Statistical real numels
sizes
[
rank
]
+=
np
.
prod
(
param
.
shape
)
if
param
.
trainable
else
0
for
rank
,
params
in
enumerate
(
param_lists
):
self
.
__segment_params
[
rank
].
extend
(
params
)
return
self
.
__segment_params
@
property
def
local_params
(
self
):
return
self
.
_local_params
@
property
def
param2rank
(
self
):
"""Map the params to the rank which owns them"""
if
len
(
self
.
_param2rank
)
==
0
:
for
rank
,
params
in
enumerate
(
self
.
_segment_params
()):
for
param
in
params
:
self
.
_param2rank
[
param
.
name
]
=
rank
return
self
.
_param2rank
@
property
def
dtype_rank_params
(
self
):
"""
Divide the parameters into groups according to rank and dtype.
"""
if
len
(
self
.
_dtype_rank_params
)
==
0
:
# Assign the parameters of each rank according to the type
for
param
in
self
.
_local_params
:
if
param
.
dtype
not
in
self
.
_dtype_rank_params
.
keys
():
self
.
_dtype_rank_params
[
param
.
dtype
]
=
[
[]
for
_
in
range
(
self
.
world_size
)
]
self
.
_dtype_rank_params
[
param
.
dtype
][
self
.
param2rank
[
param
.
name
]
].
append
(
param
)
# Sort per rank params by size
for
dtype
in
self
.
_dtype_rank_params
.
keys
():
for
rank_params
in
self
.
_dtype_rank_params
[
dtype
]:
rank_params
.
sort
(
key
=
lambda
x
:
np
.
prod
(
x
.
shape
))
return
self
.
_dtype_rank_params
@
property
def
rank_buffer_size
(
self
):
"""
Count the memory size of the parameters corresponding to rank under the corresponding dtype.
"""
# CUDA alignment 256 bytes
if
len
(
self
.
_rank_buffer_size
)
==
0
:
for
dtype
in
self
.
dtype_rank_params
.
keys
():
if
dtype
not
in
self
.
_rank_buffer_size
.
keys
():
self
.
_rank_buffer_size
[
dtype
]
=
{}
for
dst_rank
,
per_rank_params
in
enumerate
(
self
.
dtype_rank_params
[
dtype
]
):
if
dst_rank
not
in
self
.
_rank_buffer_size
[
dtype
].
keys
():
self
.
_rank_buffer_size
[
dtype
][
dst_rank
]
=
0
for
param
in
per_rank_params
:
if
not
param
.
trainable
:
continue
size
=
np
.
prod
(
param
.
shape
)
*
align
[
dtype
]
remaining
=
size
%
alignment
[
self
.
_default_device
]
ali
=
(
0
if
remaining
==
0
else
alignment
[
self
.
_default_device
]
-
remaining
)
align_
=
ali
//
align
[
dtype
]
self
.
_rank_buffer_size
[
dtype
][
dst_rank
]
+=
(
np
.
prod
(
param
.
shape
)
+
align_
)
self
.
_param2align
[
param
.
name
]
=
align_
return
self
.
_rank_buffer_size
def
_integration_params
(
self
):
"""
Integrate the parameters into a continuous memory according to rank, and support the update of training parameters.
"""
for
dtype
,
per_rank_params
in
self
.
dtype_rank_params
.
items
():
if
dtype
not
in
self
.
param_storages
.
keys
():
self
.
param_storages
[
dtype
]
=
{}
for
dst_rank
,
params
in
enumerate
(
per_rank_params
):
if
len
(
params
)
>
0
:
# Merge all the trainable params in a single InternalStorage
trainable_params
=
list
(
filter
(
lambda
x
:
x
.
trainable
,
params
)
)
if
self
.
_pfp16
and
dst_rank
==
self
.
rank
:
self
.
_generate_master_params
(
trainable_params
)
if
trainable_params
:
param_storage
=
ParamStorage
(
size
=
self
.
rank_buffer_size
[
dtype
][
dst_rank
],
dtype
=
dtype
,
device
=
self
.
_default_device
,
)
param_storage
.
add_rank_params
(
trainable_params
,
self
.
_param2align
)
self
.
param_storages
[
dtype
][
dst_rank
]
=
param_storage
# Clear the InternalStorage keys which are not in use anymore
dtype_in_use
=
list
(
self
.
dtype_rank_params
.
keys
())
dtype_to_pop
=
list
(
filter
(
lambda
x
:
x
not
in
dtype_in_use
,
self
.
param_storages
.
keys
())
)
for
d
in
dtype_to_pop
:
self
.
param_storages
.
pop
(
d
)
if
self
.
offload
:
self
.
_optim
.
_master_weights
=
self
.
_master_params
cpu_master_params
=
[
p
for
p
in
self
.
_master_params
.
values
()]
for
param
in
cpu_master_params
:
size
=
np
.
prod
(
param
.
shape
)
*
align
[
Type
.
fp32
.
value
]
remaining
=
size
%
alignment
[
self
.
offload_device
]
ali
=
(
0
if
remaining
==
0
else
alignment
[
self
.
offload_device
]
-
remaining
)
align_
=
ali
//
align
[
Type
.
fp32
.
value
]
self
.
offload_buffer_size
+=
np
.
prod
(
param
.
shape
)
+
align_
self
.
offload_param2align
[
param
.
name
]
=
align_
if
cpu_master_params
:
with
device_guard
(
self
.
rank
,
self
.
offload_device
):
self
.
offload_params
=
ParamStorage
(
size
=
self
.
offload_buffer_size
,
dtype
=
Type
.
fp32
.
value
,
device
=
self
.
offload_device
,
)
self
.
offload_params
.
add_rank_params
(
cpu_master_params
,
self
.
offload_param2align
,
False
)
self
.
offload_params
.
buffer
.
stop_gradient
=
False
self
.
offload_grads
=
GradStorage
(
size
=
self
.
offload_buffer_size
,
dtype
=
Type
.
fp32
.
value
,
device
=
self
.
offload_device
,
destination
=
self
.
rank
,
parm2align
=
self
.
offload_param2align
,
convert_cpu
=
True
,
)
for
p
in
cpu_master_params
:
self
.
offload_grads
.
add_grad
(
p
,
self
.
offload_param2align
[
p
.
name
]
)
self
.
_optim
.
_master_weights
[
self
.
offload_params
.
buffer
.
name
]
=
self
.
offload_params
.
buffer
def
_offload_acc_grad
(
self
,
param_name
,
grad_fp32_cpu
):
"""accumulate grads with offload strategy"""
with
device_guard
(
self
.
rank
,
self
.
offload_device
):
if
param_name
in
self
.
_master_params
.
keys
():
if
self
.
_master_params
[
param_name
].
grad
is
None
:
self
.
_master_params
[
param_name
].
_copy_gradient_from
(
grad_fp32_cpu
)
else
:
self
.
_master_params
[
param_name
].
grad
.
add_
(
grad_fp32_cpu
)
self
.
offload_params
.
buffer
.
_copy_gradient_from
(
self
.
offload_grads
.
buffer
)
def
_offload_scale_grad
(
self
,
scale_size
):
"""scale grads with offload strategy"""
with
device_guard
(
self
.
rank
,
self
.
offload_device
):
self
.
offload_grads
.
buffer
.
scale_
(
scale
=
scale_size
)
def
_offload_clear_grad
(
self
):
"""clear grads with offload strategy"""
with
device_guard
(
self
.
rank
,
self
.
offload_device
):
self
.
offload_grads
.
buffer
.
zero_
()
def
step
(
self
):
"""
A wrapper for Optimizer's step function to finish the update operation of the optimizer.
"""
if
self
.
offload
:
params_list
=
[
self
.
offload_params
.
buffer
]
# TODO(Baibaifan): Offload will support param_groups later
if
not
isinstance
(
self
.
_optim
.
_param_groups
[
0
],
dict
):
self
.
_optim
.
_parameter_list
=
params_list
self
.
_optim
.
_param_groups
=
params_list
# Run the optimizer of the current rank step
if
self
.
offload
:
with
device_guard
(
device
=
self
.
offload_device
):
self
.
_optim
.
step
()
dev_id
=
int
(
paddle
.
get_device
().
split
(
":"
)[
1
])
for
param
in
self
.
_local_params
:
if
param
.
name
in
self
.
_master_params
.
keys
():
param
.
set_value
(
self
.
_master_params
[
param
.
name
]
.
cuda
(
dev_id
)
.
cast
(
dtype
=
param
.
dtype
)
)
else
:
self
.
_optim
.
step
()
# Synchronize all the updated shards in between the ranks
self
.
_broadcast_params
()
def
minimize
(
self
):
raise
RuntimeError
(
"optimizer.minimize() not support now, please use optimizer.step()"
)
def
set_state_dict
(
self
,
state_dict
):
self
.
_optim
.
set_state_dict
(
state_dict
)
def
state_dict
(
self
):
return
self
.
_optim
.
state_dict
()
def
_clear_cache
(
self
):
self
.
__segment_params
.
clear
()
self
.
_dtype_rank_params
.
clear
()
self
.
_param2rank
.
clear
()
@
paddle
.
autograd
.
no_grad
()
def
_broadcast_params
(
self
):
"""Broadcast the parameters of the current rank to each rank"""
assert
self
.
_default_device
==
"gpu"
,
"Only supported gpu"
# Exchange all the shards with the other ranks
for
dtype_per_rank
in
self
.
param_storages
.
values
():
for
dst_rank
,
internal_storage
in
dtype_per_rank
.
items
():
dist
.
broadcast
(
tensor
=
internal_storage
.
buffer
,
src
=
self
.
group
.
ranks
[
dst_rank
],
group
=
self
.
group
,
sync_op
=
True
,
)
# Multi stream operation will be supported later
dist
.
wait
(
tensor
=
internal_storage
.
buffer
,
group
=
self
.
group
,
use_calc_stream
=
True
,
)
python/paddle/distributed/fleet/meta_parallel/sharding/__init__.py
浏览文件 @
2bbdc47a
...
...
@@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.sharding_utils
import
Type
,
device_guard
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
此差异已折叠。
点击以展开。
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
此差异已折叠。
点击以展开。
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
contextlib
from
enum
import
Enum
from
types
import
MethodType
import
numpy
as
np
import
paddle
from
paddle
import
_legacy_C_ops
from
paddle.fluid
import
core
,
layers
from
paddle.fluid.dygraph
import
base
as
imperative_base
from
paddle.fluid.dygraph
import
to_variable
from
paddle.fluid.framework
import
dygraph_only
class
Taskflow
:
"""
Task flows, one way linked list for task acquisition.
"""
def
__init__
(
self
,
task
,
callback
):
self
.
task
=
task
self
.
callback
=
callback
class
Type
(
Enum
):
"""
Type of trainable parameters
"""
fp16
=
paddle
.
float16
bf16
=
paddle
.
bfloat16
fp32
=
paddle
.
float32
class
ShardingClipGrad
:
def
__init__
(
self
,
clip
,
device
,
group
):
self
.
_clip
=
clip
self
.
_device
=
device
self
.
_group
=
group
@
imperative_base
.
no_grad
def
_dygraph_clip
(
self
,
params_grads
):
sum_square_fp32
,
sum_square_fp16
=
[],
[]
unslice_params_fp32
,
unslice_params_fp16
=
[],
[]
for
p
,
g
in
params_grads
:
p_slice
=
True
# using for slice parameter in sharding stage3
if
g
is
None
or
getattr
(
p
,
'need_clip'
,
True
)
is
False
:
continue
if
hasattr
(
p
,
"unslice"
):
p_slice
=
False
merge_grad
=
g
if
g
.
type
==
core
.
VarDesc
.
VarType
.
SELECTED_ROWS
:
merge_grad
=
layers
.
get_tensor_from_selected_rows
(
layers
.
merge_selected_rows
(
g
)
)
square
=
paddle
.
square
(
merge_grad
)
sum_square
=
paddle
.
sum
(
square
)
if
p
.
dtype
==
paddle
.
float16
:
if
p_slice
:
sum_square_fp16
.
append
(
sum_square
)
else
:
unslice_params_fp16
.
append
(
sum_square
)
elif
p
.
dtype
==
paddle
.
float32
:
if
p_slice
:
sum_square_fp32
.
append
(
sum_square
)
else
:
unslice_params_fp32
.
append
(
sum_square
)
# global norm of non-distributed FP16 params_and_grads
if
len
(
sum_square_fp16
)
==
0
:
global_norm_fp16
=
paddle
.
to_tensor
([
0.0
],
dtype
=
paddle
.
float32
)
else
:
global_norm_fp16
=
layers
.
concat
(
sum_square_fp16
)
global_norm_fp16
=
paddle
.
sum
(
global_norm_fp16
)
global_norm_fp16
=
paddle
.
cast
(
global_norm_fp16
,
dtype
=
paddle
.
float32
)
# global norm of non-distributed FP16 params_and_grads for unslice parameter
if
len
(
unslice_params_fp16
)
==
0
:
global_unslice_fp16
=
paddle
.
to_tensor
([
0.0
],
dtype
=
paddle
.
float32
)
else
:
global_unslice_fp16
=
layers
.
concat
(
unslice_params_fp16
)
global_unslice_fp16
=
paddle
.
sum
(
global_unslice_fp16
)
global_unslice_fp16
=
paddle
.
cast
(
global_unslice_fp16
,
dtype
=
paddle
.
float32
)
# global norm of non-distributed FP32 params_and_grads
global_norm_fp32
=
(
layers
.
concat
(
sum_square_fp32
)
if
len
(
sum_square_fp32
)
!=
0
else
paddle
.
to_tensor
([
0.0
],
dtype
=
paddle
.
float32
)
)
global_norm_fp32
=
paddle
.
sum
(
global_norm_fp32
)
# global norm of non-distributed FP32 params_and_grads for unslice parameter
global_unslice_fp32
=
(
layers
.
concat
(
unslice_params_fp32
)
if
len
(
unslice_params_fp32
)
!=
0
else
paddle
.
to_tensor
([
0.0
],
dtype
=
paddle
.
float32
)
)
global_unslice_fp32
=
paddle
.
sum
(
global_unslice_fp32
)
global_unslice_var
=
global_unslice_fp16
+
global_unslice_fp32
global_norm_var
=
(
global_norm_fp16
+
global_norm_fp32
+
1.0
/
self
.
_group
.
nranks
*
global_unslice_var
)
# add all reduce to get global norm of distributed params_and_grads
dev_id
=
int
(
self
.
_device
.
split
(
":"
)[
1
])
with
device_guard
(
dev_id
,
"gpu"
):
paddle
.
distributed
.
all_reduce
(
global_norm_var
,
group
=
self
.
_group
)
global_norm_var
=
paddle
.
sqrt
(
global_norm_var
)
max_global_norm
=
layers
.
fill_constant
(
shape
=
[
1
],
dtype
=
global_norm_var
.
dtype
,
value
=
self
.
clip_norm
)
clip_var
=
paddle
.
divide
(
x
=
max_global_norm
,
y
=
paddle
.
maximum
(
x
=
global_norm_var
,
y
=
max_global_norm
),
)
clip_var_fp16
=
paddle
.
cast
(
clip_var
,
paddle
.
float16
)
for
p
,
g
in
params_grads
:
if
getattr
(
p
,
'need_clip'
,
True
)
is
False
or
g
is
None
:
continue
origin_state
=
g
.
stop_gradient
g
.
stop_gradient
=
True
if
p
.
dtype
==
paddle
.
float16
:
g
.
scale_
(
clip_var_fp16
)
else
:
g
.
scale_
(
clip_var
)
g
.
stop_gradient
=
origin_state
p
.
_reset_grad_inplace_version
(
True
)
return
params_grads
def
__getattr__
(
self
,
item
):
return
getattr
(
self
.
_clip
,
item
)
def
__call__
(
self
,
params_grads
):
return
self
.
_dygraph_clip
(
params_grads
)
@
contextlib
.
contextmanager
def
device_guard
(
dev_id
=
0
,
device
=
"cpu"
):
origin_device
=
paddle
.
device
.
get_device
()
if
device
==
"cpu"
:
paddle
.
set_device
(
device
)
elif
device
==
"gpu"
:
paddle
.
set_device
(
"gpu:{}"
.
format
(
dev_id
))
try
:
yield
finally
:
paddle
.
set_device
(
origin_device
)
@
dygraph_only
def
ShardingScaler
(
scaler
):
def
unscale_method
(
self
,
optimizer
):
if
not
self
.
_enable
:
return
param_grads
=
[]
param_grads_fp16
=
[]
param_grads_fp32
=
[]
if
hasattr
(
optimizer
,
"update_slice"
):
optimizer
.
update_slice
()
optimizer
.
update_scaler
=
True
if
getattr
(
optimizer
.
_optim
,
'_param_groups'
,
None
)
and
isinstance
(
optimizer
.
_optim
.
_param_groups
[
0
],
dict
):
for
group
in
optimizer
.
_optim
.
_param_groups
:
for
param
in
group
[
'params'
]:
if
param
.
_grad_ivar
()
is
not
None
:
param_grads
.
append
(
param
.
_grad_ivar
())
if
param
.
_grad_ivar
().
dtype
in
[
core
.
VarDesc
.
VarType
.
FP16
,
paddle
.
float16
,
]:
param_grads_fp16
.
append
(
param
.
_grad_ivar
())
else
:
param_grads_fp32
.
append
(
param
.
_grad_ivar
())
else
:
for
param
in
optimizer
.
_optim
.
_parameter_list
:
if
param
.
grad
is
not
None
:
param_grads
.
append
(
param
.
grad
)
if
param
.
grad
.
dtype
in
[
core
.
VarDesc
.
VarType
.
FP16
,
paddle
.
float16
,
]:
param_grads_fp16
.
append
(
param
.
grad
)
else
:
param_grads_fp32
.
append
(
param
.
grad
)
temp_found_inf_fp16
=
to_variable
(
np
.
array
([
0
]).
astype
(
np
.
bool_
))
temp_found_inf_fp32
=
to_variable
(
np
.
array
([
0
]).
astype
(
np
.
bool_
))
device
=
"cpu"
if
optimizer
.
offload
else
"gpu"
dev_id
=
(
0
if
device
==
"cpu"
else
int
(
paddle
.
get_device
().
split
(
":"
)[
1
])
)
with
device_guard
(
dev_id
,
device
):
if
len
(
param_grads_fp16
):
_legacy_C_ops
.
check_finite_and_unscale
(
param_grads_fp16
,
self
.
_scale
,
param_grads_fp16
,
temp_found_inf_fp16
,
)
if
len
(
param_grads_fp32
):
_legacy_C_ops
.
check_finite_and_unscale
(
param_grads_fp32
,
self
.
_scale
,
param_grads_fp32
,
temp_found_inf_fp32
,
)
self
.
_found_inf
=
1
if
temp_found_inf_fp16
or
temp_found_inf_fp32
else
0
is_found_inf
=
paddle
.
to_tensor
([
self
.
_found_inf
],
dtype
=
"int32"
)
paddle
.
distributed
.
all_reduce
(
is_found_inf
,
op
=
paddle
.
distributed
.
ReduceOp
.
MAX
,
group
=
optimizer
.
group
,
)
self
.
_found_inf
=
is_found_inf
.
numpy
()[
0
]
scaler
.
_unscale
=
MethodType
(
unscale_method
,
scaler
)
return
scaler
python/paddle/distributed/fleet/utils/internal_storage.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The file has been adapted from fairscale file:
# https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/misc/param_bucket.py
# Git commit hash: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e
# We retain the following license from the original files:
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import
numpy
as
np
import
paddle
from
paddle
import
framework
# (TODO: GhostScreaming) It will be removed later.
from
paddle.fluid
import
core
from
..meta_parallel.sharding.sharding_utils
import
Type
,
device_guard
class
InternalStorage
:
"""
This is a basic class, which is responsible for consolidating the basic storage tensor.
"""
# Support integration parameter tensor
def
__init__
(
self
,
size
,
dtype
,
device
,
convert_cpu
=
False
):
self
.
_params
=
[]
self
.
_param_ids
=
[]
self
.
_fill
=
0
self
.
_device
=
device
self
.
_dtype
=
dtype
# The actual flat tensor
size
=
[
size
]
if
isinstance
(
size
,
int
)
else
size
if
convert_cpu
:
value
=
(
np
.
zeros
(
size
,
dtype
=
np
.
float16
)
if
Type
.
fp16
.
value
==
dtype
else
np
.
zeros
(
size
,
dtype
=
np
.
float32
)
)
self
.
buffer
=
core
.
VarBase
(
value
=
value
,
place
=
core
.
CPUPlace
())
else
:
self
.
buffer
=
paddle
.
zeros
(
size
,
dtype
=
dtype
)
def
to
(
self
,
device
,
dtype
=
None
,
keep_alignment
=
True
):
"""
Move the underlying buffer
"""
assert
(
self
.
buffer
is
not
None
),
"Cannot move a collapsed bucket, please rebuild it"
assert
(
dtype
==
Type
.
fp32
.
value
or
Type
.
fp16
.
value
),
"Conversion type is not supported now"
dev_id
=
(
0
if
paddle
.
get_device
()
==
"cpu"
else
int
(
paddle
.
get_device
().
split
(
":"
)[
1
])
)
if
self
.
_device
!=
device
:
tmp_buffer
=
(
self
.
buffer
.
cuda
(
dev_id
)
if
device
==
"gpu"
else
self
.
buffer
.
cpu
()
)
for
param
in
self
.
_params
:
param
.
clear_gradient
(
False
)
param
.
_gradient_set_empty
(
False
)
self
.
buffer
.
value
().
get_tensor
().
_clear
()
self
.
buffer
=
tmp_buffer
self
.
_device
=
device
if
dtype
is
not
None
:
self
.
buffer
=
self
.
buffer
.
cast
(
dtype
=
dtype
)
self
.
_dtype
=
dtype
class
ParamStorage
(
InternalStorage
):
"""
This is a basic class to simplify the handling of parameter InternalStorages.
"""
def
__init__
(
self
,
size
,
dtype
,
device
):
super
().
__init__
(
size
,
dtype
,
device
,
convert_cpu
=
True
)
self
.
param2align
=
None
def
to
(
self
,
device
,
dtype
=
None
,
keep_alignment
=
True
):
"""
Move the underlying buffer
"""
super
().
to
(
device
,
dtype
)
if
keep_alignment
:
self
.
_array_params
()
@
framework
.
no_grad
()
def
add_rank_params
(
self
,
trainable_params
,
param2align
,
convert_gpu
=
True
):
"""
Add new parameters to the InternalStorage. Params becomes a view of this InternalStorage buffer.
"""
assert
all
(
[
id
(
param
)
not
in
self
.
_param_ids
for
param
in
trainable_params
]
),
"The same param cannot be checked in twice"
assert
self
.
buffer
is
not
None
self
.
param2align
=
param2align
cpu_param_shape
=
list
()
for
param
in
trainable_params
:
p_shape
=
self
.
_add_param_as_view
(
param
,
param2align
[
param
.
name
],
convert_gpu
)
cpu_param_shape
.
append
(
p_shape
)
if
convert_gpu
:
# buffer convert from cpu to cuda
dev_id
=
int
(
paddle
.
get_device
().
split
(
":"
)[
1
])
self
.
buffer
=
self
.
buffer
.
cuda
(
dev_id
)
self
.
_fill
=
0
for
idx
,
param
in
enumerate
(
trainable_params
):
self
.
_convert_buffer
(
param
,
cpu_param_shape
[
idx
],
param2align
[
param
.
name
]
)
self
.
_params
.
append
(
param
)
self
.
_param_ids
.
append
(
id
(
param
))
@
framework
.
no_grad
()
def
_add_param_as_view
(
self
,
param
,
align
,
convert_gpu
=
True
):
assert
(
param
.
dtype
==
self
.
buffer
.
dtype
),
"Different types for the InternalStorage and the param, cannot proceed: {} - {}"
.
format
(
param
.
dtype
,
self
.
buffer
.
dtype
)
var_end
=
self
.
_fill
+
np
.
prod
(
param
.
shape
)
offset
=
var_end
+
align
assert
offset
<=
np
.
prod
(
self
.
buffer
.
shape
)
p_shape
=
param
.
shape
origin_state
=
param
.
stop_gradient
param
.
stop_gradient
=
True
param
.
flatten_
()
param
.
stop_gradient
=
origin_state
# Copy the current param value
dev_id
=
(
0
if
paddle
.
get_device
()
==
"cpu"
else
int
(
paddle
.
get_device
().
split
(
":"
)[
1
])
)
with
device_guard
(
dev_id
,
"cpu"
):
tmp_var
=
core
.
VarBase
(
tensor
=
self
.
buffer
.
_slice
(
self
.
_fill
,
var_end
)
)
if
convert_gpu
:
param_cpu
=
param
.
cpu
()
param
.
value
().
get_tensor
().
_clear
()
tmp_var
.
set_value
(
param_cpu
)
else
:
tmp_var
.
set_value
(
param
)
self
.
_fill
=
offset
return
p_shape
@
framework
.
no_grad
()
def
_convert_buffer
(
self
,
param
,
p_shape
,
align
):
var_end
=
self
.
_fill
+
np
.
prod
(
p_shape
)
offset
=
var_end
+
align
assert
offset
<=
np
.
prod
(
self
.
buffer
.
shape
)
# Convert the param value
tmp_tensor
=
self
.
buffer
.
_slice
(
self
.
_fill
,
var_end
)
param
.
value
().
get_tensor
().
_share_data_with
(
tmp_tensor
)
param
.
value
().
get_tensor
().
_set_dims
(
p_shape
)
self
.
_fill
=
offset
@
framework
.
no_grad
()
def
_array_params
(
self
):
"""
Given the parameters which have been registered previously, rebuild the whole InternalStorage.
"""
assert
len
(
self
.
_params
)
>
0
assert
self
.
param2align
is
not
None
self
.
_fill
=
0
for
p
in
self
.
_params
:
self
.
_convert_buffer
(
p
,
p
.
shape
,
self
.
param2align
[
p
.
name
])
# modify
class
GradStorage
(
InternalStorage
):
"""
This is a basic class to simplify the handling of gradient InternalStorages
"""
def
__init__
(
self
,
size
,
dtype
,
device
,
destination
,
parm2align
,
convert_cpu
=
False
):
if
isinstance
(
size
,
np
.
int64
):
size
=
size
.
tolist
()
super
().
__init__
(
size
,
dtype
,
device
,
convert_cpu
)
self
.
_max_size
=
size
self
.
_release
=
False
self
.
params_checked_in
=
0
self
.
destination
=
destination
self
.
_parm2align
=
parm2align
self
.
sent
=
False
def
reset_checked_in
(
self
):
"""Reset the counter of the parameter grads which have been checked in"""
self
.
params_checked_in
=
0
self
.
sent
=
False
@
property
def
all_checked_in
(
self
):
"""Judge all the expected gradient check-in happened"""
return
len
(
self
.
_params
)
==
self
.
params_checked_in
def
can_add_grad_view
(
self
,
param
,
align
):
"""Is there enough InternalStorage to add this parameter gradient, and whether this param have already checked in."""
return
(
self
.
_fill
+
np
.
prod
(
param
.
shape
)
+
align
<=
self
.
_max_size
and
id
(
param
)
not
in
self
.
_param_ids
)
def
to
(
self
,
device
,
dtype
=
None
,
keep_alignment
=
True
):
"""
Move the underlying buffer
"""
if
self
.
_release
:
self
.
rebuild
()
super
().
to
(
device
,
dtype
)
if
keep_alignment
:
self
.
_array_grads
()
@
framework
.
no_grad
()
def
add_grad
(
self
,
param
,
align
):
"""
Add a new parameter gradient to the InternalStorage. Param.grad becomes a view of this InternalStorage buffer.
"""
assert
(
id
(
param
)
not
in
self
.
_param_ids
),
"The same gradients cannot be checked in twice"
self
.
_add_grad_as_view
(
param
,
align
)
self
.
_params
.
append
(
param
)
self
.
_param_ids
.
append
(
id
(
param
))
@
framework
.
no_grad
()
def
manumal_relase
(
self
):
"""
Release the buffer from InternalStorage. The InternalStorage will need to be rebuilt before use.
"""
if
not
self
.
_release
:
for
p
in
self
.
_params
:
if
p
.
grad
is
not
None
:
p
.
clear_gradient
(
False
)
p
.
_gradient_set_empty
(
False
)
self
.
buffer
=
None
self
.
_fill
=
0
self
.
params_checked_in
=
0
self
.
_release
=
True
@
framework
.
no_grad
()
def
rebuild
(
self
):
"""
Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage.
"""
if
self
.
_release
:
self
.
buffer
=
paddle
.
zeros
([
self
.
_max_size
],
dtype
=
self
.
_dtype
)
for
p
in
self
.
_params
:
self
.
_add_grad_as_view
(
p
,
self
.
_parm2align
[
p
.
name
])
self
.
_release
=
False
@
framework
.
no_grad
()
def
_array_grads
(
self
):
"""
Given the parameters gradients which have been registered previously, rebuild the whole InternalStorage.
"""
if
len
(
self
.
_params
)
>
0
:
self
.
_fill
=
0
for
p
in
self
.
_params
:
self
.
_add_grad_as_view
(
p
,
self
.
_parm2align
[
p
.
name
])
@
framework
.
no_grad
()
def
_add_grad_as_view
(
self
,
param
,
align
):
assert
(
np
.
prod
(
self
.
buffer
.
shape
)
>
0
),
"Cannot add a gradient to a released InternalStorage, please rebuild"
assert
param
.
dtype
==
self
.
buffer
.
dtype
grad_end
=
self
.
_fill
+
np
.
prod
(
param
.
shape
)
offset
=
grad_end
+
align
assert
offset
<=
np
.
prod
(
self
.
buffer
.
shape
)
# Copy the current grad value to InternalStorage
dev_id
=
(
0
if
paddle
.
get_device
()
==
"cpu"
else
int
(
paddle
.
get_device
().
split
(
":"
)[
1
])
)
if
self
.
_device
==
"cpu"
:
with
device_guard
(
dev_id
,
self
.
_device
):
tmp_var
=
core
.
VarBase
(
self
.
buffer
.
_slice
(
self
.
_fill
,
grad_end
))
param
.
_copy_gradient_from
(
tmp_var
)
tmp_var
.
value
().
get_tensor
().
_clear
()
elif
self
.
_device
==
"gpu"
:
tmp_var
=
core
.
VarBase
(
self
.
buffer
.
_slice
(
self
.
_fill
,
grad_end
))
param
.
_copy_gradient_from
(
tmp_var
)
tmp_var
.
value
().
get_tensor
().
_clear
()
self
.
_fill
=
offset
python/paddle/distributed/sharding/group_sharded.py
浏览文件 @
2bbdc47a
...
...
@@ -16,13 +16,6 @@ import logging
import
os
import
paddle
# Old version
from
paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2
import
(
ShardingOptimizerStage2
,
)
# New version
from
paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2
import
(
GroupShardedOptimizerStage2
,
)
...
...
@@ -35,17 +28,7 @@ from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import
from
paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils
import
(
GroupShardedScaler
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2
import
(
ShardingStage2
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3
import
(
ShardingStage3
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_utils
import
(
ShardingScaler
,
)
from
paddle.distributed.utils.log_utils
import
get_logger
from
paddle.fluid.framework
import
in_dygraph_mode
from
paddle.optimizer
import
Optimizer
logger_
=
get_logger
(
logging
.
WARNING
)
...
...
@@ -148,71 +131,39 @@ def group_sharded_parallel(
logger_
.
info
(
"*"
*
30
)
logger_
.
info
(
"Sharded level os uses sharded level os_g achieved now."
)
logger_
.
info
(
"*"
*
30
)
if
in_dygraph_mode
():
optimizer
=
GroupShardedOptimizerStage2
(
params
=
optimizer
.
_parameter_list
,
optim
=
optimizer
,
group
=
group
,
offload
=
offload
,
dp_group
=
dp_group
,
device
=
device
,
)
model
=
GroupShardedStage2
(
model
,
optimizer
,
group
=
group
,
sync_buffers
=
sync_buffers
,
buffer_max_size
=
buffer_max_size
,
dp_group
=
dp_group
,
device
=
device
,
)
else
:
optimizer
=
ShardingOptimizerStage2
(
params
=
model
.
parameters
(),
optim
=
optimizer
,
group
=
group
,
offload
=
offload
,
device
=
device
,
)
model
=
ShardingStage2
(
model
,
optimizer
,
group
=
group
,
sync_buffers
=
sync_buffers
,
buffer_max_size
=
buffer_max_size
,
device
=
device
,
)
optimizer
=
GroupShardedOptimizerStage2
(
params
=
optimizer
.
_parameter_list
,
optim
=
optimizer
,
group
=
group
,
offload
=
offload
,
dp_group
=
dp_group
,
device
=
device
,
)
model
=
GroupShardedStage2
(
model
,
optimizer
,
group
=
group
,
sync_buffers
=
sync_buffers
,
buffer_max_size
=
buffer_max_size
,
dp_group
=
dp_group
,
device
=
device
,
)
elif
level
==
'p_g_os'
:
if
in_dygraph_mode
():
model
=
GroupShardedStage3
(
model
,
optimizer
=
optimizer
,
group
=
group
,
sync_buffers
=
sync_buffers
,
segment_size
=
segment_size
,
offload
=
offload
,
sync_comm
=
sync_comm
,
dp_group
=
dp_group
,
device
=
device
,
)
else
:
model
=
ShardingStage3
(
model
,
optimizer
=
optimizer
,
group
=
group
,
sync_buffers
=
sync_buffers
,
segment_size
=
segment_size
,
offload
=
offload
,
sync_comm
=
sync_comm
,
device
=
device
,
)
model
=
GroupShardedStage3
(
model
,
optimizer
=
optimizer
,
group
=
group
,
sync_buffers
=
sync_buffers
,
segment_size
=
segment_size
,
offload
=
offload
,
sync_comm
=
sync_comm
,
dp_group
=
dp_group
,
device
=
device
,
)
else
:
raise
ValueError
(
"Please enter the correct level."
)
if
isinstance
(
scaler
,
paddle
.
amp
.
GradScaler
):
if
in_dygraph_mode
():
scaler
=
GroupShardedScaler
(
scaler
)
else
:
scaler
=
ShardingScaler
(
scaler
)
scaler
=
GroupShardedScaler
(
scaler
)
logger_
.
info
(
"*"
*
30
)
logger_
.
info
(
"If there is a communication hang using group sharded, please check whether the communication operations of each process are unified."
...
...
@@ -275,9 +226,9 @@ def save_group_sharded_model(model, output, optimizer=None):
),
"Saving directory ({}) should be a directory, not a file"
.
format
(
output
)
os
.
makedirs
(
output
,
exist_ok
=
True
)
output_model
=
os
.
path
.
join
(
output
,
"model.pdmodel"
)
if
isinstance
(
model
,
(
ShardingStage2
,
GroupShardedStage2
)
):
if
isinstance
(
model
,
GroupShardedStage2
):
paddle
.
save
(
model
.
_layer
.
state_dict
(),
output_model
)
elif
isinstance
(
model
,
(
ShardingStage3
,
GroupShardedStage3
)
):
elif
isinstance
(
model
,
GroupShardedStage3
):
convert2cpu
=
True
if
model
.
_offload
else
False
model
.
get_all_parameters
(
convert2cpu
=
convert2cpu
)
paddle
.
save
(
model
.
_layer
.
state_dict
(),
output_model
)
...
...
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_optimizer_stage2.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
import
paddle
import
paddle.fluid
as
fluid
from
paddle.distributed
import
fleet
from
paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2
import
(
ShardingOptimizerStage2
,
)
from
paddle.distributed.fleet.utils.internal_storage
import
GradStorage
from
paddle.nn
import
Linear
base_lr
=
0.1
momentum_rate
=
0.9
l2_decay
=
1e-4
epoch
=
100
batch_size
=
32
class_dim
=
102
class
MLP
(
fluid
.
Layer
):
def
__init__
(
self
,
param_attr
=
None
,
bias_attr
=
None
):
super
().
__init__
()
self
.
_linear1
=
Linear
(
10
,
10
)
self
.
_linear2
=
Linear
(
10
,
10
)
def
forward
(
self
,
inputs
):
y
=
self
.
_linear1
(
inputs
)
y
=
self
.
_linear2
(
y
)
return
y
def
reader_decorator
():
def
__reader__
():
for
_
in
range
(
100
):
img
=
np
.
random
.
rand
(
10
).
astype
(
'float32'
)
label
=
np
.
ones
(
1
).
astype
(
'int64'
)
yield
img
,
label
return
__reader__
def
optimizer_setting
(
parameter_list
=
None
):
optimizer
=
paddle
.
optimizer
.
Momentum
(
learning_rate
=
base_lr
,
momentum
=
momentum_rate
,
weight_decay
=
paddle
.
regularizer
.
L2Decay
(
l2_decay
),
parameters
=
parameter_list
,
)
return
optimizer
def
train_mlp
():
fleet
.
init
(
is_collective
=
True
)
group
=
paddle
.
distributed
.
new_group
([
0
,
1
])
mlp
=
MLP
()
optimizer
=
optimizer_setting
(
parameter_list
=
mlp
.
parameters
())
oss_optimizer
=
ShardingOptimizerStage2
(
params
=
mlp
.
parameters
(),
optim
=
optimizer
,
group
=
group
)
# cover grad_storage code
trainable_param2align
=
dict
()
for
p
in
mlp
.
parameters
():
trainable_param2align
[
p
.
name
]
=
0
grad_storage
=
GradStorage
(
10000
,
dtype
=
paddle
.
float32
,
device
=
"gpu"
,
destination
=
0
,
parm2align
=
trainable_param2align
,
)
for
p
in
mlp
.
parameters
():
grad_storage
.
can_add_grad_view
(
p
,
trainable_param2align
[
p
.
name
])
grad_storage
.
add_grad
(
p
,
trainable_param2align
[
p
.
name
])
grad_storage
.
manumal_relase
()
grad_storage
.
rebuild
()
grad_storage
.
reset_checked_in
()
train_reader
=
paddle
.
batch
(
reader_decorator
(),
batch_size
=
batch_size
,
drop_last
=
True
)
train_loader
=
paddle
.
io
.
DataLoader
.
from_generator
(
capacity
=
32
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
,
use_multiprocess
=
True
,
)
train_loader
.
set_sample_list_generator
(
train_reader
)
for
eop
in
range
(
epoch
):
mlp
.
train
()
for
batch_id
,
data
in
enumerate
(
train_loader
()):
img
,
label
=
data
label
.
stop_gradient
=
True
img
.
stop_gradient
=
True
out
=
mlp
(
img
)
loss
=
paddle
.
nn
.
functional
.
cross_entropy
(
input
=
out
,
label
=
label
)
avg_loss
=
paddle
.
mean
(
x
=
loss
)
acc_top1
=
paddle
.
metric
.
accuracy
(
input
=
out
,
label
=
label
,
k
=
1
)
acc_top5
=
paddle
.
metric
.
accuracy
(
input
=
out
,
label
=
label
,
k
=
5
)
dy_out
=
avg_loss
.
numpy
()
avg_loss
.
backward
()
oss_optimizer
.
step
()
# oss_optimizer clear cache
oss_optimizer
.
_clear_cache
()
# check optimizer.minimize() error
try
:
oss_optimizer
.
minimize
()
except
:
print
(
"====== Find sharding_stage2_optimizer.minimize() error ======"
)
return
if
__name__
==
'__main__'
:
train_mlp
()
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage2.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
shutil
import
tempfile
import
numpy
as
np
import
paddle
import
paddle.fluid
as
fluid
from
paddle.distributed
import
fleet
from
paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2
import
(
ShardingOptimizerStage2
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2
import
(
ShardingStage2
,
)
from
paddle.nn
import
Linear
seed
=
2022
epoch
=
2
linear_size
=
1000
strategy
=
fleet
.
DistributedStrategy
()
strategy
.
hybrid_configs
=
{
"dp_degree"
:
2
,
"mp_degree"
:
1
,
"pp_degree"
:
1
,
"sharding_degree"
:
1
,
}
np
.
random
.
seed
(
seed
)
paddle
.
seed
(
seed
)
class
MLP
(
fluid
.
Layer
):
def
__init__
(
self
,
linear_size
=
1000
,
param_attr
=
None
,
bias_attr
=
None
):
super
().
__init__
()
self
.
_linear1
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear2
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear3
=
Linear
(
linear_size
,
10
)
def
forward
(
self
,
inputs
):
y
=
self
.
_linear1
(
inputs
)
y
=
self
.
_linear2
(
y
)
y
=
self
.
_linear3
(
y
)
return
y
def
reader_decorator
(
linear_size
=
1000
):
def
__reader__
():
for
_
in
range
(
100
):
img
=
np
.
random
.
rand
(
linear_size
).
astype
(
'float32'
)
label
=
np
.
ones
(
1
).
astype
(
'int64'
)
yield
img
,
label
return
__reader__
def
optimizer_setting
(
model
,
use_pure_fp16
,
opt_group
=
False
):
clip
=
paddle
.
nn
.
ClipGradByGlobalNorm
(
clip_norm
=
1.0
)
optimizer
=
paddle
.
optimizer
.
AdamW
(
parameters
=
[{
"params"
:
model
.
parameters
()}]
if
opt_group
else
model
.
parameters
(),
learning_rate
=
0.001
,
weight_decay
=
0.00001
,
grad_clip
=
clip
,
multi_precision
=
use_pure_fp16
,
)
return
optimizer
def
train_mlp
(
model
,
sharding_stage
,
batch_size
=
100
,
use_pure_fp16
=
False
,
accumulate_grad
=
False
,
opt_group
=
False
,
save_model
=
False
,
):
if
sharding_stage
==
"dp"
:
hcg
=
fleet
.
get_hybrid_communicate_group
()
group
=
hcg
.
get_check_parallel_group
()
else
:
group
=
paddle
.
distributed
.
new_group
([
0
,
1
])
if
opt_group
:
optimizer
=
optimizer_setting
(
model
=
model
,
use_pure_fp16
=
use_pure_fp16
,
opt_group
=
opt_group
)
else
:
optimizer
=
optimizer_setting
(
model
=
model
,
use_pure_fp16
=
use_pure_fp16
)
if
sharding_stage
==
2
:
optimizer
=
ShardingOptimizerStage2
(
params
=
model
.
parameters
(),
optim
=
optimizer
,
group
=
group
)
model
=
ShardingStage2
(
model
,
optimizer
,
group
=
group
,
buffer_max_size
=
2
**
21
)
else
:
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
)
model
=
fleet
.
distributed_model
(
model
)
train_reader
=
paddle
.
batch
(
reader_decorator
(),
batch_size
=
batch_size
,
drop_last
=
True
)
train_loader
=
paddle
.
io
.
DataLoader
.
from_generator
(
capacity
=
32
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
,
use_multiprocess
=
True
,
)
train_loader
.
set_sample_list_generator
(
train_reader
)
if
sharding_stage
==
2
:
model
.
to
(
device
=
"gpu"
)
for
eop
in
range
(
epoch
):
model
.
train
()
for
batch_id
,
data
in
enumerate
(
train_loader
()):
img
,
label
=
data
label
.
stop_gradient
=
True
img
.
stop_gradient
=
True
out
=
model
(
img
)
loss
=
paddle
.
nn
.
functional
.
cross_entropy
(
input
=
out
,
label
=
label
)
avg_loss
=
paddle
.
mean
(
x
=
loss
.
cast
(
dtype
=
paddle
.
float32
))
if
batch_size
==
20
:
avg_loss
=
avg_loss
/
5
avg_loss
.
backward
()
if
not
accumulate_grad
:
optimizer
.
step
()
optimizer
.
clear_grad
()
if
accumulate_grad
:
optimizer
.
step
()
optimizer
.
clear_grad
()
if
save_model
:
return
model
,
optimizer
return
model
.
parameters
()
def
test_dp_stage2
():
mlp
=
MLP
()
state_dict
=
mlp
.
state_dict
()
mlp1
=
MLP
()
mlp2
=
MLP
()
mlp3
=
MLP
()
mlp4
=
MLP
()
mlp5
=
MLP
()
mlp6
=
MLP
()
mlp1
.
set_state_dict
(
state_dict
)
mlp2
.
set_state_dict
(
state_dict
)
mlp3
.
set_state_dict
(
state_dict
)
mlp4
.
set_state_dict
(
state_dict
)
mlp5
.
set_state_dict
(
state_dict
)
mlp6
.
set_state_dict
(
state_dict
)
# DP VS stage2
dp_params
=
train_mlp
(
mlp1
,
sharding_stage
=
"dp"
,
use_pure_fp16
=
False
,
opt_group
=
False
)
stage2_params
=
train_mlp
(
mlp2
,
sharding_stage
=
2
,
use_pure_fp16
=
False
,
opt_group
=
False
)
for
i
in
range
(
len
(
dp_params
)):
np
.
testing
.
assert_allclose
(
dp_params
[
i
].
numpy
(),
stage2_params
[
i
].
numpy
(),
rtol
=
1e-6
)
# stage2 accumulate grad
stage2_params
=
train_mlp
(
mlp3
,
sharding_stage
=
2
,
accumulate_grad
=
True
)
stage2_accumulate_grad
=
train_mlp
(
mlp4
,
sharding_stage
=
2
,
batch_size
=
20
,
accumulate_grad
=
True
)
for
i
in
range
(
len
(
stage2_params
)):
np
.
testing
.
assert_allclose
(
stage2_params
[
i
].
numpy
(),
stage2_accumulate_grad
[
i
].
numpy
(),
rtol
=
1e-5
,
atol
=
1e-5
,
)
# stage2 param list VS param group
stage2_params
=
train_mlp
(
mlp5
,
sharding_stage
=
2
,
use_pure_fp16
=
False
,
opt_group
=
True
)
for
i
in
range
(
len
(
dp_params
)):
np
.
testing
.
assert_allclose
(
dp_params
[
i
].
numpy
(),
stage2_params
[
i
].
numpy
(),
rtol
=
1e-6
)
# save/load model
output_dir
=
tempfile
.
mkdtemp
()
model_file
=
os
.
path
.
join
(
output_dir
,
"model.pdmodel"
)
optimizer_file
=
os
.
path
.
join
(
output_dir
,
"model.pdopt"
)
model_stage2
,
optimizer_stage2
=
train_mlp
(
mlp6
,
sharding_stage
=
2
,
use_pure_fp16
=
False
,
opt_group
=
False
,
save_model
=
True
,
)
paddle
.
save
(
model_stage2
.
state_dict
(),
model_file
)
paddle
.
save
(
optimizer_stage2
.
state_dict
(),
optimizer_file
)
m_state_dict
=
paddle
.
load
(
model_file
)
opt_state_dict
=
paddle
.
load
(
optimizer_file
)
model_stage2
.
set_state_dict
(
m_state_dict
)
optimizer_stage2
.
set_state_dict
(
opt_state_dict
)
shutil
.
rmtree
(
output_dir
)
return
if
__name__
==
'__main__'
:
fleet
.
init
(
is_collective
=
True
,
strategy
=
strategy
)
test_dp_stage2
()
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage2_offload.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
from
dygraph_sharding_stage2
import
MLP
,
optimizer_setting
,
reader_decorator
import
paddle
from
paddle.distributed
import
fleet
from
paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2
import
(
ShardingOptimizerStage2
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2
import
(
ShardingStage2
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_utils
import
(
ShardingScaler
,
)
seed
=
2021
epoch
=
2
batch_size
=
32
linear_size
=
1000
strategy
=
fleet
.
DistributedStrategy
()
strategy
.
hybrid_configs
=
{
"dp_degree"
:
2
,
"mp_degree"
:
1
,
"pp_degree"
:
1
,
"sharding_degree"
:
1
,
}
np
.
random
.
seed
(
seed
)
paddle
.
seed
(
seed
)
def
train_mlp
(
model
,
offload
=
False
):
optimizer
=
optimizer_setting
(
model
=
model
,
use_pure_fp16
=
True
)
model
=
paddle
.
amp
.
decorate
(
models
=
model
,
level
=
'O2'
,
save_dtype
=
'float32'
)
scaler
=
paddle
.
amp
.
GradScaler
(
init_loss_scaling
=
1024
)
scaler
=
ShardingScaler
(
scaler
)
optimizer
=
ShardingOptimizerStage2
(
params
=
model
.
parameters
(),
optim
=
optimizer
,
offload
=
offload
)
model
=
ShardingStage2
(
model
,
optimizer
,
buffer_max_size
=
2
**
21
)
train_reader
=
paddle
.
batch
(
reader_decorator
(
linear_size
),
batch_size
=
batch_size
,
drop_last
=
True
)
train_loader
=
paddle
.
io
.
DataLoader
.
from_generator
(
capacity
=
32
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
,
use_multiprocess
=
True
,
)
train_loader
.
set_sample_list_generator
(
train_reader
)
for
eop
in
range
(
epoch
):
model
.
train
()
for
batch_id
,
data
in
enumerate
(
train_loader
()):
img
,
label
=
data
label
.
stop_gradient
=
True
img
.
stop_gradient
=
True
with
paddle
.
amp
.
auto_cast
(
True
,
level
=
'O2'
):
out
=
model
(
img
)
loss
=
paddle
.
nn
.
functional
.
cross_entropy
(
input
=
out
,
label
=
label
)
avg_loss
=
paddle
.
mean
(
x
=
loss
.
cast
(
dtype
=
paddle
.
float32
))
scaler
.
scale
(
avg_loss
).
backward
()
scaler
.
step
(
optimizer
)
scaler
.
update
()
optimizer
.
clear_grad
()
for
dtype
in
optimizer
.
param_storages
:
for
dst_rank
,
param_storage
in
optimizer
.
param_storages
[
dtype
].
items
():
param_storage
.
to
(
device
=
"gpu"
,
dtype
=
dtype
)
return
model
.
parameters
()
def
test_sharding_stage2_offload
():
mlp
=
MLP
(
linear_size
)
mlp_offload
=
MLP
(
linear_size
)
mlp_offload
.
set_state_dict
(
mlp
.
state_dict
())
mlp_params
=
train_mlp
(
mlp
,
offload
=
False
)
mlp_offload_params
=
train_mlp
(
mlp_offload
,
offload
=
True
)
for
i
in
range
(
len
(
mlp_params
)):
np
.
testing
.
assert_allclose
(
mlp_params
[
i
].
numpy
(),
mlp_offload_params
[
i
].
numpy
(),
rtol
=
5e-3
,
atol
=
5e-3
,
)
return
if
__name__
==
'__main__'
:
fleet
.
init
(
is_collective
=
True
,
strategy
=
strategy
)
test_sharding_stage2_offload
()
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage3.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
shutil
import
tempfile
import
numpy
as
np
import
paddle
import
paddle.fluid
as
fluid
from
paddle.distributed
import
fleet
from
paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2
import
(
ShardingOptimizerStage2
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2
import
(
ShardingStage2
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3
import
(
ShardingStage3
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_utils
import
(
ShardingScaler
,
)
from
paddle.nn
import
Linear
epoch
=
10
paddle
.
seed
(
2021
)
np
.
random
.
seed
(
2021
)
base_lr
=
0.1
momentum_rate
=
0.9
l2_decay
=
1e-4
class
MLP
(
fluid
.
Layer
):
def
__init__
(
self
,
linear_size
=
1000
,
param_attr
=
None
,
bias_attr
=
None
):
super
().
__init__
()
self
.
_linear1
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear2
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear3
=
Linear
(
linear_size
,
10
)
def
forward
(
self
,
inputs
):
y
=
self
.
_linear1
(
inputs
)
y
=
self
.
_linear2
(
y
)
y
=
self
.
_linear3
(
y
)
return
y
def
reader_decorator
(
linear_size
=
1000
):
def
__reader__
():
for
_
in
range
(
100
):
img
=
np
.
random
.
rand
(
linear_size
).
astype
(
'float32'
)
label
=
np
.
ones
(
1
).
astype
(
'int64'
)
yield
img
,
label
return
__reader__
def
optimizer_setting
(
model
,
use_pure_fp16
,
opt_group
=
False
):
clip
=
paddle
.
nn
.
ClipGradByGlobalNorm
(
clip_norm
=
1.0
)
optimizer
=
paddle
.
optimizer
.
Momentum
(
parameters
=
[{
"params"
:
list
(
model
.
parameters
())}]
if
opt_group
else
list
(
model
.
parameters
()),
learning_rate
=
0.001
,
weight_decay
=
0.00001
,
grad_clip
=
clip
,
multi_precision
=
use_pure_fp16
,
)
return
optimizer
def
train_mlp
(
model
,
sharding_stage
,
use_pure_fp16
=
False
,
accumulate_grad
=
False
,
batch_size
=
100
,
opt_group
=
False
,
sync_comm
=
False
,
test_minimize
=
False
,
save_model
=
False
,
):
group
=
paddle
.
distributed
.
new_group
([
0
,
1
])
if
opt_group
:
optimizer
=
optimizer_setting
(
model
=
model
,
use_pure_fp16
=
use_pure_fp16
,
opt_group
=
opt_group
)
else
:
optimizer
=
optimizer_setting
(
model
=
model
,
use_pure_fp16
=
use_pure_fp16
)
if
use_pure_fp16
:
model
=
paddle
.
amp
.
decorate
(
models
=
model
,
level
=
'O2'
,
save_dtype
=
'float32'
)
scaler
=
paddle
.
amp
.
GradScaler
(
init_loss_scaling
=
32768
)
scaler
=
ShardingScaler
(
scaler
)
if
sharding_stage
==
2
:
optimizer
=
ShardingOptimizerStage2
(
params
=
model
.
parameters
(),
optim
=
optimizer
,
group
=
group
)
model
=
ShardingStage2
(
model
,
optimizer
,
group
=
group
,
buffer_max_size
=
2
**
21
)
elif
sharding_stage
==
3
:
model
=
ShardingStage3
(
model
,
optimizer
=
optimizer
,
group
=
group
,
sync_comm
=
sync_comm
)
# check optimizer.minimize() error
if
test_minimize
:
try
:
optimizer
.
minimize
()
except
:
print
(
"====== Find sharding_stage3_optimizer.minimize() error ======"
)
return
train_reader
=
paddle
.
batch
(
reader_decorator
(),
batch_size
=
batch_size
,
drop_last
=
True
)
train_loader
=
paddle
.
io
.
DataLoader
.
from_generator
(
capacity
=
32
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
,
use_multiprocess
=
True
,
)
train_loader
.
set_sample_list_generator
(
train_reader
)
for
eop
in
range
(
epoch
):
model
.
train
()
for
batch_id
,
data
in
enumerate
(
train_loader
()):
img
,
label
=
data
label
.
stop_gradient
=
True
img
.
stop_gradient
=
True
with
paddle
.
amp
.
auto_cast
(
True
,
level
=
'O2'
):
out
=
model
(
img
)
loss
=
paddle
.
nn
.
functional
.
cross_entropy
(
input
=
out
,
label
=
label
)
avg_loss
=
paddle
.
mean
(
x
=
loss
.
cast
(
dtype
=
paddle
.
float32
))
if
batch_size
==
20
:
avg_loss
=
avg_loss
/
5
if
not
use_pure_fp16
:
avg_loss
.
backward
()
else
:
scaler
.
scale
(
avg_loss
).
backward
()
if
not
accumulate_grad
:
if
not
use_pure_fp16
:
optimizer
.
step
()
else
:
scaler
.
step
(
optimizer
)
scaler
.
update
()
optimizer
.
clear_grad
()
if
accumulate_grad
:
if
not
use_pure_fp16
:
optimizer
.
step
()
else
:
scaler
.
step
(
optimizer
)
scaler
.
update
()
optimizer
.
clear_grad
()
if
sharding_stage
==
3
:
model
.
get_all_parameters
()
if
save_model
:
return
model
,
optimizer
return
model
.
parameters
()
def
test_stage2_stage3
():
mlp
,
mlp1
,
mlp2
,
mlp3
,
mlp4
,
mlp5
,
mlp6
,
mlp7
,
mlp8
,
mlp9
,
mlp10
=
(
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
)
state_dict
=
mlp
.
state_dict
()
mlp1
.
set_state_dict
(
state_dict
)
mlp2
.
set_state_dict
(
state_dict
)
mlp3
.
set_state_dict
(
state_dict
)
mlp4
.
set_state_dict
(
state_dict
)
mlp5
.
set_state_dict
(
state_dict
)
mlp6
.
set_state_dict
(
state_dict
)
mlp7
.
set_state_dict
(
state_dict
)
mlp8
.
set_state_dict
(
state_dict
)
mlp9
.
set_state_dict
(
state_dict
)
mlp10
.
set_state_dict
(
state_dict
)
# fp32
stage2_params
=
train_mlp
(
mlp1
,
sharding_stage
=
2
,
use_pure_fp16
=
False
,
opt_group
=
False
)
stage3_params
=
train_mlp
(
mlp2
,
sharding_stage
=
3
,
use_pure_fp16
=
False
,
opt_group
=
False
)
for
i
in
range
(
len
(
stage2_params
)):
np
.
testing
.
assert_allclose
(
stage2_params
[
i
].
numpy
(),
stage3_params
[
i
].
numpy
(),
rtol
=
1e-6
,
atol
=
1e-6
,
)
# fp32 accumulate grad
stage3_params
=
train_mlp
(
mlp3
,
sharding_stage
=
3
,
use_pure_fp16
=
False
,
accumulate_grad
=
True
,
opt_group
=
True
,
)
stage3_params_add
=
train_mlp
(
mlp4
,
sharding_stage
=
3
,
use_pure_fp16
=
False
,
accumulate_grad
=
True
,
batch_size
=
20
,
opt_group
=
True
,
)
for
i
in
range
(
len
(
stage3_params
)):
np
.
testing
.
assert_allclose
(
stage3_params
[
i
].
numpy
(),
stage3_params_add
[
i
].
numpy
(),
rtol
=
1e-6
,
atol
=
1e-4
,
)
# fp16
stage2_params
=
train_mlp
(
mlp5
,
sharding_stage
=
2
,
use_pure_fp16
=
True
,
opt_group
=
False
)
stage3_params
=
train_mlp
(
mlp6
,
sharding_stage
=
3
,
use_pure_fp16
=
True
,
opt_group
=
False
)
for
i
in
range
(
len
(
stage2_params
)):
np
.
testing
.
assert_allclose
(
stage2_params
[
i
].
numpy
(),
stage3_params
[
i
].
numpy
(),
rtol
=
1e-4
,
atol
=
1e-3
,
)
# fp16 sync_comm
stage3_params
=
train_mlp
(
mlp7
,
sharding_stage
=
3
,
use_pure_fp16
=
True
,
opt_group
=
False
)
stage3_params_re
=
train_mlp
(
mlp8
,
sharding_stage
=
3
,
use_pure_fp16
=
True
,
opt_group
=
False
,
sync_comm
=
True
,
)
for
i
in
range
(
len
(
stage3_params
)):
np
.
testing
.
assert_allclose
(
stage3_params
[
i
].
numpy
(),
stage3_params_re
[
i
].
numpy
(),
rtol
=
1e-6
)
# save/load model
output_dir
=
tempfile
.
mkdtemp
()
model_file
=
os
.
path
.
join
(
output_dir
,
"model.pdmodel"
)
optimizer_file
=
os
.
path
.
join
(
output_dir
,
"model.pdopt"
)
model_stage3
,
optimizer_stage3
=
train_mlp
(
mlp9
,
sharding_stage
=
3
,
use_pure_fp16
=
False
,
opt_group
=
False
,
save_model
=
True
,
)
paddle
.
save
(
model_stage3
.
state_dict
(),
model_file
)
paddle
.
save
(
optimizer_stage3
.
state_dict
(),
optimizer_file
)
m_state_dict
=
paddle
.
load
(
model_file
)
opt_state_dict
=
paddle
.
load
(
optimizer_file
)
model_stage3
.
set_state_dict
(
m_state_dict
)
optimizer_stage3
.
set_state_dict
(
opt_state_dict
)
shutil
.
rmtree
(
output_dir
)
# check optimizer.minimize() error
train_mlp
(
mlp10
,
sharding_stage
=
3
,
use_pure_fp16
=
False
,
opt_group
=
False
,
test_minimize
=
True
,
)
if
__name__
==
'__main__'
:
fleet
.
init
(
is_collective
=
True
)
test_stage2_stage3
()
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage3_offload.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
import
paddle
import
paddle.fluid
as
fluid
from
paddle.distributed
import
fleet
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3
import
(
ShardingStage3
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_utils
import
(
ShardingScaler
,
)
from
paddle.nn
import
Linear
epoch
=
10
paddle
.
seed
(
2022
)
np
.
random
.
seed
(
2022
)
base_lr
=
0.1
momentum_rate
=
0.9
l2_decay
=
1e-4
class
MLP
(
fluid
.
Layer
):
def
__init__
(
self
,
linear_size
=
1000
,
param_attr
=
None
,
bias_attr
=
None
):
super
().
__init__
()
self
.
_linear1
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear2
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear3
=
Linear
(
linear_size
,
10
)
def
forward
(
self
,
inputs
):
y
=
self
.
_linear1
(
inputs
)
y
=
self
.
_linear2
(
y
)
y
=
self
.
_linear3
(
y
)
return
y
def
reader_decorator
(
linear_size
=
1000
):
def
__reader__
():
for
_
in
range
(
100
):
img
=
np
.
random
.
rand
(
linear_size
).
astype
(
'float32'
)
label
=
np
.
ones
(
1
).
astype
(
'int64'
)
yield
img
,
label
return
__reader__
def
optimizer_setting
(
model
,
use_pure_fp16
,
opt_group
=
False
):
clip
=
paddle
.
nn
.
ClipGradByGlobalNorm
(
clip_norm
=
1.0
)
optimizer
=
paddle
.
optimizer
.
AdamW
(
parameters
=
[{
"params"
:
model
.
parameters
()}]
if
opt_group
else
model
.
parameters
(),
learning_rate
=
0.001
,
weight_decay
=
0.00001
,
grad_clip
=
clip
,
multi_precision
=
use_pure_fp16
,
)
return
optimizer
def
train_mlp
(
model
,
use_pure_fp16
=
False
,
accumulate_grad
=
False
,
offload
=
False
,
batch_size
=
100
,
convert2cpu
=
False
,
):
group
=
paddle
.
distributed
.
new_group
([
0
,
1
])
optimizer
=
optimizer_setting
(
model
=
model
,
use_pure_fp16
=
use_pure_fp16
)
if
use_pure_fp16
:
model
=
paddle
.
amp
.
decorate
(
models
=
model
,
level
=
'O2'
,
save_dtype
=
'float32'
)
scaler
=
paddle
.
amp
.
GradScaler
(
init_loss_scaling
=
32768
)
scaler
=
ShardingScaler
(
scaler
)
model
=
ShardingStage3
(
model
,
optimizer
=
optimizer
,
group
=
group
,
offload
=
offload
)
train_reader
=
paddle
.
batch
(
reader_decorator
(),
batch_size
=
batch_size
,
drop_last
=
True
)
train_loader
=
paddle
.
io
.
DataLoader
.
from_generator
(
capacity
=
32
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
,
use_multiprocess
=
True
,
)
train_loader
.
set_sample_list_generator
(
train_reader
)
for
eop
in
range
(
epoch
):
model
.
train
()
for
batch_id
,
data
in
enumerate
(
train_loader
()):
img
,
label
=
data
label
.
stop_gradient
=
True
img
.
stop_gradient
=
True
with
paddle
.
amp
.
auto_cast
(
True
,
level
=
'O2'
):
out
=
model
(
img
)
loss
=
paddle
.
nn
.
functional
.
cross_entropy
(
input
=
out
,
label
=
label
)
avg_loss
=
paddle
.
mean
(
x
=
loss
.
cast
(
dtype
=
paddle
.
float32
))
if
accumulate_grad
:
avg_loss
=
avg_loss
/
5
if
not
use_pure_fp16
:
avg_loss
.
backward
()
else
:
scaler
.
scale
(
avg_loss
).
backward
()
if
not
accumulate_grad
:
if
not
use_pure_fp16
:
optimizer
.
step
()
else
:
scaler
.
step
(
optimizer
)
scaler
.
update
()
optimizer
.
clear_grad
()
if
accumulate_grad
:
if
not
use_pure_fp16
:
optimizer
.
step
()
else
:
scaler
.
step
(
optimizer
)
scaler
.
update
()
optimizer
.
clear_grad
()
if
not
convert2cpu
:
model
.
get_all_parameters
()
else
:
model
.
get_all_parameters
(
convert2cpu
)
return
model
.
parameters
()
def
test_stage3_offload
():
mlp
,
mlp1
,
mlp2
,
mlp3
,
mlp4
,
mlp5
,
mlp6
=
(
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
MLP
(),
)
state_dict
=
mlp
.
state_dict
()
mlp1
.
set_state_dict
(
state_dict
)
mlp2
.
set_state_dict
(
state_dict
)
mlp3
.
set_state_dict
(
state_dict
)
mlp4
.
set_state_dict
(
state_dict
)
mlp5
.
set_state_dict
(
state_dict
)
mlp6
.
set_state_dict
(
state_dict
)
# fp32 offload
stage3_params
=
train_mlp
(
mlp1
,
use_pure_fp16
=
False
)
stage3_params_offload
=
train_mlp
(
mlp2
,
use_pure_fp16
=
False
,
offload
=
True
)
for
i
in
range
(
len
(
stage3_params
)):
np
.
testing
.
assert_allclose
(
stage3_params
[
i
].
numpy
(),
stage3_params_offload
[
i
].
numpy
(),
rtol
=
1e-6
,
atol
=
1e-8
,
)
# fp16 offload
stage3_params
=
train_mlp
(
mlp3
,
use_pure_fp16
=
True
)
stage3_params_offload
=
train_mlp
(
mlp4
,
use_pure_fp16
=
True
,
offload
=
True
)
for
i
in
range
(
len
(
stage3_params
)):
np
.
testing
.
assert_allclose
(
stage3_params
[
i
].
numpy
(),
stage3_params_offload
[
i
].
numpy
(),
rtol
=
1e-2
,
atol
=
1e-2
,
)
# fp32 accumulate grad offload
stage3_params
=
train_mlp
(
mlp5
,
use_pure_fp16
=
False
,
batch_size
=
20
,
accumulate_grad
=
True
)
stage3_params_offload
=
train_mlp
(
mlp6
,
use_pure_fp16
=
False
,
accumulate_grad
=
True
,
offload
=
True
,
batch_size
=
20
,
convert2cpu
=
True
,
)
for
i
in
range
(
len
(
stage3_params
)):
np
.
testing
.
assert_allclose
(
stage3_params
[
i
].
numpy
(),
stage3_params_offload
[
i
].
numpy
(),
rtol
=
1e-6
,
atol
=
1e-8
,
)
return
if
__name__
==
'__main__'
:
fleet
.
init
(
is_collective
=
True
)
test_stage3_offload
()
python/paddle/fluid/tests/unittests/collective/multinode/mn_dygraph_sharding_stage2.py
已删除
100644 → 0
浏览文件 @
2ca3d3f7
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
shutil
import
tempfile
import
numpy
as
np
import
paddle
import
paddle.fluid
as
fluid
from
paddle.distributed
import
fleet
from
paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2
import
(
ShardingOptimizerStage2
,
)
from
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2
import
(
ShardingStage2
,
)
from
paddle.nn
import
Linear
seed
=
2022
epoch
=
2
linear_size
=
1000
strategy
=
fleet
.
DistributedStrategy
()
strategy
.
hybrid_configs
=
{
"dp_degree"
:
16
,
"mp_degree"
:
1
,
"pp_degree"
:
1
,
"sharding_degree"
:
1
,
}
np
.
random
.
seed
(
seed
)
paddle
.
seed
(
seed
)
class
MLP
(
fluid
.
Layer
):
def
__init__
(
self
,
linear_size
=
1000
,
param_attr
=
None
,
bias_attr
=
None
):
super
().
__init__
()
self
.
_linear1
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear2
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear3
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear4
=
Linear
(
linear_size
,
linear_size
)
self
.
_linear5
=
Linear
(
linear_size
,
10
)
def
forward
(
self
,
inputs
):
y
=
self
.
_linear1
(
inputs
)
y
=
self
.
_linear2
(
y
)
y
=
self
.
_linear3
(
y
)
y
=
self
.
_linear4
(
y
)
y
=
self
.
_linear5
(
y
)
return
y
def
reader_decorator
(
linear_size
=
1000
):
def
__reader__
():
for
_
in
range
(
100
):
img
=
np
.
random
.
rand
(
linear_size
).
astype
(
'float32'
)
label
=
np
.
ones
(
1
).
astype
(
'int64'
)
yield
img
,
label
return
__reader__
def
optimizer_setting
(
model
,
use_pure_fp16
,
opt_group
=
False
):
clip
=
paddle
.
nn
.
ClipGradByGlobalNorm
(
clip_norm
=
1.0
)
optimizer
=
paddle
.
optimizer
.
AdamW
(
parameters
=
[{
"params"
:
model
.
parameters
()}]
if
opt_group
else
model
.
parameters
(),
learning_rate
=
0.001
,
weight_decay
=
0.00001
,
grad_clip
=
clip
,
multi_precision
=
use_pure_fp16
,
)
return
optimizer
def
train_mlp
(
model
,
sharding_stage
,
batch_size
=
100
,
use_pure_fp16
=
False
,
accumulate_grad
=
False
,
opt_group
=
False
,
save_model
=
False
,
):
if
sharding_stage
==
"dp"
:
hcg
=
fleet
.
get_hybrid_communicate_group
()
group
=
hcg
.
get_check_parallel_group
()
else
:
group
=
paddle
.
distributed
.
new_group
(
[
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
]
)
if
opt_group
:
optimizer
=
optimizer_setting
(
model
=
model
,
use_pure_fp16
=
use_pure_fp16
,
opt_group
=
opt_group
)
else
:
optimizer
=
optimizer_setting
(
model
=
model
,
use_pure_fp16
=
use_pure_fp16
)
if
sharding_stage
==
2
:
optimizer
=
ShardingOptimizerStage2
(
params
=
model
.
parameters
(),
optim
=
optimizer
,
group
=
group
)
model
=
ShardingStage2
(
model
,
optimizer
,
group
=
group
,
buffer_max_size
=
2
**
21
)
else
:
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
)
model
=
fleet
.
distributed_model
(
model
)
train_reader
=
paddle
.
batch
(
reader_decorator
(),
batch_size
=
batch_size
,
drop_last
=
True
)
train_loader
=
paddle
.
io
.
DataLoader
.
from_generator
(
capacity
=
32
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
,
use_multiprocess
=
True
,
)
train_loader
.
set_sample_list_generator
(
train_reader
)
if
sharding_stage
==
2
:
model
.
to
(
device
=
"gpu"
)
for
eop
in
range
(
epoch
):
model
.
train
()
for
batch_id
,
data
in
enumerate
(
train_loader
()):
img
,
label
=
data
label
.
stop_gradient
=
True
img
.
stop_gradient
=
True
out
=
model
(
img
)
loss
=
paddle
.
nn
.
functional
.
cross_entropy
(
input
=
out
,
label
=
label
)
avg_loss
=
paddle
.
mean
(
x
=
loss
.
cast
(
dtype
=
paddle
.
float32
))
if
batch_size
==
20
:
avg_loss
=
avg_loss
/
5
avg_loss
.
backward
()
if
not
accumulate_grad
:
optimizer
.
step
()
optimizer
.
clear_grad
()
if
accumulate_grad
:
optimizer
.
step
()
optimizer
.
clear_grad
()
if
save_model
:
return
model
,
optimizer
return
model
.
parameters
()
def
test_dp_stage2
():
mlp
=
MLP
()
state_dict
=
mlp
.
state_dict
()
mlp1
=
MLP
()
mlp2
=
MLP
()
mlp3
=
MLP
()
mlp4
=
MLP
()
mlp5
=
MLP
()
mlp6
=
MLP
()
mlp1
.
set_state_dict
(
state_dict
)
mlp2
.
set_state_dict
(
state_dict
)
mlp3
.
set_state_dict
(
state_dict
)
mlp4
.
set_state_dict
(
state_dict
)
mlp5
.
set_state_dict
(
state_dict
)
mlp6
.
set_state_dict
(
state_dict
)
# DP VS stage2
dp_params
=
train_mlp
(
mlp1
,
sharding_stage
=
"dp"
,
use_pure_fp16
=
False
,
opt_group
=
False
)
stage2_params
=
train_mlp
(
mlp2
,
sharding_stage
=
2
,
use_pure_fp16
=
False
,
opt_group
=
False
)
for
i
in
range
(
len
(
dp_params
)):
np
.
testing
.
assert_allclose
(
dp_params
[
i
].
numpy
(),
stage2_params
[
i
].
numpy
(),
rtol
=
1e-6
,
atol
=
5e-4
)
# stage2 accumulate grad
stage2_params
=
train_mlp
(
mlp3
,
sharding_stage
=
2
,
accumulate_grad
=
True
)
stage2_accumulate_grad
=
train_mlp
(
mlp4
,
sharding_stage
=
2
,
batch_size
=
20
,
accumulate_grad
=
True
)
for
i
in
range
(
len
(
stage2_params
)):
np
.
testing
.
assert_allclose
(
stage2_params
[
i
].
numpy
(),
stage2_accumulate_grad
[
i
].
numpy
(),
rtol
=
1e-5
,
atol
=
1e-5
,
)
# stage2 param list VS param group
stage2_params
=
train_mlp
(
mlp5
,
sharding_stage
=
2
,
use_pure_fp16
=
False
,
opt_group
=
True
)
for
i
in
range
(
len
(
dp_params
)):
np
.
testing
.
assert_allclose
(
dp_params
[
i
].
numpy
(),
stage2_params
[
i
].
numpy
(),
rtol
=
1e-6
,
atol
=
5e-4
)
# save/load model
output_dir
=
tempfile
.
mkdtemp
()
try
:
model_file
=
os
.
path
.
join
(
output_dir
,
"model.pdmodel"
)
optimizer_file
=
os
.
path
.
join
(
output_dir
,
"model.pdopt"
)
model_stage2
,
optimizer_stage2
=
train_mlp
(
mlp6
,
sharding_stage
=
2
,
use_pure_fp16
=
False
,
opt_group
=
False
,
save_model
=
True
,
)
paddle
.
save
(
model_stage2
.
state_dict
(),
model_file
)
paddle
.
save
(
optimizer_stage2
.
state_dict
(),
optimizer_file
)
m_state_dict
=
paddle
.
load
(
model_file
)
opt_state_dict
=
paddle
.
load
(
optimizer_file
)
model_stage2
.
set_state_dict
(
m_state_dict
)
optimizer_stage2
.
set_state_dict
(
opt_state_dict
)
except
Exception
as
e
:
shutil
.
rmtree
(
output_dir
)
raise
e
else
:
shutil
.
rmtree
(
output_dir
)
if
__name__
==
'__main__'
:
fleet
.
init
(
is_collective
=
True
,
strategy
=
strategy
)
test_dp_stage2
()
python/paddle/fluid/tests/unittests/collective/multinode/test_multinode_dygraph_sharding.py
浏览文件 @
2bbdc47a
...
...
@@ -25,13 +25,6 @@ class TestDYgrapShardingDP(TestDistBase):
self
.
_trainers
=
16
self
.
_init_env
()
def
test_hybrid_sharding_stage2
(
self
):
self
.
check_with_place
(
"mn_dygraph_sharding_stage2.py"
,
backend
=
"nccl"
,
need_envs
=
os
.
environ
,
)
def
test_hybrid_sharding_stage3
(
self
):
self
.
check_with_place
(
"mn_dygraph_group_sharded_stage3.py"
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录