Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)
Commit ffd40860 (unverified)
Authored by ShenLiang on Apr 19, 2021; committed by GitHub on Apr 19, 2021
[Hybrid Parallel] Support dp & mp in dygraph (#32323)
* support dp & mp
Parent commit: 4d69eeaa
Showing 14 changed files with 572 additions and 57 deletions (+572, -57)
python/paddle/distributed/fleet/base/fleet_base.py  (+27, -12)
python/paddle/distributed/fleet/base/meta_optimizer_factory.py  (+4, -0)
python/paddle/distributed/fleet/base/topology.py  (+25, -3)
python/paddle/distributed/fleet/meta_optimizers/__init__.py  (+1, -0)
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py  (+13, -0)
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py  (+58, -0)
python/paddle/distributed/fleet/meta_parallel/__init__.py  (+1, -0)
python/paddle/distributed/fleet/meta_parallel/meta_parallel_base.py  (+43, -0)
python/paddle/distributed/fleet/meta_parallel/model_parallel.py  (+29, -0)
python/paddle/distributed/fleet/utils/hybrid_parallel_util.py  (+96, -0)
python/paddle/fluid/dygraph/parallel.py  (+58, -41)
python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py  (+213, -0)
python/paddle/fluid/tests/unittests/test_parallel_dygraph_hybrid_parallel.py  (+3, -1)
python/setup.py.in  (+1, -0)
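Taken together, the change lets a dygraph program drive data parallelism and model parallelism through the same fleet entry points. The sketch below is a minimal usage outline distilled from the new unit test hybrid_parallel_mp_model.py; the 2-GPU launch, the toy Linear model, and the random batches are illustrative assumptions, while fleet.init with hybrid_configs, fleet.distributed_model, and fleet.distributed_optimizer are the APIs this commit wires up.

# Minimal dygraph hybrid-parallel flow (sketch; assumes a 2-GPU launch via
# paddle.distributed.launch and a placeholder model instead of a real network).
import paddle
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {"dp_degree": 1, "mp_degree": 2, "pp_degree": 1}
fleet.init(is_collective=True, strategy=strategy)

model = paddle.nn.Linear(10, 10)  # placeholder model
optimizer = paddle.optimizer.SGD(learning_rate=0.001,
                                 parameters=model.parameters())

# distributed_model wraps the model as DataParallel or ModelParallel depending
# on the parallel mode; distributed_optimizer wraps the inner optimizer in the
# new HybridParallelOptimizer when worker_num() > 1.
model = fleet.distributed_model(model)
optimizer = fleet.distributed_optimizer(optimizer)

for _ in range(5):
    x = paddle.rand([4, 10])          # placeholder batch
    loss = model(x).mean()
    loss.backward()
    optimizer.step()
    optimizer.clear_grad()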
python/paddle/distributed/fleet/base/fleet_base.py
@@ -27,6 +27,9 @@ from .runtime_factory import RuntimeFactory
from paddle.fluid.wrapped_decorator import wrap_decorator
from paddle.fluid.dygraph import parallel_helper
from . import topology as tp
from .topology import ParallelMode
from ..meta_parallel import ModelParallel
from ..meta_optimizers import HybridParallelOptimizer


def _inited_runtime_handler_(func):

@@ -219,6 +222,9 @@ class Fleet(object):
        if paddle.fluid.framework.in_dygraph_mode():
            if self.worker_num() == 1:
                # if worker_num is 1, should construct default topology & hcg
                self._topology = tp.CommunicateTopology()
                self._hcg = tp.HybridCommunicateGroup(self._topology)
                return
            if parallel_helper._is_parallel_ctx_initialized():
                warnings.warn(

@@ -694,10 +700,12 @@ class Fleet(object):
        self._context = {}

        # TODO(shenliang03): This is a temporary solution to support amp. In the case of a dynamic graph,
        # the optimizer is returned directly. This problem will be fixed in the future.
        if paddle.fluid.framework.in_dygraph_mode():
-           return optimizer
+           if self.worker_num() > 1:
+               return HybridParallelOptimizer(optimizer, self._hcg,
+                                              self._user_defined_strategy)
+           else:
+               return optimizer
        return self

    @dygraph_only

@@ -756,15 +764,22 @@ class Fleet(object):
        """
-       assert model is not None
-       self.model = paddle.DataParallel(
-           model,
-           comm_buffer_size=self._user_defined_strategy.fuse_grad_size_in_MB,
-           last_comm_buffer_size=self._user_defined_strategy.
-           last_comm_group_size_MB,
-           find_unused_parameters=self._user_defined_strategy.
-           find_unused_parameters)
-       return self.model
+       assert model is not None, "model should not be None"
+       if self.worker_num() <= 1:
+           return model
+       if self._hcg.get_parallel_mode() == ParallelMode.DATA_PARALLEL:
+           distributed_model = paddle.DataParallel(
+               model,
+               comm_buffer_size=self._user_defined_strategy.
+               fuse_grad_size_in_MB,
+               last_comm_buffer_size=self._user_defined_strategy.
+               last_comm_group_size_MB,
+               find_unused_parameters=self._user_defined_strategy.
+               find_unused_parameters)
+       elif self._hcg.get_parallel_mode() == ParallelMode.MODEL_PARALLEL:
+           distributed_model = ModelParallel(
+               model, self._hcg, strategy=self._user_defined_strategy)
+       return distributed_model

    @dygraph_only
    def state_dict(self):
python/paddle/distributed/fleet/base/meta_optimizer_factory.py
@@ -17,6 +17,10 @@ from ..meta_optimizers import *
meta_optimizer_names = list(
    filter(lambda name: name.endswith("Optimizer"), dir()))

# Because HybridParallelOptimizer is dygraph optimizer, it
# should be removed
meta_optimizer_names.remove("HybridParallelOptimizer")


class MetaOptimizerFactory(object):
    def __init__(self):
python/paddle/distributed/fleet/base/topology.py
@@ -24,8 +24,16 @@ __all__ = ['CommunicateTopology', 'HybridCommunicateGroup']
 _HYBRID_PARALLEL_GROUP = None


+class ParallelMode(object):
+    DATA_PARALLEL = 0
+    MODEL_PARALLEL = 1
+    PIPELINE_PARALLEL = 2
+
+
 class CommunicateTopology(object):
-    def __init__(self, hybrid_group_names, dims):
+    def __init__(self,
+                 hybrid_group_names=["data", "pipe", "model"],
+                 dims=[1, 1, 1]):
         self._parallel_names = hybrid_group_names
         self._dims = dims
         self.coordinate = collections.namedtuple('Coordinate',

@@ -118,15 +126,29 @@ class HybridCommunicateGroup(object):
        # create comm group for data parallel
        self._dp_group, self._dp_comm_group = self._set_comm_group("data")
        print("data parallel group", self._dp_group, file=sys.stderr)

        # create comm group for model parallel
        self._mp_group, self._mp_comm_group = self._set_comm_group("model")
        print("data parallel group", self._mp_group, file=sys.stderr)

        debug_str = "HybridParallelInfo: rank_id: %d, dp_degree: %d, " \
                    "mp_degree: %d, pp_degree: %d\n" % (self.global_rank,
                    self._dp_degree, self._mp_degree, self._pp_degree)
        debug_str += "dp_group: %s, mp_group: %s" % (self._dp_group,
                                                     self._mp_group)
        print(debug_str, file=sys.stderr)

        global _HYBRID_PARALLEL_GROUP
        _HYBRID_PARALLEL_GROUP = self

    def get_parallel_mode(self):
        # there are three modes : DataParallel / ModelParallel / PipelineParallel
        if self._mp_degree == 1 and self._pp_degree == 1:
            return ParallelMode.DATA_PARALLEL
        elif self._mp_degree > 1 and self._pp_degree == 1:
            # initialize the seed
            return ParallelMode.MODEL_PARALLEL
        elif self._pp_degree > 1:
            return ParallelMode.PIPELINE_PARALLEL

    def _check_vaild_topo(self):
        return self._dp_degree * self._mp_degree * self._pp_degree == self.nranks
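As a quick gloss on the logic above (a standalone illustration, not Paddle API): get_parallel_mode picks the mode from the mp/pp degrees, and _check_vaild_topo requires the three degrees to multiply out to the total rank count.

# Standalone sketch mirroring get_parallel_mode and _check_vaild_topo.
DATA_PARALLEL, MODEL_PARALLEL, PIPELINE_PARALLEL = 0, 1, 2

def parallel_mode(mp_degree, pp_degree):
    if mp_degree == 1 and pp_degree == 1:
        return DATA_PARALLEL
    elif mp_degree > 1 and pp_degree == 1:
        return MODEL_PARALLEL
    elif pp_degree > 1:
        return PIPELINE_PARALLEL

def is_valid_topo(dp_degree, mp_degree, pp_degree, nranks):
    return dp_degree * mp_degree * pp_degree == nranks

assert parallel_mode(mp_degree=2, pp_degree=1) == MODEL_PARALLEL
assert is_valid_topo(dp_degree=4, mp_degree=2, pp_degree=1, nranks=8)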
python/paddle/distributed/fleet/meta_optimizers/__init__.py
@@ -25,3 +25,4 @@ from .dgc_optimizer import DGCOptimizer
from .lamb_optimizer import LambOptimizer
from .fp16_allreduce_optimizer import FP16AllReduceOptimizer
from .sharding_optimizer import ShardingOptimizer
from .dygraph_optimizer import HybridParallelOptimizer
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py
new file (mode 100644)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and

from .hybrid_parallel_optimizer import HybridParallelOptimizer
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
new file (mode 100644)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.optimizer import Optimizer
from ...utils.hybrid_parallel_util import fused_allreduce_gradients
from ...base.topology import ParallelMode
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid import framework
from paddle.fluid.framework import Variable


class HybridParallelOptimizer:
    def __init__(self, optimizer, hcg, strategy):
        self._inner_opt = optimizer
        self._strategy = strategy
        self._hcg = hcg
        self._is_mp = (
            self._hcg.get_parallel_mode() == ParallelMode.MODEL_PARALLEL)
        self._need_dp = (self._hcg.get_data_parallel_world_size() > 1)

    @imperative_base.no_grad
    @framework.dygraph_only
    def step(self):
        if self._is_mp and self._need_dp:
            fused_allreduce_gradients(
                list(self._inner_opt._parameter_list), self._hcg)
        self._inner_opt.step()

    @imperative_base.no_grad
    def minimize(self,
                 loss,
                 startup_program=None,
                 parameters=None,
                 no_grad_set=None):
        assert isinstance(loss, Variable), "The loss should be an Tensor."

        parameter_list = parameters if parameters \
            else self._parameter_list

        if self._is_mp and self._need_dp:
            fused_allreduce_gradients(list(parameter_list), self._hcg)

        return self._inner_opt.minimize(loss, startup_program, parameters,
                                        no_grad_set)

    def __getattr__(self, item):
        return getattr(self._inner_opt, item)
python/paddle/distributed/fleet/meta_parallel/__init__.py
@@ -13,3 +13,4 @@
# limitations under the License.

from .mp_utils import *
from .model_parallel import ModelParallel
python/paddle/distributed/fleet/meta_parallel/meta_parallel_base.py
new file (mode 100644)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.fluid.dygraph.layers import Layer
import logging


class MetaParallelBase(Layer):
    def __init__(self, layers, hcg, strategy):
        super(MetaParallelBase,
              self).__init__(layers.full_name() + "_meta_parallel_base")
        self._layers = layers
        self._hcg = hcg
        self._prepare_for_model()

    def _prepare_for_model(self):
        pass

    def _pre_forward(self, *inputs, **kwargs):
        pass

    def forward(self, *inputs, **kwargs):
        self._pre_forward(*inputs, **kwargs)

        output = self._layers(*inputs, **kwargs)

        self._post_forward(output)

        return output

    def _post_forward(self, output):
        pass
python/paddle/distributed/fleet/meta_parallel/model_parallel.py
new file (mode 100644)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.fluid.dygraph.layers import Layer
from .meta_parallel_base import MetaParallelBase
from ..utils.hybrid_parallel_util import *


class ModelParallel(MetaParallelBase):
    def __init__(self, layers, hcg, **kwargs):
        super(ModelParallel, self).__init__(layers, hcg, **kwargs)

    def _prepare_for_model(self):
        broadcast_mp_parameters(self._layers, self._hcg)
        broadcast_dp_parameters(self._layers, self._hcg)

    def _pre_forward(self, *inputs, **kwargs):
        return broadcast_input_data(self._hcg, *inputs, **kwargs)
python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
new file (mode 100644)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import six
import numpy as np
import warnings

from paddle import framework
import paddle
from paddle.fluid import core
from paddle.fluid.dygraph.parallel import _split_tensors, sync_params_buffers, construct_groups
from collections import OrderedDict


def _apply_collective_grads(parameters, comm_group):
    grad_var_set = set()
    grad_vars = []
    sparse_grad_vars = []

    for param in parameters:
        if param.trainable and (param._grad_ivar() is not None):
            g_var = param._grad_ivar()
            assert not g_var._is_sparse(
            ), "Now, it doesn't support sparse parameters"
            grad_vars.append(g_var)
            assert g_var not in grad_var_set
            grad_var_set.add(g_var)

    coalesced_grads_and_vars = construct_groups(grad_vars, 128 * 1024 * 1024)

    for coalesced_grad, _, _ in coalesced_grads_and_vars:
        # need to div nranks
        coalesced_grad = coalesced_grad / comm_group.nranks
        paddle.distributed.all_reduce(coalesced_grad, group=comm_group)

    _split_tensors(coalesced_grads_and_vars)


def broadcast_input_data(hcg, *inputs, **kwargs):
    model_parallel_group = hcg.get_model_parallel_group()
    src_rank = hcg.get_model_parallel_group_src_rank()

    for input_ in inputs:
        if isinstance(input_, core.VarBase):
            with framework.no_grad():
                paddle.distributed.broadcast(
                    input_,
                    src=src_rank,
                    group=model_parallel_group,
                    use_calc_stream=True)
        else:
            print("it doesn't support data type {}".format(type(input_)))

    for k, v in kwargs.items():
        if isinstance(v, core.VarBase):
            with framework.no_grad():
                paddle.distributed.broadcast(
                    v,
                    src=src_rank,
                    group=model_parallel_group,
                    use_calc_stream=True)
            kwargs[k] = v
        else:
            print("it doesn't support data type {}".format(type(v)))
    return inputs, kwargs


def broadcast_mp_parameters(model, hcg):
    model_parallel_group = hcg.get_model_parallel_group()
    src_rank = hcg.get_model_parallel_group_src_rank()
    sync_params_buffers(
        model, model_parallel_group, src_rank, is_model_parallel=True)


def broadcast_dp_parameters(model, hcg):
    data_parallel_group = hcg.get_data_parallel_group()
    src_rank = hcg.get_data_parallel_group_src_rank()
    sync_params_buffers(
        model, data_parallel_group, src_rank, is_model_parallel=False)


def fused_allreduce_gradients(parameter_list, hcg):
    data_parallel_group = hcg.get_data_parallel_group()
    with framework.no_grad():
        _apply_collective_grads(parameter_list, data_parallel_group)
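The helper above implements the usual fuse-then-allreduce pattern: trainable dense gradients are coalesced into large buffers (128 MB groups), each buffer is averaged across the data-parallel group in a single all_reduce, and _split_tensors scatters the result back into the original gradient tensors. The snippet below is a plain-NumPy illustration of that idea under the assumption of two ranks; it is not Paddle code.

import numpy as np

def fused_average(grads_per_rank):
    """grads_per_rank: one list of per-parameter gradients per rank."""
    nranks = len(grads_per_rank)
    shapes = [g.shape for g in grads_per_rank[0]]
    # coalesce: flatten and concatenate each rank's gradients into one buffer
    buffers = [np.concatenate([g.ravel() for g in grads])
               for grads in grads_per_rank]
    # stand-in for all_reduce followed by division by nranks
    reduced = sum(buffers) / nranks
    # split the fused buffer back into per-parameter tensors
    sizes = np.cumsum([int(np.prod(s)) for s in shapes])[:-1]
    return [part.reshape(shape)
            for part, shape in zip(np.split(reduced, sizes), shapes)]

rank0 = [np.ones((2, 2)), np.full((3,), 2.0)]
rank1 = [np.zeros((2, 2)), np.full((3,), 4.0)]
avg = fused_average([rank0, rank1])
assert np.allclose(avg[0], 0.5) and np.allclose(avg[1], 3.0)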
python/paddle/fluid/dygraph/parallel.py
@@ -25,6 +25,7 @@ from paddle.fluid.dygraph import parallel_helper
from paddle.fluid.dygraph import to_variable, no_grad
from paddle.utils import deprecated
from ..layers import collective
from paddle.fluid.dygraph import base as imperative_base
import warnings
import paddle
import itertools

@@ -320,6 +321,62 @@ def scale_loss(loss):
    return scaled_loss


@imperative_base.no_grad
@framework.dygraph_only
def construct_groups(vars, group_size):
    group_idx = 0
    memory_counter = 0
    var_groups = OrderedDict()
    dtype = vars[0].dtype

    for var in vars:
        bytes = np.prod(var.shape) * core.size_of_dtype(var.dtype)
        if memory_counter < group_size and dtype == var.dtype:
            memory_counter += bytes
        else:
            memory_counter = 0
            dtype = var.dtype
            group_idx += 1
        var_groups.setdefault(group_idx, []).append(var)
    return _coalesce_tensors(var_groups)


@imperative_base.no_grad
@framework.dygraph_only
def sync_params_buffers(model,
                        comm_group=None,
                        src_rank=0,
                        is_model_parallel=False):
    model_vars = []
    for _, param in model.state_dict().items():
        if not isinstance(param, core.VarBase):
            raise TypeError("The data type of '%s' must be Varbase" %
                            param.name)

        # is_distributed param not need to sync when in mp mode
        if is_model_parallel and param.is_distributed:
            continue

        model_vars.append(param.detach())
    if len(model_vars) == 0:
        return

    # group size is 128M
    coalesced_vars = construct_groups(model_vars, 128 * 1024 * 1024)

    for coalesced_var, _, _ in coalesced_vars:
        paddle.distributed.broadcast(
            coalesced_var, src=src_rank, group=comm_group, use_calc_stream=True)

    for coalesced_var, origin_vars, var_shapes in coalesced_vars:
        var_len = [np.prod(v_shape) for v_shape in var_shapes]
        paddle.fluid.framework._dygraph_tracer().trace_op(
            type='split',
            inputs={'X': coalesced_var},
            outputs={'Out': origin_vars},
            attrs={'sections': var_len,
                   'axis': 0})


class DataParallel(layers.Layer):
    """
    Run the dygraph module with data parallelism.

@@ -443,7 +500,7 @@ class DataParallel(layers.Layer):
            # TODO(liuyuhui) Currently not support xpu. xpu is
            # still broadcasting parameters when calling layer
            if not paddle.is_compiled_with_xpu():
-               self._sync_params_buffers()
+               sync_params_buffers(self._layers)

            self.comm_buffer_size = int(comm_buffer_size * 1024 * 1024)
            # NOTE(shenliang03): We can set environment variables to control

@@ -516,46 +573,6 @@ class DataParallel(layers.Layer):
            return itertools.chain(*map(self._find_varbase, obj.values()))
        return []

-   def _sync_params_buffers(self):
-       model_vars = []
-       for _, param in self._layers.state_dict().items():
-           if not isinstance(param, core.VarBase):
-               raise TypeError("The data type of '%s' must be Varbase" %
-                               param.name)
-           model_vars.append(param.detach())
-       if len(model_vars) == 0:
-           return
-
-       mega_bytes = 128 * 1024 * 1024
-       group_idx = 0
-       memory_counter = 0
-       var_groups = OrderedDict()
-       dtype = model_vars[0].dtype
-
-       for var in model_vars:
-           bytes = np.prod(var.shape) * core.size_of_dtype(var.dtype)
-           if memory_counter < mega_bytes and dtype == var.dtype:
-               memory_counter += bytes
-           else:
-               memory_counter = 0
-               dtype = var.dtype
-               group_idx += 1
-           var_groups.setdefault(group_idx, []).append(var)
-
-       coalesced_vars = _coalesce_tensors(var_groups)
-
-       for coalesced_var, _, _ in coalesced_vars:
-           collective._broadcast(coalesced_var, root=0, sync_mode=True)
-
-       for coalesced_var, origin_vars, var_shapes in coalesced_vars:
-           var_len = [np.prod(v_shape) for v_shape in var_shapes]
-           framework._dygraph_tracer().trace_op(
-               type='split',
-               inputs={'X': coalesced_var},
-               outputs={'Out': origin_vars},
-               attrs={'sections': var_len,
-                      'axis': 0})
-
    def forward(self, *inputs, **kwargs):
        outputs = self._layers(*inputs, **kwargs)
        if self._strategy.nranks > 1 and framework._dygraph_tracer()._has_grad:
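For reference, the bucketing rule used by the new construct_groups (and by the removed _sync_params_buffers method it generalizes) can be summarised by the standalone sketch below, which groups variables by a byte budget and by dtype; the parameter shapes and sizes are illustrative only.

import numpy as np
from collections import OrderedDict

def group_by_size_and_dtype(vars_, group_size):
    """vars_: list of (shape, dtype) pairs; mirrors the grouping loop above."""
    group_idx, memory_counter = 0, 0
    var_groups = OrderedDict()
    dtype = vars_[0][1]
    for shape, dt in vars_:
        nbytes = int(np.prod(shape)) * np.dtype(dt).itemsize
        if memory_counter < group_size and dtype == dt:
            memory_counter += nbytes
        else:
            # a full bucket or a dtype change starts a new group
            memory_counter = 0
            dtype = dt
            group_idx += 1
        var_groups.setdefault(group_idx, []).append((shape, dt))
    return var_groups

params = [((1024, 1024), "float32"), ((1024,), "float32"), ((1024,), "float16")]
groups = group_by_size_and_dtype(params, group_size=128 * 1024 * 1024)
assert len(groups) == 2  # the float16 variable starts a new group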
python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py
new file (mode 100644)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import division
from __future__ import print_function

import paddle
import numpy as np
import random
import paddle.distributed as dist
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet
import paddle.fluid.generator as generator
from paddle.io import DataLoader, Dataset
import unittest


def set_random_seed(seed, dp_id, rank_id):
    """Set random seed for reproducability."""
    random.seed(seed)
    np.random.seed(seed + dp_id)
    paddle.seed(seed + rank_id)


vocab_size = 5
hidden_size = 10
inner_size = 8
output_size = 2
seq_length = 2


class SimpleMPNet(fluid.dygraph.Layer):
    def __init__(self, vocab_size, hidden_size, inner_size, output_size,
                 np_fc1, np_fc2, mp_id):
        super(SimpleMPNet, self).__init__()

        if mp_id == 0:
            init_fc1_data = np_fc1[:, :(inner_size // 2)]
            init_fc2_data = np_fc2[:(inner_size // 2), :]
        else:
            init_fc1_data = np_fc1[:, (inner_size // 2):]
            init_fc2_data = np_fc2[(inner_size // 2):, :]

        self.linear1 = fleet.meta_parallel.ColumnParallelLinear(
            hidden_size,
            inner_size,
            weight_attr=paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.Assign(init_fc1_data)),
            gather_output=False,
            has_bias=True)

        self.linear2 = fleet.meta_parallel.RowParallelLinear(
            inner_size,
            hidden_size,
            weight_attr=paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.Assign(init_fc2_data)),
            input_is_parallel=True,
            has_bias=True)

        self.linear3 = paddle.nn.Linear(
            hidden_size,
            output_size,
            weight_attr=paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.Constant(0.0)),
            bias_attr=paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.Constant(0.0)))

        self.embedding = fleet.meta_parallel.VocabParallelEmbedding(
            vocab_size,
            hidden_size,
            weight_attr=paddle.nn.initializer.Constant(value=0.5))

    def forward(self, x):
        x = self.embedding(x)
        x = self.linear1(x)
        x = self.linear2(x)
        x = self.linear3(x)
        return x


class SimpleDPNet(fluid.dygraph.Layer):
    def __init__(self, vocab_size, hidden_size, inner_size, output_size,
                 np_fc1, np_fc2):
        super(SimpleDPNet, self).__init__()

        self.linear1 = paddle.nn.Linear(
            hidden_size,
            inner_size,
            weight_attr=paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.Assign(np_fc1)),
            bias_attr=paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.Constant(0.0)))

        self.linear2 = paddle.nn.Linear(
            inner_size,
            hidden_size,
            weight_attr=paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.Assign(np_fc2)),
            bias_attr=paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.Constant(0.0)))

        self.linear3 = paddle.nn.Linear(
            hidden_size,
            output_size,
            weight_attr=paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.Constant(0.0)),
            bias_attr=paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.Constant(0.0)))

        self.embedding = paddle.nn.Embedding(
            vocab_size,
            hidden_size,
            weight_attr=paddle.nn.initializer.Constant(value=0.5))

    def forward(self, x):
        x = self.embedding(x)
        x = self.linear1(x)
        x = self.linear2(x)
        x = self.linear3(x)
        return x


class TrainDataset(Dataset):
    def __init__(self, length):
        self.length = length

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        np_input_data = np.random.randint(0, vocab_size, (seq_length, ))
        return np_input_data


class TestDistTraning(unittest.TestCase):
    def setUp(self):
        strategy = fleet.DistributedStrategy()
        self.model_parallel_size = 2
        self.data_parallel_size = 1
        strategy.hybrid_configs = {
            "dp_degree": self.data_parallel_size,
            "mp_degree": self.model_parallel_size,
            "pp_degree": 1
        }
        fleet.init(is_collective=True, strategy=strategy)

    def test_mp_model(self):
        hcg = fleet.get_hybrid_communicate_group()
        word_size = hcg.get_model_parallel_world_size()
        mp_id = hcg.get_model_parallel_rank()
        dp_id = hcg.get_data_parallel_rank()
        rank_id = dist.get_rank()
        set_random_seed(1024, dp_id, rank_id)

        np_fc1 = np.random.random_sample((hidden_size, inner_size))
        np_fc2 = np.random.random_sample((inner_size, hidden_size))

        train_data = TrainDataset(length=10000)

        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_data,
            batch_size=4,
            shuffle=False,
            num_replicas=self.data_parallel_size,
            rank=dp_id)
        train_data_loader = DataLoader(
            dataset=train_data,
            batch_sampler=train_batch_sampler,
            num_workers=0,
            return_list=True)

        model_a = SimpleMPNet(vocab_size, hidden_size, inner_size, output_size,
                              np_fc1, np_fc2, mp_id)
        optimizer_a = paddle.optimizer.SGD(learning_rate=0.001,
                                           parameters=model_a.parameters())
        model_a = fleet.distributed_model(model_a)
        optimizer_a = fleet.distributed_optimizer(optimizer_a)

        model_b = SimpleDPNet(vocab_size, hidden_size, inner_size, output_size,
                              np_fc1, np_fc2)
        optimizer_b = paddle.optimizer.SGD(learning_rate=0.001,
                                           parameters=model_b.parameters())

        for step, batch in enumerate(train_data_loader):
            if step > 5:
                return

            output_a = model_a(batch)
            loss_a = output_a.mean()
            loss_a.backward()
            optimizer_a.step()
            optimizer_a.clear_grad()

            output_b = model_b(batch)
            loss_b = output_b.mean()
            loss_b.backward()
            optimizer_b.step()
            optimizer_b.clear_grad()

            np.testing.assert_allclose(loss_a.numpy(), loss_b.numpy())


if __name__ == "__main__":
    unittest.main()
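The equivalence this test asserts rests on how SimpleMPNet shards the two dense weights: np_fc1 is split by columns across the two model-parallel ranks (ColumnParallelLinear) and np_fc2 by rows (RowParallelLinear), so summing the per-rank partial products reproduces the full matmul that SimpleDPNet computes. A small NumPy check of that identity, using the same shapes as the test (biases and the embedding left out for brevity):

import numpy as np

hidden_size, inner_size = 10, 8
np_fc1 = np.random.random_sample((hidden_size, inner_size))
np_fc2 = np.random.random_sample((inner_size, hidden_size))
x = np.random.random_sample((4, hidden_size))

full = x @ np_fc1 @ np_fc2  # what SimpleDPNet computes

half = inner_size // 2
part0 = (x @ np_fc1[:, :half]) @ np_fc2[:half, :]   # mp rank 0
part1 = (x @ np_fc1[:, half:]) @ np_fc2[half:, :]   # mp rank 1
np.testing.assert_allclose(full, part0 + part1, rtol=1e-6)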
python/paddle/fluid/tests/unittests/test_parallel_dygraph_hybrid_parallel.py
@@ -15,7 +15,6 @@
from __future__ import print_function

import unittest
import time
import paddle.fluid as fluid

from test_parallel_dygraph_dataparallel import TestMultipleGpus

@@ -28,6 +27,9 @@ class TestHybridParallel(TestMultipleGpus):
    def test_hybrid_parallel_mp_random(self):
        self.run_mnist_2gpu('hybrid_parallel_mp_random.py')

    def test_hybrid_parallel_mp_model(self):
        self.run_mnist_2gpu('hybrid_parallel_mp_model.py')


if __name__ == "__main__":
    unittest.main()
python/setup.py.in
@@ -150,6 +150,7 @@ packages=['paddle',
          'paddle.distributed.fleet.meta_optimizers',
          'paddle.distributed.fleet.meta_optimizers.sharding',
          'paddle.distributed.fleet.meta_optimizers.ascend',
          'paddle.distributed.fleet.meta_optimizers.dygraph_optimizer',
          'paddle.distributed.fleet.runtime',
          'paddle.distributed.fleet.dataset',
          'paddle.distributed.fleet.data_generator',