Paddle commit 2e82b6c8 (unverified)
Authored by ShenLiang on Apr 06, 2021; committed via GitHub on Apr 06, 2021
[Hybrid Parallel] Add Topology for hybrid communicate (#32011)
* support hybrid parallel, add topology
* fix unit test
Parent commit: a3b08bad

Showing 6 changed files with 368 additions and 9 deletions (+368, -9)
python/paddle/distributed/fleet/__init__.py  (+5, -8)
python/paddle/distributed/fleet/base/topology.py  (+176, -0)
python/paddle/fluid/tests/unittests/CMakeLists.txt  (+1, -1)
python/paddle/fluid/tests/unittests/hybrid_communicate_group.py  (+101, -0)
python/paddle/fluid/tests/unittests/test_hybrid_parallel_topology.py  (+84, -0)
python/paddle/fluid/tests/unittests/test_new_group.sh  (+1, -0)
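For orientation (an editorial illustration, not part of the commit): the new CommunicateTopology class maps an N-dimensional process grid, one axis per kind of parallelism, onto flat ranks, and HybridCommunicateGroup builds the data-parallel and model-parallel communication groups from that mapping. A minimal standalone sketch of the coordinate-to-rank enumeration the class relies on, using example axis names ["dp", "mp", "pp"] and dims [2, 2, 2]:

    import collections
    from itertools import product

    # Coordinates are enumerated with the last axis varying fastest, so for
    # dims [2, 2, 2] the coordinate (dp, mp, pp) lands on rank dp*4 + mp*2 + pp.
    Coordinate = collections.namedtuple('Coordinate', ["dp", "mp", "pp"])
    coords = [Coordinate(*c) for c in product(range(2), range(2), range(2))]
    coord2rank = {c: r for r, c in enumerate(coords)}

    assert coord2rank[Coordinate(dp=1, mp=0, pp=1)] == 5
    # Ranks that agree on every axis except "dp" form one data-parallel group:
    dp_group0 = sorted(r for c, r in coord2rank.items() if (c.mp, c.pp) == (0, 0))
    assert dp_group0 == [0, 4]

This is the same grouping that the new unit test below expects (its dp_comm_list starts with [0, 4]).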
python/paddle/distributed/fleet/__init__.py
@@ -20,16 +20,13 @@ from .base.util_factory import UtilBase
 from .dataset import *
 from .data_generator import MultiSlotDataGenerator, MultiSlotStringDataGenerator
 from . import metrics
+from .base.topology import CommunicateTopology, HybridCommunicateGroup

 __all__ = [
-    "DistributedStrategy",
-    "UtilBase",
-    "UserDefinedRoleMaker",
-    "PaddleCloudRoleMaker",
-    "Fleet",
-    "MultiSlotDataGenerator",
-    "MultiSlotStringDataGenerator",
-    "Role",
+    "DistributedStrategy", "UtilBase", "UserDefinedRoleMaker",
+    "PaddleCloudRoleMaker", "Fleet", "MultiSlotDataGenerator",
+    "MultiSlotStringDataGenerator", "Role", "CommunicateTopology",
+    "HybridCommunicateGroup"
 ]

 fleet = Fleet()
python/paddle/distributed/fleet/base/topology.py
new file mode 100644
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import collections
import numpy as np
from itertools import product
from functools import reduce

__all__ = ['CommunicateTopology', 'HybridCommunicateGroup']


class CommunicateTopology(object):
    def __init__(self, hybrid_group_names, dims):
        self._parallel_names = hybrid_group_names
        self._dims = dims
        self.coordinate = collections.namedtuple('Coordinate',
                                                 self._parallel_names)
        self._world_size = reduce(lambda x, y: x * y, self._dims)

        ranges = [range(d) for d in self._dims]
        all_coordinate = [self.coordinate(*x) for x in product(*ranges)]

        self._coord2rank = dict(zip(all_coordinate, range(len(all_coordinate))))
        self._rank2coord = dict(
            zip(self._coord2rank.values(), self._coord2rank.keys()))

    def get_hybrid_group_names(self):
        return self._parallel_names

    def get_dim(self, axis_name):
        return self._dims[self._parallel_names.index(axis_name)]

    def world_size(self):
        return self._world_size

    def get_rank(self, **args):
        assert len(args) == len(self._dims)
        key = self.coordinate(**args)
        assert key in self._coord2rank.keys()
        return self._coord2rank[key]

    def get_coord(self, rank):
        assert rank < self._world_size
        assert rank in self._rank2coord.keys()
        return self._rank2coord[rank]

    def get_axis_list(self, axis_name, index):
        axis = self._parallel_names.index(axis_name)
        ranks = [
            self._coord2rank[coord] for coord in self._coord2rank.keys()
            if coord[axis] == index
        ]
        ranks.sort()
        return ranks

    def get_dim_size(self, axis_name):
        assert axis_name in self._parallel_names
        return self._dims[self._parallel_names.index(axis_name)]

    def get_comm_list(self, axis_name):
        assert axis_name in self._parallel_names
        other_axis_names = [
            name for name in self._parallel_names if name != axis_name
        ]

        ranges = []
        for name in other_axis_names:
            dim_num = self.get_dim_size(name)
            ranges.append(range(dim_num))

        all_result = []
        for x in product(*ranges):
            key_coord = {}
            for other_name in other_axis_names:
                key_coord[other_name] = x[other_axis_names.index(other_name)]

            result = []
            for i in range(0, self.get_dim_size(axis_name)):
                key_coord[axis_name] = i
                result.append(self._coord2rank[self.coordinate(**key_coord)])
            all_result.append(result)

        return all_result


class HybridCommunicateGroup(object):
    def __init__(self, topology):
        self.nranks = paddle.distributed.get_world_size()
        self.global_rank = paddle.distributed.get_rank()
        self._topo = topology

        self._num_data_parallel = self._topo.get_dim('data')
        self._num_model_parallel = self._topo.get_dim('model')
        self._num_pipe_parallel = self._topo.get_dim('pipe')

        self._data_parallel_id = self._get_data_parallel_id()
        self._model_parallel_id = self._get_model_parallel_id()

        assert self._check_vaild_topo(), "Here is an unreasonable topogy setting"

        # create comm group for data parallel
        self._dp_group, self._dp_comm_group = self._set_comm_group("data")
        print("data parallel group", self._dp_group)

        # create comm group for model parallel
        self._mp_group, self._mp_comm_group = self._set_comm_group("model")
        print("model parallel group", self._mp_group)

    def _check_vaild_topo(self):
        return self._num_data_parallel * self._num_model_parallel * \
            self._num_pipe_parallel == self.nranks

    def _set_comm_group(self, parallel_method="data"):
        parallel_group = []
        parallel_comm_group = None
        parallel_groups = self._topo.get_comm_list(parallel_method)

        for group in parallel_groups:
            comm_group = paddle.distributed.new_group(ranks=group)
            if self.global_rank in group:
                parallel_group = group
                parallel_comm_group = comm_group

        assert len(parallel_group) > 0
        assert parallel_comm_group is not None

        return parallel_group, parallel_comm_group

    def topology(self):
        return self._topo

    def get_global_rank(self):
        return self.global_rank

    # data parallel message:
    def _get_data_parallel_id(self):
        return self._topo.get_coord(self.global_rank).data

    def get_data_parallel_rank(self):
        return self._data_parallel_id

    def get_data_parallel_world_size(self):
        return self._num_data_parallel

    def get_data_parallel_group(self):
        return self._dp_comm_group

    def get_data_parallel_group_src_rank(self):
        return self._dp_comm_group.ranks[0]

    # model parallel message:
    def _get_model_parallel_id(self):
        return self._topo.get_coord(self.global_rank).model

    def get_model_parallel_rank(self):
        return self._model_parallel_id

    def get_model_parallel_world_size(self):
        return self._num_model_parallel

    def get_model_parallel_group(self):
        return self._mp_comm_group

    def get_model_parallel_group_src_rank(self):
        return self._mp_comm_group.ranks[0]
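A minimal usage sketch (editorial, not part of the commit; it assumes a job started with paddle.distributed.launch across 4 processes, and the 2x2x1 dims are only an example). Note that HybridCommunicateGroup looks up the axes by the names 'data', 'model' and 'pipe', so the topology handed to it must use exactly those names:

    import paddle
    from paddle.distributed import fleet

    paddle.distributed.init_parallel_env()

    # example dims; their product must equal the number of launched processes
    topo = fleet.CommunicateTopology(["data", "model", "pipe"], [2, 2, 1])
    hcg = fleet.HybridCommunicateGroup(topo)

    dp_group = hcg.get_data_parallel_group()   # group for data-parallel collectives
    mp_rank = hcg.get_model_parallel_rank()    # this process's index on the 'model' axis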
python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -524,7 +524,6 @@ if(WITH_DISTRIBUTE)
     if(WITH_GPU OR WITH_ROCM)
         bash_test_modules(test_c_comm_init_op START_BASH test_c_comm_init_op.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
         py_test_modules(test_launch_coverage MODULES test_launch_coverage)
-        bash_test_modules(test_new_group START_BASH test_new_group.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
     endif()
     bash_test_modules(test_fleetrun START_BASH test_fleetrun.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})

@@ -543,6 +542,7 @@ if(WITH_DISTRIBUTE)
         endif()
     endforeach(TEST_OP)
     bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+    bash_test_modules(test_new_group START_BASH test_new_group.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
     endif(NOT APPLE)
 endif()
python/paddle/fluid/tests/unittests/hybrid_communicate_group.py
new file mode 100644
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import os
import paddle
from paddle.distributed import fleet


class TestNewGroupAPI(object):
    def __init__(self):
        paddle.distributed.init_parallel_env()
        topo = fleet.CommunicateTopology(["data", "model", "pipe"], [2, 1, 1])
        self.hcg = fleet.HybridCommunicateGroup(topo)

        d1 = np.array([1, 2, 3])
        d2 = np.array([2, 3, 4])
        self.tensor1 = paddle.to_tensor(d1)
        self.tensor2 = paddle.to_tensor(d2)

    def test_all(self):
        topo = self.hcg.topology()
        global_rank = self.hcg.get_data_parallel_rank()

        dp_rank = self.hcg.get_data_parallel_rank()
        dp_gp = self.hcg.get_data_parallel_group()
        dp_world_size = self.hcg.get_data_parallel_world_size()
        dp_src_rank = self.hcg.get_data_parallel_group_src_rank()
        np.testing.assert_array_equal(dp_world_size, 2)
        np.testing.assert_array_equal(dp_src_rank, 0)

        mp_rank = self.hcg.get_model_parallel_rank()
        mp_gp = self.hcg.get_model_parallel_group()
        mp_world_size = self.hcg.get_model_parallel_world_size()
        mp_src_rank = self.hcg.get_model_parallel_group_src_rank()
        np.testing.assert_array_equal(mp_world_size, 1)

        tmp = np.array([0, 0, 0])
        result = paddle.to_tensor(tmp)
        paddle.distributed.scatter(
            result, [self.tensor2, self.tensor1],
            src=dp_src_rank,
            group=dp_gp,
            use_calc_stream=True)
        if dp_rank == 0:
            assert np.array_equal(result, self.tensor2)
        elif dp_rank == 1:
            assert np.array_equal(result, self.tensor1)
        print("test scatter api ok")

        paddle.distributed.broadcast(
            result, src=1, group=dp_gp, use_calc_stream=True)
        assert np.array_equal(result, self.tensor1)
        print("test broadcast api ok")

        paddle.distributed.reduce(
            result, dst=dp_src_rank, group=dp_gp, use_calc_stream=True)
        if dp_rank == 0:
            assert np.array_equal(result,
                                  paddle.add(self.tensor1, self.tensor1))
        elif dp_rank == 1:
            assert np.array_equal(result, self.tensor1)
        print("test reduce api ok")

        paddle.distributed.all_reduce(result, use_calc_stream=True)
        assert np.array_equal(
            result,
            paddle.add(paddle.add(self.tensor1, self.tensor1), self.tensor1))
        print("test all_reduce api ok")

        paddle.distributed.wait(result, dp_gp, use_calc_stream=True)
        paddle.distributed.wait(result, dp_gp, use_calc_stream=False)
        print("test wait api ok")

        result = []
        paddle.distributed.all_gather(
            result, self.tensor1, group=dp_gp, use_calc_stream=True)
        assert np.array_equal(result[0], self.tensor1)
        assert np.array_equal(result[1], self.tensor1)
        print("test all_gather api ok")

        paddle.distributed.barrier(group=dp_gp)
        print("test barrier api ok")

        return


if __name__ == "__main__":
    gpt = TestNewGroupAPI()
    gpt.test_all()
python/paddle/fluid/tests/unittests/test_hybrid_parallel_topology.py
new file mode 100644
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import paddle
import paddle.nn as nn
import unittest
from paddle.distributed import fleet
import numpy as np


class TestCommunicateTopology(unittest.TestCase):
    def test_topology(self):
        topo = fleet.CommunicateTopology(["dp", "mp", "pp"], [2, 2, 2])

        # test get_comm_list
        dp_comm_list = [[0, 4], [1, 5], [2, 6], [3, 7]]
        mp_comm_list = [[0, 2], [1, 3], [4, 6], [5, 7]]
        pp_comm_list = [[0, 1], [2, 3], [4, 5], [6, 7]]

        np.testing.assert_array_equal(dp_comm_list, topo.get_comm_list("dp"))
        np.testing.assert_array_equal(mp_comm_list, topo.get_comm_list("mp"))
        np.testing.assert_array_equal(pp_comm_list, topo.get_comm_list("pp"))

        # test get_hybrid_group_names
        parallel_names = ["dp", "mp", "pp"]
        np.testing.assert_array_equal(parallel_names,
                                      topo.get_hybrid_group_names())

        # test get_dims
        np.testing.assert_array_equal(2, topo.get_dim("dp"))
        np.testing.assert_array_equal(2, topo.get_dim("mp"))
        np.testing.assert_array_equal(2, topo.get_dim("pp"))

        # test world size
        self.assertEqual(topo.world_size(), 8)

        # test get_rank
        self.assertEqual(topo.get_rank(dp=0, mp=0, pp=0), 0)
        self.assertEqual(topo.get_rank(dp=0, mp=0, pp=1), 1)
        self.assertEqual(topo.get_rank(dp=0, mp=1, pp=0), 2)
        self.assertEqual(topo.get_rank(dp=0, mp=1, pp=1), 3)
        self.assertEqual(topo.get_rank(dp=1, mp=0, pp=0), 4)
        self.assertEqual(topo.get_rank(dp=1, mp=0, pp=1), 5)
        self.assertEqual(topo.get_rank(dp=1, mp=1, pp=0), 6)
        self.assertEqual(topo.get_rank(dp=1, mp=1, pp=1), 7)

        # test get_coord
        self.assertEqual(topo.get_coord(0), topo.coordinate(0, 0, 0))
        self.assertEqual(topo.get_coord(1), topo.coordinate(0, 0, 1))
        self.assertEqual(topo.get_coord(2), topo.coordinate(0, 1, 0))
        self.assertEqual(topo.get_coord(3), topo.coordinate(0, 1, 1))
        self.assertEqual(topo.get_coord(4), topo.coordinate(1, 0, 0))
        self.assertEqual(topo.get_coord(5), topo.coordinate(1, 0, 1))
        self.assertEqual(topo.get_coord(6), topo.coordinate(1, 1, 0))
        self.assertEqual(topo.get_coord(7), topo.coordinate(1, 1, 1))

        # test get_axis_list
        self.assertEqual(topo.get_axis_list("dp", 0), [0, 1, 2, 3])
        self.assertEqual(topo.get_axis_list("dp", 1), [4, 5, 6, 7])
        self.assertEqual(topo.get_axis_list("mp", 0), [0, 1, 4, 5])
        self.assertEqual(topo.get_axis_list("mp", 1), [2, 3, 6, 7])
        self.assertEqual(topo.get_axis_list("pp", 0), [0, 2, 4, 6])
        self.assertEqual(topo.get_axis_list("pp", 1), [1, 3, 5, 7])

        # test get_dim_size
        self.assertEqual(topo.get_dim_size("dp"), 2)
        self.assertEqual(topo.get_dim_size("mp"), 2)
        self.assertEqual(topo.get_dim_size("pp"), 2)


if __name__ == '__main__':
    unittest.main()
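As an aside (illustration only, not part of the test file): the expected comm lists above follow from the row-major enumeration, where coordinate (dp, mp, pp) with dims [2, 2, 2] maps to rank dp*4 + mp*2 + pp; for example, the first data-parallel group fixes mp = pp = 0 and varies dp:

    assert [dp * 4 + 0 * 2 + 0 for dp in range(2)] == [0, 4]  # dp_comm_list[0]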
python/paddle/fluid/tests/unittests/test_new_group.sh
@@ -17,3 +17,4 @@
 set -e
 CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch --gpus=0,1 new_group.py
+CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch --gpus=0,1 hybrid_communicate_group.py