机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit 961d6cce (unverified)
Authored on Jul 13, 2022 by caozhou; committed by GitHub on Jul 13, 2022
[Auto Parallel]Generate default cluster (#44150)

* generate default cluster
* add unittest
Parent: 07c729aa
Showing 2 changed files with 228 additions and 5 deletions (+228 −5)

python/paddle/distributed/auto_parallel/cluster.py (+212 −5)
python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py (+16 −0)
python/paddle/distributed/auto_parallel/cluster.py
@@ -16,6 +16,7 @@ import os
 import json
 from enum import IntEnum
 from enum import unique
+import paddle
 
 
 @unique
@@ -138,7 +139,7 @@ class Device:
 class Link:
 
     default_hop = 1
-    default_nic_bandwith = 24
+    default_nic_bandwidth = 24
 
     def __init__(self, source, target):
         self._src = source
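This hunk renames the misspelled class attribute `default_nic_bandwith` to `default_nic_bandwidth`; its use site is updated in the @@ -556 hunk below. A minimal sketch of reading these class-level defaults, assuming a Paddle build containing this commit (the sketch itself is not part of the commit):

from paddle.distributed.auto_parallel.cluster import Link

# Class-level defaults defined above; the bandwidth default is consulted
# when two devices have no direct link entry (see the @@ -556 hunk below).
print(Link.default_hop)            # 1
print(Link.default_nic_bandwidth)  # 24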
@@ -411,6 +412,174 @@ class Cluster:
         self._alpha_latency = None
         self._rank_to_device_id = {}
         self._device_id_to_rank = {}
+        # This property is only valid when the cluster consists of machines
+        # that all have the same number of accelerators.
+        self._num_devices_per_machine = None
+
+    def gen_default_config_cluster(self,
+                                   gpu_model="V100",
+                                   cpu_model="6271C",
+                                   node_count=1,
+                                   device_count=1,
+                                   gpu_memory=32,
+                                   cpu_memory=503,
+                                   inter_bandwidth=24,
+                                   intra_bandwidth=235,
+                                   gpu_dp_gflops=7800,
+                                   gpu_sp_gflops=15700,
+                                   cpu_dp_gflops=75,
+                                   cpu_sp_gflops=150):
+        """Generate cluster by default config."""
+        gpu_models = ["V100", "A100", "H100", "A2", "A10", "A16", "A30", "A40"]
+        xpu_models = ["XPU"]
+        npu_models = ["NPU"]
+        dcu_models = ["DCU"]
+        all_gpu_models = gpu_models + xpu_models + npu_models + dcu_models
+        assert gpu_model in all_gpu_models
+        self._num_devices_per_machine = device_count
+
+        def _convert_to_type(gpu_model):
+            type = None
+            if gpu_model in gpu_models:
+                type = "GPU"
+            elif gpu_model in xpu_models:
+                type = "XPU"
+            elif gpu_model in npu_models:
+                type = "NPU"
+            elif gpu_model in dcu_models:
+                type = "DCU"
+            assert type is not None
+
+            return type
+
+        def _convert_to_model(gpu_model, gpu_memory):
+            model = None
+            if gpu_model == "V100":
+                model = "Tesla V100-SXM2-" + str(gpu_memory) + "GB"
+            assert model is not None
+
+            return model
+
+        def _convert_to_cpu_info(cpu_model):
+            arch, vendor, model = None, None, None
+            if cpu_model == "6271C":
+                arch = "x86_64"
+                vendor = "GenuineIntel"
+                model = "Intel(R) Xeon(R) Gold 6271C CPU @ 2.60G"
+            elif cpu_model == "6148":
+                arch = "x86_64"
+                vendor = "GenuineIntel"
+                model = "Intel(R) Xeon(R) Gold 6148 CPU @ 2.40G"
+            assert arch is not None
+            assert vendor is not None
+            assert model is not None
+
+            return arch, vendor, model
+
+        cluster_info = {}
+        cluster_info["machines"] = []
+        global_id = 0
+        global_id_to_device_type = {}
+        global_id_to_node = {}
+        # NOTE: NPU, XPU and DCU models will be supported in the future; for now these are placeholder values.
+        for i in range(node_count):
+            machine = {}
+            # NOTE: The hostname is host_0, host_1, ...
+            machine["hostname"] = "host_" + str(i)
+            # NOTE: The addr is localhost; if an actual addr is needed, reset it manually.
+            machine["addr"] = "127.0.0.1"
+            # NOTE: The port is a default value.
+            machine["port"] = 60009
+            machine["links"] = []
+
+            devices = []
+            local_id = 0
+
+            for j in range(device_count):
+                device = {}
+                global_id = global_id if i == 0 and j == 0 else global_id + 1
+                local_id += 1
+                type = _convert_to_type(gpu_model)
+                model = _convert_to_model(gpu_model, gpu_memory)
+                dp_gflops = gpu_dp_gflops
+                sp_gflops = gpu_dp_gflops
+                memory = gpu_memory
+
+                device["global_id"] = global_id
+                device["local_id"] = local_id
+                device["type"] = type
+                device["model"] = model
+                device["memory"] = memory
+                device["sp_gflops"] = sp_gflops
+                device["dp_gflops"] = dp_gflops
+                global_id_to_device_type[global_id] = type
+                global_id_to_node[global_id] = i
+                devices.append(device)
+
+            # add cpu device and nic device, just one cpu
+            cpu_device = {}
+            arch, vendor, model = _convert_to_cpu_info(cpu_model)
+            sp_gflops = cpu_sp_gflops
+            dp_gflops = cpu_dp_gflops
+            global_id += 1
+            local_id = 0
+            memory = cpu_memory
+            type = "CPU"
+            cpu_device["arch"] = arch
+            cpu_device["vendor"] = vendor
+            cpu_device["model"] = model
+            cpu_device["sp_gflops"] = sp_gflops
+            cpu_device["dp_gflops"] = dp_gflops
+            cpu_device["global_id"] = global_id
+            cpu_device["local_id"] = local_id
+            cpu_device["memory"] = memory
+            cpu_device["type"] = type
+            global_id_to_node[global_id] = i
+            global_id_to_device_type[global_id] = type
+            devices.append(cpu_device)
+
+            nic_device = {}
+            global_id += 1
+            # add NIC
+            type = "NIC"
+            width = 12.5
+            ip = "127.0.0.1"
+            local_id = 0
+            nic_device["type"] = type
+            nic_device["local_id"] = type
+            nic_device["global_id"] = global_id
+            global_id_to_device_type[global_id] = type
+            global_id_to_node[global_id] = i
+            devices.append(nic_device)
+            machine["devices"] = devices
+            cluster_info["machines"].append(machine)
+
+        # build link
+        for i in range(0, global_id + 1):
+            for j in range(0, global_id + 1):
+                if i == j:
+                    continue
+                node_id_i = global_id_to_node[i]
+                node_id_j = global_id_to_node[j]
+                device_type_i = global_id_to_device_type[i]
+                device_type_j = global_id_to_device_type[j]
+                link = {}
+                source_global_id = i
+                target_global_id = j
+                link["source_global_id"] = source_global_id
+                link["target_global_id"] = target_global_id
+                # same node and same device_type: set intra_bandwidth, NVL
+                if node_id_i == node_id_j and device_type_i == device_type_j:
+                    link["type"] = "NVL"
+                    link["bandwidth"] = intra_bandwidth
+                else:
+                    link["type"] = "PHB"
+                    link["bandwidth"] = inter_bandwidth
+                cluster_info["machines"][node_id_i]["links"].append(link)
+
+        self._build_from_dict(cluster_info)
+
     @property
     def rank_to_device_id(self):
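The generator above emits, per machine, `device_count` accelerators plus one CPU and one NIC device (so each machine contributes device_count + 2 global device ids), then connects every device pair: NVL links with `intra_bandwidth` for same-node, same-type pairs, PHB links with `inter_bandwidth` otherwise. A minimal usage sketch, not part of the commit, assuming a Paddle build containing this change:

from paddle.distributed.auto_parallel.cluster import Cluster

cluster = Cluster()
# Two machines with eight V100s each; other arguments keep their defaults.
cluster.gen_default_config_cluster(gpu_model="V100",
                                   node_count=2,
                                   device_count=8)
print(cluster.get_num_machines())             # 2
print(cluster.get_num_devices_per_machine())  # 8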
@@ -473,9 +642,7 @@ class Cluster:
             device = machine.devices[device_global_id]
         return device
 
-    def build_from_file(self, json_file_path):
-        with open(json_file_path) as json_file:
-            cluster_info = json.load(json_file)
+    def _build_from_dict(self, cluster_info):
         machines_info = cluster_info["machines"]
         for machine_info in machines_info:
             machine_id = self._generate_machine_id()
@@ -533,6 +700,11 @@ class Cluster:
         else:
             self._alpha_latecy = None
 
+    def build_from_file(self, json_file_path):
+        with open(json_file_path) as json_file:
+            cluster_info = json.load(json_file)
+        self._build_from_dict(cluster_info)
+
     def _generate_machine_id(self):
         cur_machine_id = self._num_machines
         self._num_machines += 1
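Together, these two hunks split the old `build_from_file` into a JSON-parsing wrapper and a dict-consuming `_build_from_dict`, which is what lets `gen_default_config_cluster` reuse the construction path without touching disk. A sketch of the two entry points after the refactor (the file path is hypothetical):

import json

from paddle.distributed.auto_parallel.cluster import Cluster

# Path 1: parse a JSON topology file, then build (public API).
cluster_a = Cluster()
cluster_a.build_from_file("/tmp/cluster.json")  # hypothetical path

# Path 2: build directly from an in-memory dict, as the default-config
# generator does internally.
with open("/tmp/cluster.json") as f:            # same hypothetical file
    info = json.load(f)
cluster_b = Cluster()
cluster_b._build_from_dict(info)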
@@ -556,7 +728,7 @@ class Cluster:
 
         bandwidth = None
         # None means the source and target are not connected directly; use the NIC default
         if link is None:
-            bandwidth = Link.default_nic_bandwith
+            bandwidth = Link.default_nic_bandwidth
         else:
             bandwidth = link.bandwidth
@@ -608,6 +780,15 @@ class Cluster:
             assert count > 0
         return count
 
+    def get_num_machines(self):
+        return len(self._machines)
+
+    def get_num_devices_per_machine(self):
+        # Only returns the number of accelerators per machine.
+        # All machines must have the same number and type of devices.
+        assert self._num_devices_per_machine
+        return self._num_devices_per_machine
+
     def __str__(self):
         str = ""
         for machine in self.machines.values():
@@ -616,3 +797,29 @@ class Cluster:
 
     def __repr__(self):
         return self.__str__()
+
+
+def get_default_cluster():
+    cluster = Cluster()
+    local_device_count = os.getenv("PADDLE_LOCAL_SIZE")
+    if local_device_count is None:
+        local_device_count = 1
+    else:
+        local_device_count = int(local_device_count)
+    global_device_count = os.getenv("PADDLE_GLOBAL_SIZE")
+    if global_device_count is None:
+        node_count = 1
+    else:
+        global_device_count = int(global_device_count)
+        assert global_device_count % local_device_count == 0
+        node_count = int(global_device_count) // local_device_count
+    print("Node Count: ",
+          node_count,
+          "Local Device Size: ",
+          local_device_count,
+          "World size: ",
+          paddle.distributed.get_world_size(),
+          flush=True)
+    cluster.gen_default_config_cluster(node_count=node_count,
+                                       device_count=local_device_count)
+    return cluster
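`get_default_cluster` derives the node count from two launcher environment variables, `PADDLE_LOCAL_SIZE` and `PADDLE_GLOBAL_SIZE`, falling back to a single node when they are unset. A sketch of that behavior, with illustrative values (not part of the commit):

import os

# Illustrative values: 16 devices overall, 8 per node -> 2 nodes.
os.environ["PADDLE_LOCAL_SIZE"] = "8"
os.environ["PADDLE_GLOBAL_SIZE"] = "16"

from paddle.distributed.auto_parallel.cluster import get_default_cluster

cluster = get_default_cluster()  # also prints node/device/world-size info
assert cluster.get_num_machines() == 2
assert cluster.get_num_devices_per_machine() == 8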
python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py
@@ -19,6 +19,7 @@ import json
 
 import paddle
 from paddle.distributed.auto_parallel.cluster import Cluster
+from paddle.distributed.auto_parallel.cluster import get_default_cluster
 
 cluster_json = """
 {
@@ -1997,6 +1998,10 @@ class TestCluster(unittest.TestCase):
         self.assertTrue(devices == [0, 1, 2, 3])
         self.assertTrue(involved_machine_count == 1)
 
+        # Remove unnecessary files
+        if os.path.exists(cluster_json_path):
+            os.remove(cluster_json_path)
+
     def test_multi_machine(self):
         # Build cluster
         cluster_json_path = os.path.join(self.temp_dir.name,
@@ -2022,6 +2027,17 @@ class TestCluster(unittest.TestCase):
         if os.path.exists(cluster_json_path):
             os.remove(cluster_json_path)
 
+    def test_default_config_cluster(self):
+        cluster = Cluster()
+        cluster.gen_default_config_cluster(device_count=8)
+        # check machines and devices
+        self.assertTrue(cluster.get_num_machines() == 1)
+        self.assertTrue(cluster.get_num_devices_per_machine() == 8)
+
+    def test_default_cluster(self):
+        cluster = get_default_cluster()
+        self.assertTrue(isinstance(cluster, Cluster))
+
 
 if __name__ == "__main__":
     unittest.main()
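The new cases exercise both the explicit generator and the env-driven default. Since the file keeps its `unittest.main()` guard, the tests can be run directly from a Paddle source checkout, e.g. (command illustrative):

python python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py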