Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
605552a9
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
605552a9
编写于
4月 15, 2022
作者:
C
caozhou
提交者:
GitHub
4月 15, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Auto Parallel]update cluster (#41722)
* update cluster
上级
42abcc08
变更
3
显示空白变更内容
内联
并排
Showing
3 changed file
with
2271 addition
and
1 deletion
+2271
-1
python/paddle/distributed/auto_parallel/cluster.py
python/paddle/distributed/auto_parallel/cluster.py
+248
-1
python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
...paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
+1
-0
python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py
...addle/fluid/tests/unittests/auto_parallel/test_cluster.py
+2022
-0
未找到文件。
python/paddle/distributed/auto_parallel/cluster.py
浏览文件 @
605552a9
#
Copyright (c) 2021
PaddlePaddle Authors. All Rights Reserved.
#
Copyright (c) 2022
PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# you may not use this file except in compliance with the License.
...
@@ -43,6 +43,8 @@ class LinkType(IntEnum):
...
@@ -43,6 +43,8 @@ class LinkType(IntEnum):
class
Device
:
class
Device
:
NON_ACCELERATOR_TYPE
=
[
DeviceType
.
CPU
,
DeviceType
.
NIC
,
DeviceType
.
UNKNOWN
]
def
__init__
(
self
,
global_id
,
local_id
,
machine
):
def
__init__
(
self
,
global_id
,
local_id
,
machine
):
self
.
_global_id
=
global_id
self
.
_global_id
=
global_id
self
.
_local_id
=
local_id
self
.
_local_id
=
local_id
...
@@ -134,6 +136,10 @@ class Device:
...
@@ -134,6 +136,10 @@ class Device:
class
Link
:
class
Link
:
default_hop
=
1
default_nic_bandwith
=
24
def
__init__
(
self
,
source
,
target
):
def
__init__
(
self
,
source
,
target
):
self
.
_src
=
source
self
.
_src
=
source
self
.
_tgt
=
target
self
.
_tgt
=
target
...
@@ -142,6 +148,7 @@ class Link:
...
@@ -142,6 +148,7 @@ class Link:
self
.
_bandwidth
=
None
self
.
_bandwidth
=
None
# latency is stored by millisecond
# latency is stored by millisecond
self
.
_latency
=
None
self
.
_latency
=
None
self
.
_hop
=
None
@
property
@
property
def
source
(
self
):
def
source
(
self
):
...
@@ -183,6 +190,14 @@ class Link:
...
@@ -183,6 +190,14 @@ class Link:
def
latency
(
self
,
value
):
def
latency
(
self
,
value
):
self
.
_latency
=
value
self
.
_latency
=
value
@property
def hop(self):
    """Return the hop count recorded for this link.

    The value is whatever was last assigned through the setter; it is
    ``None`` until then.
    """
    return self._hop


@hop.setter
def hop(self, value):
    """Record ``value`` as this link's hop count (no validation is done)."""
    self._hop = value
def
__str__
(
self
):
def
__str__
(
self
):
str
=
""
str
=
""
str
+=
"source_global_id: {}, target_global_id: {}, type: {}, bandwidth: {}, latency: {}"
.
format
(
str
+=
"source_global_id: {}, target_global_id: {}, type: {}, bandwidth: {}, latency: {}"
.
format
(
...
@@ -202,6 +217,8 @@ class Machine:
...
@@ -202,6 +217,8 @@ class Machine:
self
.
_port
=
None
self
.
_port
=
None
self
.
_devices
=
{}
self
.
_devices
=
{}
self
.
_links
=
{}
self
.
_links
=
{}
self
.
_accelerators
=
{}
self
.
_non_accelerator_cumulative_count
=
0
@
property
@
property
def
id
(
self
):
def
id
(
self
):
...
@@ -243,14 +260,23 @@ class Machine:
...
@@ -243,14 +260,23 @@ class Machine:
def
links
(
self
):
def
links
(
self
):
return
self
.
_links
return
self
.
_links
@property
def accelerators(self):
    """Return the mapping of accelerator devices held by this machine.

    Keys are device global ids; values are the device objects.
    """
    return self._accelerators
def
add_device
(
self
,
device
):
def
add_device
(
self
,
device
):
# Use the device global_id as the key
# Use the device global_id as the key
self
.
_devices
[
device
.
global_id
]
=
device
self
.
_devices
[
device
.
global_id
]
=
device
if
device
.
type
not
in
Device
.
NON_ACCELERATOR_TYPE
:
self
.
_accelerators
[
device
.
global_id
]
=
device
def
add_link
(
self
,
link
):
def
add_link
(
self
,
link
):
# Use the source device global_id and target device global_id as the key
# Use the source device global_id and target device global_id as the key
self
.
_links
[(
link
.
source
.
global_id
,
link
.
target
.
global_id
)]
=
link
self
.
_links
[(
link
.
source
.
global_id
,
link
.
target
.
global_id
)]
=
link
def get_link(self, source_global_id, target_global_id):
    """Look up the directed link between two devices of this machine.

    Args:
        source_global_id: global id of the link's source device.
        target_global_id: global id of the link's target device.

    Returns:
        The stored link for ``(source_global_id, target_global_id)``, or
        ``None`` when no such link has been registered.
    """
    key = (source_global_id, target_global_id)
    return self._links.get(key)
def
__str__
(
self
):
def
__str__
(
self
):
str
=
""
str
=
""
for
device
in
self
.
devices
.
values
():
for
device
in
self
.
devices
.
values
():
...
@@ -263,6 +289,109 @@ class Machine:
...
@@ -263,6 +289,109 @@ class Machine:
return
self
.
__str__
()
return
self
.
__str__
()
class AlphaLatency:
    """Latency settings for the communication cost model, parsed from a dict.

    The input dict may contain the optional keys ``"base"``, ``"inter"`` and
    ``"intra"`` (each a dict with optional ``"ring"`` / ``"tree"`` entries)
    and ``"switch"``.  Every entry that is missing is exposed as ``None``.

    * ``switch`` and the ``base`` entries are converted with ``float``.
    * ``inter`` entries are either the link-type name ``"NET"`` (stored as
      the matching ``LinkType`` member) or a number converted with ``float``.
    * ``intra`` entries are either ``"NVL"`` / ``"PHB"`` (stored as the
      matching ``LinkType`` member) or a number converted with ``float``.

    Raises:
        AssertionError: if ``alpha_latency`` is not a dict, or a string
            entry is not one of the accepted link-type names.
        TypeError: if a numeric entry cannot be converted to ``float``.
    """

    def __init__(self, alpha_latency):
        assert isinstance(alpha_latency, dict)
        self._base = alpha_latency.get("base", None)
        self._inter = alpha_latency.get("inter", None)
        self._intra = alpha_latency.get("intra", None)
        self._switch = self._to_float(
            alpha_latency.get("switch", None),
            "The switch latency must be float",
        )

        # Any of the three sections may be absent; treat an absent section as
        # empty so that every derived entry simply becomes None.
        base = self._base if self._base is not None else {}
        inter = self._inter if self._inter is not None else {}
        intra = self._intra if self._intra is not None else {}

        self._base_ring = self._to_float(
            base.get("ring", None), "The base ring latency must be float."
        )
        self._base_tree = self._to_float(
            base.get("tree", None), "The base tree latency must be float."
        )
        # Kept as given: this entry is neither validated nor exposed.
        self._base_inter = base.get("inter", None)

        self._inter_ring = self._to_link_type_or_float(
            inter.get("ring", None),
            ["NET"],
            "The inter ring latency must be float.",
        )
        self._inter_tree = self._to_link_type_or_float(
            inter.get("tree", None),
            ["NET"],
            "The inter tree latency must be float.",
        )
        self._intra_ring = self._to_link_type_or_float(
            intra.get("ring", None),
            ["NVL", "PHB"],
            "The intra ring latency must be float.",
        )
        self._intra_tree = self._to_link_type_or_float(
            intra.get("tree", None),
            ["NVL", "PHB"],
            "The intra tree latency must be float.",
        )

    @staticmethod
    def _to_float(value, message):
        """Return ``float(value)``, ``None`` for ``None``, else raise TypeError(message)."""
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError, OverflowError) as err:
            raise TypeError(message) from err

    @staticmethod
    def _to_link_type_or_float(value, allowed_names, message):
        """Map an allowed link-type name to ``LinkType``; otherwise convert to float."""
        if value is None:
            return None
        if isinstance(value, str):
            # Strings are only accepted as link-type names, never as numbers.
            assert value in allowed_names
            return LinkType[value]
        return AlphaLatency._to_float(value, message)

    @property
    def base_ring(self):
        """The ``base["ring"]`` entry as a float, or ``None`` if absent."""
        return self._base_ring

    @property
    def base_tree(self):
        """The ``base["tree"]`` entry as a float, or ``None`` if absent."""
        return self._base_tree

    @property
    def switch(self):
        """The ``switch`` entry as a float, or ``None`` if absent."""
        return self._switch

    @property
    def inter_ring(self):
        """The ``inter["ring"]`` entry: a ``LinkType``, a float, or ``None``."""
        return self._inter_ring

    @property
    def inter_tree(self):
        """The ``inter["tree"]`` entry: a ``LinkType``, a float, or ``None``."""
        return self._inter_tree

    @property
    def intra_ring(self):
        """The ``intra["ring"]`` entry: a ``LinkType``, a float, or ``None``."""
        return self._intra_ring

    @property
    def intra_tree(self):
        """The ``intra["tree"]`` entry: a ``LinkType``, a float, or ``None``."""
        return self._intra_tree
class
Cluster
:
class
Cluster
:
"""
"""
The cluster is an abstract of the hardware resource for training, which contains the cluster topology and
The cluster is an abstract of the hardware resource for training, which contains the cluster topology and
...
@@ -276,6 +405,18 @@ class Cluster:
...
@@ -276,6 +405,18 @@ class Cluster:
self
.
_machines
=
{}
self
.
_machines
=
{}
# Cluster graph topology
# Cluster graph topology
self
.
_topology
=
None
self
.
_topology
=
None
# Latency for communication cost model
self
.
_alpha_latency
=
None
self
.
_rank_to_device_id
=
{}
self
.
_device_id_to_rank
=
{}
@property
def rank_to_device_id(self):
    """Return the dict that maps a rank id to an accelerator's global device id."""
    return self._rank_to_device_id
@property
def device_id_to_rank(self):
    """Return the dict that maps an accelerator's global device id to its rank id."""
    return self._device_id_to_rank
@
property
@
property
def
machines
(
self
):
def
machines
(
self
):
...
@@ -285,6 +426,35 @@ class Cluster:
...
@@ -285,6 +426,35 @@ class Cluster:
assert
isinstance
(
machine
,
Machine
)
assert
isinstance
(
machine
,
Machine
)
self
.
_machines
[
machine
.
id
]
=
machine
self
.
_machines
[
machine
.
id
]
=
machine
# map rank to device id and map device id to rank
if
machine
.
id
!=
0
:
prev_machine
=
self
.
_machines
[
machine
.
id
-
1
]
offset
=
prev_machine
.
_non_accelerator_cumulative_count
for
global_id
in
machine
.
devices
:
if
machine
.
devices
[
global_id
].
type
not
in
Device
.
NON_ACCELERATOR_TYPE
:
rank_id
=
global_id
-
offset
self
.
_rank_to_device_id
[
rank_id
]
=
global_id
self
.
_device_id_to_rank
[
global_id
]
=
rank_id
machine
.
_non_accelerator_cumulative_count
=
len
(
machine
.
devices
)
-
len
(
machine
.
accelerators
)
+
prev_machine
.
_non_accelerator_cumulative_count
else
:
for
global_id
in
machine
.
devices
:
if
machine
.
devices
[
global_id
].
type
not
in
Device
.
NON_ACCELERATOR_TYPE
:
rank_id
=
global_id
self
.
_rank_to_device_id
[
rank_id
]
=
global_id
self
.
_device_id_to_rank
[
global_id
]
=
rank_id
machine
.
accelerators
[
global_id
]
=
machine
.
devices
[
global_id
]
machine
.
_non_accelerator_cumulative_count
=
len
(
machine
.
devices
)
-
len
(
machine
.
accelerators
)
@property
def alpha_latency(self):
    """Return the cluster's latency settings for the communication cost model.

    This is ``None`` when no latency settings have been set.
    """
    return self._alpha_latency
def
add_device
(
self
,
device
):
def
add_device
(
self
,
device
):
assert
isinstance
(
device
,
Device
)
assert
isinstance
(
device
,
Device
)
device
.
machine
.
add_device
(
device
)
device
.
machine
.
add_device
(
device
)
...
@@ -344,8 +514,23 @@ class Cluster:
...
@@ -344,8 +514,23 @@ class Cluster:
link
.
type
=
link_type
link
.
type
=
link_type
link
.
bandwidth
=
float
(
link_info
.
get
(
"bandwidth"
,
0
))
link
.
bandwidth
=
float
(
link_info
.
get
(
"bandwidth"
,
0
))
link
.
latency
=
float
(
link_info
.
get
(
"latency"
,
0
))
link
.
latency
=
float
(
link_info
.
get
(
"latency"
,
0
))
link
.
hop
=
link_info
.
get
(
"hop"
,
None
)
if
link
.
hop
is
None
:
# Default hop when the link info omits it: 0 if source and target are on the same machine, otherwise Link.default_hop.
source_machine
=
source
.
machine
target_machine
=
target
.
machine
if
source_machine
.
id
==
target_machine
.
id
:
link
.
hop
=
0
else
:
link
.
hop
=
Link
.
default_hop
self
.
add_link
(
link
)
self
.
add_link
(
link
)
# Parse the optional latency section of the cluster description.
if "alpha_latency" in cluster_info:
    self._alpha_latency = AlphaLatency(cluster_info.get("alpha_latency"))
else:
    # No latency section: reset the attribute that the ``alpha_latency``
    # property reads (the previous spelling ``_alpha_latecy`` created an
    # unrelated attribute and left any earlier value in place).
    self._alpha_latency = None
def
_generate_machine_id
(
self
):
def
_generate_machine_id
(
self
):
cur_machine_id
=
self
.
_num_machines
cur_machine_id
=
self
.
_num_machines
self
.
_num_machines
+=
1
self
.
_num_machines
+=
1
...
@@ -359,6 +544,68 @@ class Cluster:
...
@@ -359,6 +544,68 @@ class Cluster:
devices
.
append
(
device
)
devices
.
append
(
device
)
return
devices
return
devices
def get_beta(self, source_device_id, target_device_id):
    """Return beta, the time needed to transfer one byte, in us/B.

    The link is looked up on the machine that owns the source device.
    When the two devices are not directly linked, the default NIC
    bandwidth (``Link.default_nic_bandwith``) is used instead.

    Args:
        source_device_id: global id of the sending device.
        target_device_id: global id of the receiving device.

    Returns:
        ``0`` when the bandwidth is zero, otherwise
        ``1 / (bandwidth * 1000)``.
    """
    convert_base = 1000
    machine = self.get_device(source_device_id).machine
    link = machine.get_link(source_device_id, target_device_id)
    # None means the source and target are not connected directly; assume
    # the default NIC bandwidth in that case.
    bandwidth = Link.default_nic_bandwith if link is None else link.bandwidth
    if bandwidth == 0.0:
        # NOTE(review): a zero bandwidth yields a zero transfer cost rather
        # than an error — confirm this is what the cost model expects.
        return 0
    # presumably bandwidth is in GB/s, so this factor turns it into B/us —
    # TODO confirm the unit against the cluster description format.
    return 1 / (bandwidth * (convert_base**3 / 10**6))
def get_hop(self, source_device_id, target_device_id):
    """Return the hop count between two devices.

    The link is looked up on the machine that owns the source device.

    Args:
        source_device_id: global id of the source device.
        target_device_id: global id of the target device.

    Returns:
        The ``hop`` of the direct link when one exists, otherwise
        ``Link.default_hop``.
    """
    machine = self.get_device(source_device_id).machine
    link = machine.get_link(source_device_id, target_device_id)
    if link is None:
        # Not directly connected: fall back to the class-level default.
        return Link.default_hop
    return link.hop
def cross_machine(self, device_ids):
    """Tell whether the given devices are not all on one single machine.

    Args:
        device_ids: iterable of global device ids.

    Returns:
        ``False`` when every device belongs to exactly one machine,
        ``True`` otherwise.  An empty ``device_ids`` therefore also yields
        ``True``, because the number of involved machines is not 1.
    """
    machine_ids = {
        self.get_device(device_id).machine.id for device_id in device_ids
    }
    return len(machine_ids) != 1
def convert_rank_to_device_id(self, group_ranks):
    """Translate rank ids into global device ids.

    Args:
        group_ranks: iterable of rank ids (the global rank ids used by
            paddle).  By default a task uses every machine of the cluster
            that has accelerators.

    Returns:
        A list of global device ids, in the same order as ``group_ranks``.

    Raises:
        KeyError: if a rank has no entry in ``rank_to_device_id``.
    """
    return [self.rank_to_device_id[rank] for rank in group_ranks]
def get_involved_machine_count(self, device_ids):
    """Count the distinct machines that own the given devices.

    Args:
        device_ids: iterable of global device ids.

    Returns:
        The number of distinct machine ids among the devices.

    Raises:
        AssertionError: if no machine is involved (``device_ids`` is empty).
    """
    machine_ids = {
        self.get_device(device_id).machine.id for device_id in device_ids
    }
    count = len(machine_ids)
    assert count > 0
    return count
def
__str__
(
self
):
def
__str__
(
self
):
str
=
""
str
=
""
for
machine
in
self
.
machines
.
values
():
for
machine
in
self
.
machines
.
values
():
...
...
python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
浏览文件 @
605552a9
...
@@ -18,4 +18,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
...
@@ -18,4 +18,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
py_test_modules
(
test_recorder MODULES test_recorder ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_recorder MODULES test_recorder ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_trial MODULES test_trial ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_trial MODULES test_trial ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_new_cost_model MODULES test_new_cost_model ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_new_cost_model MODULES test_new_cost_model ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_cluster MODULES test_cluster ENVS
${
dist_ENVS
}
)
endif
()
endif
()
python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py
0 → 100644
浏览文件 @
605552a9
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
os
import
json
import
paddle
from
paddle.distributed.auto_parallel.cluster
import
Cluster
cluster_json
=
"""
{
"alpha_latency": {"inter": {"ring": "NET", "tree": "NET"},
"intra": {"ring": "NVL", "tree": "PHB"},
"base": {"ring": 8.4, "tree": 0},
"switch": 10.0},
"machines": [
{
"hostname": "yq01-sys-hic-v100-box-a225-0266",
"addr": "10.127.9.147",
"port": "60009",
"devices": [
{
"global_id": 0,
"local_id": 0,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 1,
"local_id": 1,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 2,
"local_id": 2,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 3,
"local_id": 3,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 4,
"local_id": 4,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 5,
"local_id": 5,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 6,
"local_id": 6,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 7,
"local_id": 7,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 8,
"local_id": 0,
"type": "CPU",
"arch": "x86_64",
"vendor": "GenuineIntel",
"model": "Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GH",
"memory": "502",
"sp_gflops": "150",
"dp_gflops": "75"
},
{
"global_id": 9,
"local_id": 0,
"type": "NIC",
"width": 12.5,
"ip": "10.127.9.147"
}
],
"links": [
{
"source_global_id": 0,
"target_global_id": 1,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 2,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 3,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 4,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 5,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 6,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 7,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 0,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 1,
"target_global_id": 0,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 2,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 3,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 4,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 5,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 6,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 7,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 1,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 2,
"target_global_id": 0,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 1,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 3,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 4,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 5,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 6,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 7,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 2,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 3,
"target_global_id": 0,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 1,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 2,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 4,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 5,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 6,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 7,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 3,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 4,
"target_global_id": 0,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 1,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 2,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 3,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 5,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 6,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 7,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 4,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 5,
"target_global_id": 0,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 1,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 2,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 3,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 4,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 6,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 7,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 5,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 6,
"target_global_id": 0,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 1,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 2,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 3,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 4,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 5,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 7,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 6,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 7,
"target_global_id": 0,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 1,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 2,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 3,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 4,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 5,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 6,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 7,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 0,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 1,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 2,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 3,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 4,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 5,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 6,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 7,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 0,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 1,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 2,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 3,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 4,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 5,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 6,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 7,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
}
]
}
]
}
"""
multi_cluster_json
=
"""{
"machines": [
{
"hostname": "yq01-sys-hic-v100-box-a225-0266",
"addr": "10.127.9.147",
"port": "60009",
"devices": [
{
"global_id": 0,
"local_id": 0,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 1,
"local_id": 1,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 2,
"local_id": 2,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 3,
"local_id": 3,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 4,
"local_id": 4,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 5,
"local_id": 5,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 6,
"local_id": 6,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 7,
"local_id": 7,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 8,
"local_id": 0,
"type": "CPU",
"arch": "x86_64",
"vendor": "GenuineIntel",
"model": "Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GH",
"memory": "502",
"sp_gflops": "150",
"dp_gflops": "75"
},
{
"global_id": 9,
"local_id": 0,
"type": "NIC",
"width": 12.5,
"ip": "10.127.9.147"
}
],
"links": [
{
"source_global_id": 0,
"target_global_id": 1,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 2,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 3,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 4,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 5,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 6,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 7,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 0,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 1,
"target_global_id": 0,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 2,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 3,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 4,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 5,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 6,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 7,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 1,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 2,
"target_global_id": 0,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 1,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 3,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 4,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 5,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 6,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 7,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 2,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 3,
"target_global_id": 0,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 1,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 2,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 4,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 5,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 6,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 7,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 3,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 4,
"target_global_id": 0,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 1,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 2,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 3,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 5,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 6,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 7,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 4,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 5,
"target_global_id": 0,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 1,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 2,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 3,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 4,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 6,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 7,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 5,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 6,
"target_global_id": 0,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 1,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 2,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 3,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 4,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 5,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 7,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 6,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 7,
"target_global_id": 0,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 1,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 2,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 3,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 4,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 5,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 6,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 7,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 0,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 1,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 2,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 3,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 4,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 5,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 6,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 7,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 0,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 1,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 2,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 3,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 4,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 5,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 6,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 7,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 19,
"type": "NET",
"bandwidth": 24.0
}
]
},
{
"hostname": "yq01-sys-hic-k8s-v100-box-a225-0751",
"addr": "10.127.43.24",
"port": "60009",
"devices": [
{
"global_id": 10,
"local_id": 0,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 11,
"local_id": 1,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 12,
"local_id": 2,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 13,
"local_id": 3,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 14,
"local_id": 4,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 15,
"local_id": 5,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 16,
"local_id": 6,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 17,
"local_id": 7,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 18,
"local_id": 0,
"type": "CPU",
"arch": "x86_64",
"vendor": "GenuineIntel",
"model": "Intel(R) Xeon(R) Gold 6271C CPU @ 2.60G",
"memory": "503",
"sp_gflops": "150",
"dp_gflops": "75"
},
{
"global_id": 19,
"local_id": 0,
"type": "NIC",
"width": 12.5,
"ip": "10.127.43.24"
}
],
"links": [
{
"source_global_id": 10,
"target_global_id": 11,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 10,
"target_global_id": 12,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 10,
"target_global_id": 13,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 10,
"target_global_id": 14,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 10,
"target_global_id": 15,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 10,
"target_global_id": 16,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 10,
"target_global_id": 17,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 10,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 10,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 11,
"target_global_id": 10,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 11,
"target_global_id": 12,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 11,
"target_global_id": 13,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 11,
"target_global_id": 14,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 11,
"target_global_id": 15,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 11,
"target_global_id": 16,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 11,
"target_global_id": 17,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 11,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 11,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 12,
"target_global_id": 10,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 12,
"target_global_id": 11,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 12,
"target_global_id": 13,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 12,
"target_global_id": 14,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 12,
"target_global_id": 15,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 12,
"target_global_id": 16,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 12,
"target_global_id": 17,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 12,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 12,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 13,
"target_global_id": 10,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 13,
"target_global_id": 11,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 13,
"target_global_id": 12,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 13,
"target_global_id": 14,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 13,
"target_global_id": 15,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 13,
"target_global_id": 16,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 13,
"target_global_id": 17,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 13,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 13,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 14,
"target_global_id": 10,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 14,
"target_global_id": 11,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 14,
"target_global_id": 12,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 14,
"target_global_id": 13,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 14,
"target_global_id": 15,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 14,
"target_global_id": 16,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 14,
"target_global_id": 17,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 14,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 14,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 15,
"target_global_id": 10,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 15,
"target_global_id": 11,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 15,
"target_global_id": 12,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 15,
"target_global_id": 13,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 15,
"target_global_id": 14,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 15,
"target_global_id": 16,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 15,
"target_global_id": 17,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 15,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 15,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 16,
"target_global_id": 10,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 16,
"target_global_id": 11,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 16,
"target_global_id": 12,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 16,
"target_global_id": 13,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 16,
"target_global_id": 14,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 16,
"target_global_id": 15,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 16,
"target_global_id": 17,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 16,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 16,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 17,
"target_global_id": 10,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 17,
"target_global_id": 11,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 17,
"target_global_id": 12,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 17,
"target_global_id": 13,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 17,
"target_global_id": 14,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 17,
"target_global_id": 15,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 17,
"target_global_id": 16,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 17,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 17,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 10,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 11,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 12,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 13,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 14,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 15,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 16,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 17,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 10,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 11,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 12,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 13,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 14,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 15,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 16,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 17,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 9,
"type": "NET",
"bandwidth": 24.0
}
]
}
]
}
"""
class TestCluster(unittest.TestCase):
    """Check basic ``Cluster`` queries on clusters built from JSON text.

    Each test serializes one of the module-level JSON descriptions to a
    file next to this test module, builds a ``Cluster`` from that file and
    verifies the values returned by ``get_beta``, ``get_hop``,
    ``cross_machine``, ``convert_rank_to_device_id`` and
    ``get_involved_machine_count``.
    """

    @staticmethod
    def _remove_if_exists(path):
        """Delete ``path`` if it is present; do nothing otherwise."""
        if os.path.exists(path):
            os.remove(path)

    def _build_cluster(self, cluster_json_text):
        """Build a ``Cluster`` from a JSON description given as a string.

        Args:
            cluster_json_text: JSON document describing the cluster.

        Returns:
            The ``Cluster`` instance populated via ``build_from_file``.

        The intermediate file is removed through ``addCleanup`` so that it
        is deleted even when the calling test fails or raises.
        """
        file_dir = os.path.dirname(os.path.abspath(__file__))
        cluster_json_path = os.path.join(file_dir,
                                         "auto_parallel_cluster.json")
        # Register the cleanup before touching the file system so that a
        # failure while writing or parsing cannot leave the file behind.
        self.addCleanup(self._remove_if_exists, cluster_json_path)

        cluster_json_object = json.loads(cluster_json_text)
        with open(cluster_json_path, "w") as cluster_json_file:
            json.dump(cluster_json_object, cluster_json_file)

        cluster = Cluster()
        cluster.build_from_file(cluster_json_path)
        return cluster

    def test_single_machine(self):
        """Query a cluster built from ``cluster_json``."""
        cluster = self._build_cluster(cluster_json)

        beta = cluster.get_beta(0, 1)
        hop = cluster.get_hop(0, 1)
        cross_machine = cluster.cross_machine([0, 1])
        devices = cluster.convert_rank_to_device_id([0, 1, 2, 3])
        involved_machine_count = cluster.get_involved_machine_count(devices)

        self.assertGreater(beta, 0)
        self.assertEqual(hop, 0)
        self.assertFalse(cross_machine)
        self.assertEqual(devices, [0, 1, 2, 3])
        self.assertEqual(involved_machine_count, 1)

    def test_multi_machine(self):
        """Query a cluster built from ``multi_cluster_json``."""
        cluster = self._build_cluster(multi_cluster_json)

        # Device 11 is listed under the second machine of the JSON data;
        # device 0 is presumably on the first one (its device list is
        # defined earlier in the file) -- the assertions below expect the
        # pair to be reported as cross-machine.
        beta = cluster.get_beta(0, 11)
        hop = cluster.get_hop(0, 11)
        cross_machine = cluster.cross_machine([0, 11])
        devices = cluster.convert_rank_to_device_id([5, 6, 7, 8])
        involved_machine_count = cluster.get_involved_machine_count(devices)

        self.assertGreater(beta, 0)
        self.assertGreaterEqual(hop, 0)
        self.assertTrue(cross_machine)
        # Rank 8 is expected to map to global device id 10 rather than 8.
        # NOTE(review): presumably because ids 8 and 9 are the first
        # machine's CPU/NIC and are skipped -- confirm against Cluster.
        self.assertEqual(devices, [5, 6, 7, 10])
        self.assertEqual(involved_machine_count, 2)
# Run every test case defined in this module when it is executed directly
# as a script rather than imported.
if __name__ == "__main__":
    unittest.main()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录