未验证 提交 605552a9 编写于 作者: C caozhou 提交者: GitHub

[Auto Parallel]update cluster (#41722)

* update cluster
上级 42abcc08
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -43,6 +43,8 @@ class LinkType(IntEnum): ...@@ -43,6 +43,8 @@ class LinkType(IntEnum):
class Device: class Device:
NON_ACCELERATOR_TYPE = [DeviceType.CPU, DeviceType.NIC, DeviceType.UNKNOWN]
def __init__(self, global_id, local_id, machine): def __init__(self, global_id, local_id, machine):
self._global_id = global_id self._global_id = global_id
self._local_id = local_id self._local_id = local_id
...@@ -134,6 +136,10 @@ class Device: ...@@ -134,6 +136,10 @@ class Device:
class Link: class Link:
default_hop = 1
default_nic_bandwith = 24
def __init__(self, source, target): def __init__(self, source, target):
self._src = source self._src = source
self._tgt = target self._tgt = target
...@@ -142,6 +148,7 @@ class Link: ...@@ -142,6 +148,7 @@ class Link:
self._bandwidth = None self._bandwidth = None
# latency is stored by millisecond # latency is stored by millisecond
self._latency = None self._latency = None
self._hop = None
@property @property
def source(self): def source(self):
...@@ -183,6 +190,14 @@ class Link: ...@@ -183,6 +190,14 @@ class Link:
def latency(self, value): def latency(self, value):
self._latency = value self._latency = value
@property
def hop(self):
    """Hop count recorded for this link (``None`` until it is assigned)."""
    return self._hop

@hop.setter
def hop(self, value):
    """Store ``value`` as this link's hop count; no validation is done."""
    self._hop = value
def __str__(self): def __str__(self):
str = "" str = ""
str += "source_global_id: {}, target_global_id: {}, type: {}, bandwidth: {}, latency: {}".format( str += "source_global_id: {}, target_global_id: {}, type: {}, bandwidth: {}, latency: {}".format(
...@@ -202,6 +217,8 @@ class Machine: ...@@ -202,6 +217,8 @@ class Machine:
self._port = None self._port = None
self._devices = {} self._devices = {}
self._links = {} self._links = {}
self._accelerators = {}
self._non_accelerator_cumulative_count = 0
@property @property
def id(self): def id(self):
...@@ -243,14 +260,23 @@ class Machine: ...@@ -243,14 +260,23 @@ class Machine:
def links(self): def links(self):
return self._links return self._links
@property
def accelerators(self):
    """Dict of this machine's accelerator devices, keyed by global id."""
    return self._accelerators
def add_device(self, device): def add_device(self, device):
# Use the device global_id as the key # Use the device global_id as the key
self._devices[device.global_id] = device self._devices[device.global_id] = device
if device.type not in Device.NON_ACCELERATOR_TYPE:
self._accelerators[device.global_id] = device
def add_link(self, link): def add_link(self, link):
# Use the source device global_id and target device global_id as the key # Use the source device global_id and target device global_id as the key
self._links[(link.source.global_id, link.target.global_id)] = link self._links[(link.source.global_id, link.target.global_id)] = link
def get_link(self, source_global_id, target_global_id):
    """Look up the link stored for a (source, target) pair of global ids.

    Returns the stored link, or ``None`` when this machine has no link
    registered for that ordered pair.
    """
    key = (source_global_id, target_global_id)
    if key in self._links:
        return self._links[key]
    return None
def __str__(self): def __str__(self):
str = "" str = ""
for device in self.devices.values(): for device in self.devices.values():
...@@ -263,6 +289,109 @@ class Machine: ...@@ -263,6 +289,109 @@ class Machine:
return self.__str__() return self.__str__()
class AlphaLatency:
    """Latency parameters read from the ``alpha_latency`` section of a
    cluster description.

    The section is a dict with the optional keys ``base``, ``inter``,
    ``intra`` (each a dict with optional ``ring`` / ``tree`` entries) and
    ``switch``. Entries are normalised as follows:

    * ``switch``, ``base.ring`` and ``base.tree`` are converted to ``float``.
    * ``inter.ring`` / ``inter.tree`` are either the string ``"NET"``
      (stored as ``LinkType["NET"]``) or a number stored as ``float``.
    * ``intra.ring`` / ``intra.tree`` are either ``"NVL"`` / ``"PHB"``
      (stored as the matching ``LinkType`` member) or a number stored as
      ``float``.

    Any section or entry that is absent is exposed as ``None``.

    Raises:
        AssertionError: if ``alpha_latency`` is not a dict, or a string
            entry is not one of the accepted link type names.
        TypeError: if a numeric entry cannot be converted to ``float``.
    """

    def __init__(self, alpha_latency):
        assert isinstance(alpha_latency, dict)
        self._base = alpha_latency.get("base", None)
        self._inter = alpha_latency.get("inter", None)
        self._intra = alpha_latency.get("intra", None)
        self._switch = self._to_float(
            alpha_latency.get("switch", None),
            "The switch latency must be float")

        # Every section is optional: treat a missing one as empty so that a
        # description with, e.g., only "base" does not fail on None.get().
        base = {} if self._base is None else self._base
        inter = {} if self._inter is None else self._inter
        intra = {} if self._intra is None else self._intra

        self._base_ring = self._to_float(
            base.get("ring", None), "The base ring latency must be float.")
        self._base_tree = self._to_float(
            base.get("tree", None), "The base tree latency must be float.")
        # Kept as read from the description; it is not validated.
        self._base_inter = base.get("inter", None)

        self._inter_ring = self._to_link_type_or_float(
            inter.get("ring", None), ("NET", ),
            "The inter ring latency must be float.")
        self._inter_tree = self._to_link_type_or_float(
            inter.get("tree", None), ("NET", ),
            "The inter tree latency must be float.")
        self._intra_ring = self._to_link_type_or_float(
            intra.get("ring", None), ("NVL", "PHB"),
            "The intra ring latency must be float.")
        self._intra_tree = self._to_link_type_or_float(
            intra.get("tree", None), ("NVL", "PHB"),
            "The intra tree latency must be float.")

    @staticmethod
    def _to_float(value, error_message):
        """Return ``value`` as a float (``None`` stays ``None``); raise
        ``TypeError(error_message)`` when it cannot be converted."""
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError, OverflowError):
            raise TypeError(error_message)

    @classmethod
    def _to_link_type_or_float(cls, value, allowed_names, error_message):
        """Map a link type name in ``allowed_names`` to its ``LinkType``
        member; convert any non-string value like ``_to_float`` does."""
        if isinstance(value, str):
            assert value in allowed_names
            return LinkType[value]
        return cls._to_float(value, error_message)

    @property
    def base_ring(self):
        """``base.ring`` as a float, or ``None`` when absent."""
        return self._base_ring

    @property
    def base_tree(self):
        """``base.tree`` as a float, or ``None`` when absent."""
        return self._base_tree

    @property
    def switch(self):
        """``switch`` as a float, or ``None`` when absent."""
        return self._switch

    @property
    def inter_ring(self):
        """``inter.ring`` as a ``LinkType`` or float, or ``None``."""
        return self._inter_ring

    @property
    def inter_tree(self):
        """``inter.tree`` as a ``LinkType`` or float, or ``None``."""
        return self._inter_tree

    @property
    def intra_ring(self):
        """``intra.ring`` as a ``LinkType`` or float, or ``None``."""
        return self._intra_ring

    @property
    def intra_tree(self):
        """``intra.tree`` as a ``LinkType`` or float, or ``None``."""
        return self._intra_tree
class Cluster: class Cluster:
""" """
The cluster is an abstract of the hardware resource for training, which contains the cluster topology and The cluster is an abstract of the hardware resource for training, which contains the cluster topology and
...@@ -276,6 +405,18 @@ class Cluster: ...@@ -276,6 +405,18 @@ class Cluster:
self._machines = {} self._machines = {}
# Cluster graph topology # Cluster graph topology
self._topology = None self._topology = None
# Latency for communication cost model
self._alpha_latency = None
self._rank_to_device_id = {}
self._device_id_to_rank = {}
@property
def rank_to_device_id(self):
    """Dict mapping each rank id to the global id of its device."""
    return self._rank_to_device_id
@property
def device_id_to_rank(self):
    """Dict mapping a device's global id to its rank id."""
    return self._device_id_to_rank
@property @property
def machines(self): def machines(self):
...@@ -285,6 +426,35 @@ class Cluster: ...@@ -285,6 +426,35 @@ class Cluster:
assert isinstance(machine, Machine) assert isinstance(machine, Machine)
self._machines[machine.id] = machine self._machines[machine.id] = machine
# map rank to device id and map device id to rank
if machine.id != 0:
prev_machine = self._machines[machine.id - 1]
offset = prev_machine._non_accelerator_cumulative_count
for global_id in machine.devices:
if machine.devices[
global_id].type not in Device.NON_ACCELERATOR_TYPE:
rank_id = global_id - offset
self._rank_to_device_id[rank_id] = global_id
self._device_id_to_rank[global_id] = rank_id
machine._non_accelerator_cumulative_count = len(
machine.devices) - len(
machine.accelerators
) + prev_machine._non_accelerator_cumulative_count
else:
for global_id in machine.devices:
if machine.devices[
global_id].type not in Device.NON_ACCELERATOR_TYPE:
rank_id = global_id
self._rank_to_device_id[rank_id] = global_id
self._device_id_to_rank[global_id] = rank_id
machine.accelerators[global_id] = machine.devices[global_id]
machine._non_accelerator_cumulative_count = len(
machine.devices) - len(machine.accelerators)
@property
def alpha_latency(self):
    """Latency settings for the communication cost model, or ``None``."""
    return self._alpha_latency
def add_device(self, device): def add_device(self, device):
assert isinstance(device, Device) assert isinstance(device, Device)
device.machine.add_device(device) device.machine.add_device(device)
...@@ -344,8 +514,23 @@ class Cluster: ...@@ -344,8 +514,23 @@ class Cluster:
link.type = link_type link.type = link_type
link.bandwidth = float(link_info.get("bandwidth", 0)) link.bandwidth = float(link_info.get("bandwidth", 0))
link.latency = float(link_info.get("latency", 0)) link.latency = float(link_info.get("latency", 0))
link.hop = link_info.get("hop", None)
if link.hop is None:
# Set the default of hop: If in the same machine, hop is 0. And if in the different machine, hop is 1.
source_machine = source.machine
target_machine = target.machine
if source_machine.id == target_machine.id:
link.hop = 0
else:
link.hop = Link.default_hop
self.add_link(link) self.add_link(link)
if "alpha_latency" in cluster_info:
self._alpha_latency = AlphaLatency(
cluster_info.get("alpha_latency"))
else:
self._alpha_latecy = None
def _generate_machine_id(self): def _generate_machine_id(self):
cur_machine_id = self._num_machines cur_machine_id = self._num_machines
self._num_machines += 1 self._num_machines += 1
...@@ -359,6 +544,68 @@ class Cluster: ...@@ -359,6 +544,68 @@ class Cluster:
devices.append(device) devices.append(device)
return devices return devices
def get_beta(self, source_device_id, target_device_id):
    """Return beta, the time to transfer one byte (us/B), between devices.

    The link is looked up on the machine that owns the source device. When
    no direct link is recorded, the default NIC bandwidth is assumed. A
    bandwidth of zero yields a beta of 0.
    """
    # NOTE(review): presumably scales a GB/s bandwidth to bytes per
    # microsecond -- confirm the bandwidth unit against the cluster schema.
    unit = 1000
    owner = self.get_device(source_device_id).machine
    edge = owner.get_link(source_device_id, target_device_id)
    # A missing link means source and target are not directly connected.
    speed = Link.default_nic_bandwith if edge is None else edge.bandwidth
    if speed == 0.:
        return 0
    return 1 / (speed * (unit**3 / 10**6))
def get_hop(self, source_device_id, target_device_id):
    """Return the hop count between a source and a target device.

    The link is looked up on the machine that owns the source device. When
    that machine has no link for the pair, ``Link.default_hop`` is
    returned; otherwise the link's own ``hop`` value is returned as is.
    """
    machine = self.get_device(source_device_id).machine
    link = machine.get_link(source_device_id, target_device_id)
    if link is None:
        return Link.default_hop
    return link.hop
def cross_machine(self, device_ids):
    """Return True when the given devices live on more than one machine.

    Args:
        device_ids: iterable of device global ids.

    Returns:
        bool: ``True`` if at least two distinct machines are involved,
        ``False`` otherwise (including for an empty ``device_ids``).
    """
    machine_ids = {
        self.get_device(device_id).machine.id for device_id in device_ids
    }
    return len(machine_ids) > 1
def convert_rank_to_device_id(self, group_ranks):
    """Translate rank ids into device global ids, preserving order.

    ``group_ranks`` holds the global ids of the ranks in paddle. By
    default a task uses every machine of this cluster that has
    accelerators. A rank without a mapping raises ``KeyError``.
    """
    return [self.rank_to_device_id[rank] for rank in group_ranks]
def get_involved_machine_count(self, device_ids):
    """Count the distinct machines that own the given devices.

    Asserts that at least one machine is involved, so an empty
    ``device_ids`` fails the assertion.
    """
    involved = {self.get_device(dev_id).machine.id for dev_id in device_ids}
    count = len(involved)
    assert count > 0
    return count
def __str__(self): def __str__(self):
str = "" str = ""
for machine in self.machines.values(): for machine in self.machines.values():
......
...@@ -18,4 +18,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU) ...@@ -18,4 +18,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS}) py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS})
py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS}) py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS})
py_test_modules(test_new_cost_model MODULES test_new_cost_model ENVS ${dist_ENVS}) py_test_modules(test_new_cost_model MODULES test_new_cost_model ENVS ${dist_ENVS})
py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS})
endif() endif()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import os
import json
import paddle
from paddle.distributed.auto_parallel.cluster import Cluster
cluster_json = """
{
"alpha_latency": {"inter": {"ring": "NET", "tree": "NET"},
"intra": {"ring": "NVL", "tree": "PHB"},
"base": {"ring": 8.4, "tree": 0},
"switch": 10.0},
"machines": [
{
"hostname": "yq01-sys-hic-v100-box-a225-0266",
"addr": "10.127.9.147",
"port": "60009",
"devices": [
{
"global_id": 0,
"local_id": 0,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 1,
"local_id": 1,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 2,
"local_id": 2,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 3,
"local_id": 3,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 4,
"local_id": 4,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 5,
"local_id": 5,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 6,
"local_id": 6,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 7,
"local_id": 7,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 8,
"local_id": 0,
"type": "CPU",
"arch": "x86_64",
"vendor": "GenuineIntel",
"model": "Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GH",
"memory": "502",
"sp_gflops": "150",
"dp_gflops": "75"
},
{
"global_id": 9,
"local_id": 0,
"type": "NIC",
"width": 12.5,
"ip": "10.127.9.147"
}
],
"links": [
{
"source_global_id": 0,
"target_global_id": 1,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 2,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 3,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 4,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 5,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 6,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 7,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 0,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 1,
"target_global_id": 0,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 2,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 3,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 4,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 5,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 6,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 7,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 1,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 2,
"target_global_id": 0,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 1,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 3,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 4,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 5,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 6,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 7,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 2,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 3,
"target_global_id": 0,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 1,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 2,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 4,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 5,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 6,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 7,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 3,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 4,
"target_global_id": 0,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 1,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 2,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 3,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 5,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 6,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 7,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 4,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 5,
"target_global_id": 0,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 1,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 2,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 3,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 4,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 6,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 7,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 5,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 6,
"target_global_id": 0,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 1,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 2,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 3,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 4,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 5,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 7,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 6,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 7,
"target_global_id": 0,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 1,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 2,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 3,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 4,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 5,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 6,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 7,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 0,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 1,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 2,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 3,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 4,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 5,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 6,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 7,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 0,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 1,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 2,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 3,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 4,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 5,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 6,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 7,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
}
]
}
]
}
"""
multi_cluster_json = """{
"machines": [
{
"hostname": "yq01-sys-hic-v100-box-a225-0266",
"addr": "10.127.9.147",
"port": "60009",
"devices": [
{
"global_id": 0,
"local_id": 0,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 1,
"local_id": 1,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 2,
"local_id": 2,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 3,
"local_id": 3,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 4,
"local_id": 4,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 5,
"local_id": 5,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 6,
"local_id": 6,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 7,
"local_id": 7,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 8,
"local_id": 0,
"type": "CPU",
"arch": "x86_64",
"vendor": "GenuineIntel",
"model": "Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GH",
"memory": "502",
"sp_gflops": "150",
"dp_gflops": "75"
},
{
"global_id": 9,
"local_id": 0,
"type": "NIC",
"width": 12.5,
"ip": "10.127.9.147"
}
],
"links": [
{
"source_global_id": 0,
"target_global_id": 1,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 2,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 3,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 4,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 5,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 6,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 7,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 0,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 0,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 1,
"target_global_id": 0,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 2,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 3,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 4,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 5,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 6,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 7,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 1,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 1,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 2,
"target_global_id": 0,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 1,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 3,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 4,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 5,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 6,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 7,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 2,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 2,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 3,
"target_global_id": 0,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 1,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 2,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 4,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 5,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 6,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 7,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 3,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 3,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 4,
"target_global_id": 0,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 1,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 2,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 3,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 5,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 6,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 7,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 4,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 4,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 5,
"target_global_id": 0,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 1,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 2,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 3,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 4,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 6,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 7,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 5,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 5,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 6,
"target_global_id": 0,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 1,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 2,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 3,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 4,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 5,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 7,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 6,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 6,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 7,
"target_global_id": 0,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 1,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 2,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 3,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 4,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 5,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 6,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 7,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 7,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 0,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 1,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 2,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 3,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 4,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 5,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 6,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 7,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 8,
"target_global_id": 9,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 0,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 1,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 2,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 3,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 4,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 5,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 6,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 7,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 8,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 9,
"target_global_id": 19,
"type": "NET",
"bandwidth": 24.0
}
]
},
{
"hostname": "yq01-sys-hic-k8s-v100-box-a225-0751",
"addr": "10.127.43.24",
"port": "60009",
"devices": [
{
"global_id": 10,
"local_id": 0,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 11,
"local_id": 1,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 12,
"local_id": 2,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 13,
"local_id": 3,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 14,
"local_id": 4,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 15,
"local_id": 5,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 16,
"local_id": 6,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 17,
"local_id": 7,
"type": "GPU",
"model": " Tesla V100-SXM2-32GB",
"memory": "32",
"sp_gflops": "15700",
"dp_gflops": "7800"
},
{
"global_id": 18,
"local_id": 0,
"type": "CPU",
"arch": "x86_64",
"vendor": "GenuineIntel",
"model": "Intel(R) Xeon(R) Gold 6271C CPU @ 2.60G",
"memory": "503",
"sp_gflops": "150",
"dp_gflops": "75"
},
{
"global_id": 19,
"local_id": 0,
"type": "NIC",
"width": 12.5,
"ip": "10.127.43.24"
}
],
"links": [
{
"source_global_id": 10,
"target_global_id": 11,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 10,
"target_global_id": 12,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 10,
"target_global_id": 13,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 10,
"target_global_id": 14,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 10,
"target_global_id": 15,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 10,
"target_global_id": 16,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 10,
"target_global_id": 17,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 10,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 10,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 11,
"target_global_id": 10,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 11,
"target_global_id": 12,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 11,
"target_global_id": 13,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 11,
"target_global_id": 14,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 11,
"target_global_id": 15,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 11,
"target_global_id": 16,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 11,
"target_global_id": 17,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 11,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 11,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 12,
"target_global_id": 10,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 12,
"target_global_id": 11,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 12,
"target_global_id": 13,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 12,
"target_global_id": 14,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 12,
"target_global_id": 15,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 12,
"target_global_id": 16,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 12,
"target_global_id": 17,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 12,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 12,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 13,
"target_global_id": 10,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 13,
"target_global_id": 11,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 13,
"target_global_id": 12,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 13,
"target_global_id": 14,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 13,
"target_global_id": 15,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 13,
"target_global_id": 16,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 13,
"target_global_id": 17,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 13,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 13,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 14,
"target_global_id": 10,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 14,
"target_global_id": 11,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 14,
"target_global_id": 12,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 14,
"target_global_id": 13,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 14,
"target_global_id": 15,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 14,
"target_global_id": 16,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 14,
"target_global_id": 17,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 14,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 14,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 15,
"target_global_id": 10,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 15,
"target_global_id": 11,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 15,
"target_global_id": 12,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 15,
"target_global_id": 13,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 15,
"target_global_id": 14,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 15,
"target_global_id": 16,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 15,
"target_global_id": 17,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 15,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 15,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 16,
"target_global_id": 10,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 16,
"target_global_id": 11,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 16,
"target_global_id": 12,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 16,
"target_global_id": 13,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 16,
"target_global_id": 14,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 16,
"target_global_id": 15,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 16,
"target_global_id": 17,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 16,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 16,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 17,
"target_global_id": 10,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 17,
"target_global_id": 11,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 17,
"target_global_id": 12,
"type": "NVB",
"bandwidth": 235.0
},
{
"source_global_id": 17,
"target_global_id": 13,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 17,
"target_global_id": 14,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 17,
"target_global_id": 15,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 17,
"target_global_id": 16,
"type": "NVL",
"bandwidth": 235.0
},
{
"source_global_id": 17,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 17,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 10,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 11,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 12,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 13,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 14,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 15,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 16,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 17,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 18,
"target_global_id": 19,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 10,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 11,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 12,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 13,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 14,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 15,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 16,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 17,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 18,
"type": "PHB",
"bandwidth": 24.0
},
{
"source_global_id": 19,
"target_global_id": 9,
"type": "NET",
"bandwidth": 24.0
}
]
}
]
}
"""
class TestCluster(unittest.TestCase):
    """Check ``Cluster`` queries on clusters built from JSON descriptions.

    Each test serializes one of the module-level JSON strings to a file,
    loads it through ``Cluster.build_from_file`` and then checks the values
    returned by ``get_beta``, ``get_hop``, ``cross_machine``,
    ``convert_rank_to_device_id`` and ``get_involved_machine_count``.
    """

    def setUp(self):
        """Create a private scratch directory for the generated JSON file."""
        # Imported locally so this class does not rely on a module-level
        # ``import tempfile`` being present.
        import tempfile

        self._temp_dir = tempfile.TemporaryDirectory()
        # addCleanup runs even when the test body raises, so the generated
        # file never outlives a failing test.
        self.addCleanup(self._temp_dir.cleanup)

    def _build_cluster(self, cluster_json_str):
        """Write ``cluster_json_str`` to a file and build a ``Cluster`` from it.

        Args:
            cluster_json_str: JSON text describing the cluster.

        Returns:
            The ``Cluster`` instance after ``build_from_file`` has been called
            on the written file.
        """
        cluster_json_path = os.path.join(self._temp_dir.name,
                                         "auto_parallel_cluster.json")
        # Round-trip through json.loads so malformed JSON fails here, before
        # the cluster is built.
        cluster_json_object = json.loads(cluster_json_str)
        with open(cluster_json_path, "w") as cluster_json_file:
            json.dump(cluster_json_object, cluster_json_file)

        cluster = Cluster()
        cluster.build_from_file(cluster_json_path)
        return cluster

    def test_single_machine(self):
        """Queries between devices 0 and 1 of the single-machine cluster."""
        cluster = self._build_cluster(cluster_json)

        beta = cluster.get_beta(0, 1)
        hop = cluster.get_hop(0, 1)
        cross_machine = cluster.cross_machine([0, 1])
        devices = cluster.convert_rank_to_device_id([0, 1, 2, 3])
        involved_machine_count = cluster.get_involved_machine_count(devices)

        self.assertGreater(beta, 0)
        self.assertEqual(hop, 0)
        self.assertFalse(cross_machine)
        self.assertEqual(devices, [0, 1, 2, 3])
        self.assertEqual(involved_machine_count, 1)

    def test_multi_machine(self):
        """Queries between devices 0 and 11 of the two-machine cluster."""
        cluster = self._build_cluster(multi_cluster_json)

        beta = cluster.get_beta(0, 11)
        hop = cluster.get_hop(0, 11)
        cross_machine = cluster.cross_machine([0, 11])
        # Rank 8 is expected to map to device id 10 rather than 8 --
        # presumably because ids 8 and 9 are the first machine's
        # non-accelerator devices; confirm against multi_cluster_json.
        devices = cluster.convert_rank_to_device_id([5, 6, 7, 8])
        involved_machine_count = cluster.get_involved_machine_count(devices)

        self.assertGreater(beta, 0)
        self.assertGreaterEqual(hop, 0)
        self.assertTrue(cross_machine)
        self.assertEqual(devices, [5, 6, 7, 10])
        self.assertEqual(involved_machine_count, 2)
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册