diff --git a/python/paddle/distributed/auto_parallel/cluster.py b/python/paddle/distributed/auto_parallel/cluster.py index ef05ff7c7460ef01108d8e48e80a30f479aeebb6..3685729cb6c297800d2f73fee1d97106e2e70e92 100644 --- a/python/paddle/distributed/auto_parallel/cluster.py +++ b/python/paddle/distributed/auto_parallel/cluster.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -43,6 +43,8 @@ class LinkType(IntEnum): class Device: + NON_ACCELERATOR_TYPE = [DeviceType.CPU, DeviceType.NIC, DeviceType.UNKNOWN] + def __init__(self, global_id, local_id, machine): self._global_id = global_id self._local_id = local_id @@ -134,6 +136,10 @@ class Device: class Link: + + default_hop = 1 + default_nic_bandwith = 24 + def __init__(self, source, target): self._src = source self._tgt = target @@ -142,6 +148,7 @@ class Link: self._bandwidth = None # latency is stored by millisecond self._latency = None + self._hop = None @property def source(self): @@ -183,6 +190,14 @@ class Link: def latency(self, value): self._latency = value + @property + def hop(self): + return self._hop + + @hop.setter + def hop(self, value): + self._hop = value + def __str__(self): str = "" str += "source_global_id: {}, target_global_id: {}, type: {}, bandwidth: {}, latency: {}".format( @@ -202,6 +217,8 @@ class Machine: self._port = None self._devices = {} self._links = {} + self._accelerators = {} + self._non_accelerator_cumulative_count = 0 @property def id(self): @@ -243,14 +260,23 @@ class Machine: def links(self): return self._links + @property + def accelerators(self): + return self._accelerators + def add_device(self, device): # Use the device global_id as the key self._devices[device.global_id] = device + if device.type not in Device.NON_ACCELERATOR_TYPE: + self._accelerators[device.global_id] = device def add_link(self, link): # Use the source device global_id and target device global_id as the key self._links[(link.source.global_id, link.target.global_id)] = link + def get_link(self, source_global_id, target_global_id): + return self._links.get((source_global_id, target_global_id), None) + def __str__(self): str = "" for device in self.devices.values(): @@ -263,6 +289,109 @@ class Machine: return self.__str__() +class AlphaLatency: + def __init__(self, alpha_latency): + assert isinstance(alpha_latency, dict) + self._base = alpha_latency.get("base", None) + self._inter = alpha_latency.get("inter", None) + self._intra = alpha_latency.get("intra", None) + self._switch = alpha_latency.get("switch", None) + if self._switch is not None: + try: + self._switch = float(self._switch) + except: + raise TypeError("The switch latency must be float") + self._base_ring = self._base.get( + "ring", None) if self._base is not None else None + self._base_tree = self._base.get( + "tree", None) if self._base is not None else None + self._base_inter = self._base.get( + "inter", None) if self._base is not None else None + if self._base_ring is not None: + try: + self._base_ring = float(self._base_ring) + except: + raise TypeError("The base ring latency must be float.") + if self._base_tree is not None: + try: + self._base_tree = float(self._base_tree) + except: + raise TypeError("The base ring latency must be float.") + + self._inter_ring = self._inter.get("ring", None) + self._inter_tree = self._inter.get("tree", None) + self._intra_ring = self._intra.get("ring", None) + self._intra_tree = self._intra.get("tree", None) + + if self._inter_ring is not None: + if isinstance(self._inter_ring, str): + assert self._inter_ring in ["NET"] + self._inter_ring = LinkType[self._inter_ring] + else: + try: + self._inter_ring = float(self._inter_ring) + except: + raise TypeError("The inter ring latency must be float.") + + if self._inter_tree is not None: + if isinstance(self._inter_tree, str): + assert self._inter_tree in ["NET"] + self._inter_tree = LinkType[self._inter_tree] + else: + try: + self._inter_tree = float(self._inter_tree) + except: + raise TypeError("The inter tree latency must be float.") + + if self._intra_ring is not None: + if isinstance(self._intra_ring, str): + assert self._intra_ring in ["NVL", "PHB"] + self._intra_ring = LinkType[self._intra_ring] + else: + try: + self._intra_ring = float(self._intra_ring) + except: + raise TypeError("The intra ring latency must be float.") + + if self._intra_tree is not None: + if isinstance(self._intra_tree, str): + assert self._intra_tree in ["NVL", "PHB"] + self._intra_tree = LinkType[self._intra_tree] + else: + try: + self._intra_tree = float(self._intra_tree) + except: + raise TypeError("The intra tree latency must be float.") + + @property + def base_ring(self): + return self._base_ring + + @property + def base_tree(self): + return self._base_tree + + @property + def switch(self): + return self._switch + + @property + def inter_ring(self): + return self._inter_ring + + @property + def inter_tree(self): + return self._inter_tree + + @property + def intra_ring(self): + return self._intra_ring + + @property + def intra_tree(self): + return self._intra_tree + + class Cluster: """ The cluster is an abstract of the hardware resource for training, which contains the cluster topology and @@ -276,6 +405,18 @@ class Cluster: self._machines = {} # Cluster graph topology self._topology = None + # Latency for communication cost model + self._alpha_latency = None + self._rank_to_device_id = {} + self._device_id_to_rank = {} + + @property + def rank_to_device_id(self): + return self._rank_to_device_id + + @property + def device_id_to_rank(self): + return self._device_id_to_rank @property def machines(self): @@ -285,6 +426,35 @@ class Cluster: assert isinstance(machine, Machine) self._machines[machine.id] = machine + # map rank to device id and map device id to rank + if machine.id != 0: + prev_machine = self._machines[machine.id - 1] + offset = prev_machine._non_accelerator_cumulative_count + for global_id in machine.devices: + if machine.devices[ + global_id].type not in Device.NON_ACCELERATOR_TYPE: + rank_id = global_id - offset + self._rank_to_device_id[rank_id] = global_id + self._device_id_to_rank[global_id] = rank_id + machine._non_accelerator_cumulative_count = len( + machine.devices) - len( + machine.accelerators + ) + prev_machine._non_accelerator_cumulative_count + else: + for global_id in machine.devices: + if machine.devices[ + global_id].type not in Device.NON_ACCELERATOR_TYPE: + rank_id = global_id + self._rank_to_device_id[rank_id] = global_id + self._device_id_to_rank[global_id] = rank_id + machine.accelerators[global_id] = machine.devices[global_id] + machine._non_accelerator_cumulative_count = len( + machine.devices) - len(machine.accelerators) + + @property + def alpha_latency(self): + return self._alpha_latency + def add_device(self, device): assert isinstance(device, Device) device.machine.add_device(device) @@ -344,8 +514,23 @@ class Cluster: link.type = link_type link.bandwidth = float(link_info.get("bandwidth", 0)) link.latency = float(link_info.get("latency", 0)) + link.hop = link_info.get("hop", None) + if link.hop is None: + # Set the default of hop: If in the same machine, hop is 0. And if in the different machine, hop is 1. + source_machine = source.machine + target_machine = target.machine + if source_machine.id == target_machine.id: + link.hop = 0 + else: + link.hop = Link.default_hop self.add_link(link) + if "alpha_latency" in cluster_info: + self._alpha_latency = AlphaLatency( + cluster_info.get("alpha_latency")) + else: + self._alpha_latecy = None + def _generate_machine_id(self): cur_machine_id = self._num_machines self._num_machines += 1 @@ -359,6 +544,68 @@ class Cluster: devices.append(device) return devices + def get_beta(self, source_device_id, target_device_id): + # beta means the time transferring a byte, us/B + beta = None + convert_base = 1000 + device = self.get_device(source_device_id) + machine = device.machine + link = machine.get_link(source_device_id, target_device_id) + bandwidth = None + # None means the source and target are not connected directly, set NIC in default + if link is None: + bandwidth = Link.default_nic_bandwith + else: + bandwidth = link.bandwidth + + if bandwidth == 0.: + beta = 0 + else: + beta = 1 / (bandwidth * (convert_base**3 / 10**6)) + + return beta + + def get_hop(self, source_device_id, target_device_id): + beta = None + hop = None + device = self.get_device(source_device_id) + machine = device.machine + link = machine.get_link(source_device_id, target_device_id) + if link is not None: + hop = link.hop + else: + hop = Link.default_hop + return hop + + def cross_machine(self, device_ids): + machine_ids = set() + for device_id in device_ids: + device = self.get_device(device_id) + machine_id = device.machine.id + machine_ids.add(machine_id) + if len(machine_ids) == 1: + return False + else: + return True + + def convert_rank_to_device_id(self, group_ranks): + # group_ranks is global id of the rank in paddle + # task will use all of machine in this cluster with accelerators in default + device_ids = [] + for rank in group_ranks: + device_ids.append(self.rank_to_device_id[rank]) + return device_ids + + def get_involved_machine_count(self, device_ids): + machine_ids = set() + for device_id in device_ids: + device = self.get_device(device_id) + machine_id = device.machine.id + machine_ids.add(machine_id) + count = len(machine_ids) + assert count > 0 + return count + def __str__(self): str = "" for machine in self.machines.values(): diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index c16936db5a3340816709a7789d1b2fc7cd26b2db..87031fe09e5a8ca613ad9c919a5e5b815559e475 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -18,4 +18,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS}) py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS}) py_test_modules(test_new_cost_model MODULES test_new_cost_model ENVS ${dist_ENVS}) + py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py new file mode 100644 index 0000000000000000000000000000000000000000..dc22263b520403eec348d8dfed0eb9bde4f70e28 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py @@ -0,0 +1,2022 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import os +import json + +import paddle +from paddle.distributed.auto_parallel.cluster import Cluster + +cluster_json = """ +{ + "alpha_latency": {"inter": {"ring": "NET", "tree": "NET"}, + "intra": {"ring": "NVL", "tree": "PHB"}, + "base": {"ring": 8.4, "tree": 0}, + "switch": 10.0}, + "machines": [ + { + "hostname": "yq01-sys-hic-v100-box-a225-0266", + "addr": "10.127.9.147", + "port": "60009", + "devices": [ + { + "global_id": 0, + "local_id": 0, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 1, + "local_id": 1, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 2, + "local_id": 2, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 3, + "local_id": 3, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 4, + "local_id": 4, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 5, + "local_id": 5, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 6, + "local_id": 6, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 7, + "local_id": 7, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 8, + "local_id": 0, + "type": "CPU", + "arch": "x86_64", + "vendor": "GenuineIntel", + "model": "Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GH", + "memory": "502", + "sp_gflops": "150", + "dp_gflops": "75" + }, + { + "global_id": 9, + "local_id": 0, + "type": "NIC", + "width": 12.5, + "ip": "10.127.9.147" + } + ], + "links": [ + { + "source_global_id": 0, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 4, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 5, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 6, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 7, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 0, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 1, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 4, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 6, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 7, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 1, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 2, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 4, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 5, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 7, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 2, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 3, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 4, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 5, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 6, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 3, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 4, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 1, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 2, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 3, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 4, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 5, + "target_global_id": 0, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 2, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 3, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 4, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 5, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 6, + "target_global_id": 0, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 1, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 3, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 4, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 6, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 7, + "target_global_id": 0, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 1, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 2, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 4, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 7, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 0, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 1, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 2, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 3, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 4, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 5, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 6, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 7, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 0, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 1, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 2, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 3, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 4, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 5, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 6, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 7, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + } + ] + } + ] +} +""" + +multi_cluster_json = """{ + "machines": [ + { + "hostname": "yq01-sys-hic-v100-box-a225-0266", + "addr": "10.127.9.147", + "port": "60009", + "devices": [ + { + "global_id": 0, + "local_id": 0, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 1, + "local_id": 1, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 2, + "local_id": 2, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 3, + "local_id": 3, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 4, + "local_id": 4, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 5, + "local_id": 5, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 6, + "local_id": 6, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 7, + "local_id": 7, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 8, + "local_id": 0, + "type": "CPU", + "arch": "x86_64", + "vendor": "GenuineIntel", + "model": "Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GH", + "memory": "502", + "sp_gflops": "150", + "dp_gflops": "75" + }, + { + "global_id": 9, + "local_id": 0, + "type": "NIC", + "width": 12.5, + "ip": "10.127.9.147" + } + ], + "links": [ + { + "source_global_id": 0, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 4, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 5, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 6, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 7, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 0, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 1, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 4, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 6, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 7, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 1, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 2, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 4, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 5, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 7, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 2, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 3, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 4, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 5, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 6, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 3, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 4, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 1, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 2, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 3, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 4, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 5, + "target_global_id": 0, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 2, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 3, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 4, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 5, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 6, + "target_global_id": 0, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 1, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 3, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 4, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 6, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 7, + "target_global_id": 0, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 1, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 2, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 4, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 7, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 0, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 1, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 2, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 3, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 4, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 5, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 6, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 7, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 0, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 1, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 2, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 3, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 4, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 5, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 6, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 7, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 19, + "type": "NET", + "bandwidth": 24.0 + } + ] + }, + { + "hostname": "yq01-sys-hic-k8s-v100-box-a225-0751", + "addr": "10.127.43.24", + "port": "60009", + "devices": [ + { + "global_id": 10, + "local_id": 0, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 11, + "local_id": 1, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 12, + "local_id": 2, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 13, + "local_id": 3, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 14, + "local_id": 4, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 15, + "local_id": 5, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 16, + "local_id": 6, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 17, + "local_id": 7, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 18, + "local_id": 0, + "type": "CPU", + "arch": "x86_64", + "vendor": "GenuineIntel", + "model": "Intel(R) Xeon(R) Gold 6271C CPU @ 2.60G", + "memory": "503", + "sp_gflops": "150", + "dp_gflops": "75" + }, + { + "global_id": 19, + "local_id": 0, + "type": "NIC", + "width": 12.5, + "ip": "10.127.43.24" + } + ], + "links": [ + { + "source_global_id": 10, + "target_global_id": 11, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 10, + "target_global_id": 12, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 10, + "target_global_id": 13, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 10, + "target_global_id": 14, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 10, + "target_global_id": 15, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 10, + "target_global_id": 16, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 10, + "target_global_id": 17, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 10, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 10, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 11, + "target_global_id": 10, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 11, + "target_global_id": 12, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 11, + "target_global_id": 13, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 11, + "target_global_id": 14, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 11, + "target_global_id": 15, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 11, + "target_global_id": 16, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 11, + "target_global_id": 17, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 11, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 11, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 12, + "target_global_id": 10, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 12, + "target_global_id": 11, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 12, + "target_global_id": 13, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 12, + "target_global_id": 14, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 12, + "target_global_id": 15, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 12, + "target_global_id": 16, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 12, + "target_global_id": 17, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 12, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 12, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 13, + "target_global_id": 10, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 13, + "target_global_id": 11, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 13, + "target_global_id": 12, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 13, + "target_global_id": 14, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 13, + "target_global_id": 15, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 13, + "target_global_id": 16, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 13, + "target_global_id": 17, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 13, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 13, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 14, + "target_global_id": 10, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 14, + "target_global_id": 11, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 14, + "target_global_id": 12, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 14, + "target_global_id": 13, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 14, + "target_global_id": 15, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 14, + "target_global_id": 16, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 14, + "target_global_id": 17, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 14, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 14, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 15, + "target_global_id": 10, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 15, + "target_global_id": 11, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 15, + "target_global_id": 12, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 15, + "target_global_id": 13, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 15, + "target_global_id": 14, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 15, + "target_global_id": 16, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 15, + "target_global_id": 17, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 15, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 15, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 16, + "target_global_id": 10, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 16, + "target_global_id": 11, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 16, + "target_global_id": 12, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 16, + "target_global_id": 13, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 16, + "target_global_id": 14, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 16, + "target_global_id": 15, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 16, + "target_global_id": 17, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 16, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 16, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 17, + "target_global_id": 10, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 17, + "target_global_id": 11, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 17, + "target_global_id": 12, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 17, + "target_global_id": 13, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 17, + "target_global_id": 14, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 17, + "target_global_id": 15, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 17, + "target_global_id": 16, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 17, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 17, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 10, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 11, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 12, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 13, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 14, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 15, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 16, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 17, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 10, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 11, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 12, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 13, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 14, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 15, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 16, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 17, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 9, + "type": "NET", + "bandwidth": 24.0 + } + ] + } + ] +} +""" + + +class TestCluster(unittest.TestCase): + def test_single_machine(self): + # Build cluster + file_dir = os.path.dirname(os.path.abspath(__file__)) + cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_object = json.loads(cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + + beta = cluster.get_beta(0, 1) + hop = cluster.get_hop(0, 1) + cross_machine = cluster.cross_machine([0, 1]) + devices = cluster.convert_rank_to_device_id([0, 1, 2, 3]) + involved_machine_count = cluster.get_involved_machine_count(devices) + self.assertTrue(beta > 0) + self.assertTrue(hop == 0) + self.assertTrue(not cross_machine) + self.assertTrue(devices == [0, 1, 2, 3]) + self.assertTrue(involved_machine_count == 1) + + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + + def test_multi_machine(self): + # Build cluster + file_dir = os.path.dirname(os.path.abspath(__file__)) + cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_object = json.loads(multi_cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + + beta = cluster.get_beta(0, 11) + hop = cluster.get_hop(0, 11) + cross_machine = cluster.cross_machine([0, 11]) + devices = cluster.convert_rank_to_device_id([5, 6, 7, 8]) + involved_machine_count = cluster.get_involved_machine_count(devices) + self.assertTrue(beta > 0) + self.assertTrue(hop >= 0) + self.assertTrue(cross_machine) + self.assertTrue(devices == [5, 6, 7, 10]) + self.assertTrue(involved_machine_count == 2) + + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + + +if __name__ == "__main__": + unittest.main()