diff --git a/mindinsight/backend/application.py b/mindinsight/backend/application.py index cdeeca7197d45566f19eeaac12237895cb938276..b9f63b0ce5c8a577675995efb4c17f325c4c356e 100644 --- a/mindinsight/backend/application.py +++ b/mindinsight/backend/application.py @@ -111,6 +111,7 @@ def create_app(): static_folder_path = os.path.realpath(os.path.join(os.path.dirname(__file__), os.pardir, 'ui', 'dist', 'static')) app = Flask(__name__, static_url_path=static_url_path, static_folder=static_folder_path) + app.config['JSON_SORT_KEYS'] = False if settings.ENABLE_CORS: CORS(app, supports_credentials=True) diff --git a/mindinsight/backend/datavisual/__init__.py b/mindinsight/backend/datavisual/__init__.py index 278f18059725a994178ee02304c270a98ed1b1ca..63039b4d296b6e4ba3b162848340350cd8cdbbd7 100644 --- a/mindinsight/backend/datavisual/__init__.py +++ b/mindinsight/backend/datavisual/__init__.py @@ -17,6 +17,7 @@ from mindinsight.backend.datavisual.static_resource_api import init_module as static_init_module from mindinsight.backend.datavisual.task_manager_api import init_module as task_init_module from mindinsight.backend.datavisual.train_visual_api import init_module as train_init_module +from mindinsight.backend.datavisual.sysmetric_api import init_module as sysmetric_init_module def init_module(app): @@ -30,3 +31,4 @@ def init_module(app): static_init_module(app) task_init_module(app) train_init_module(app) + sysmetric_init_module(app) diff --git a/mindinsight/backend/datavisual/sysmetric_api.py b/mindinsight/backend/datavisual/sysmetric_api.py new file mode 100644 index 0000000000000000000000000000000000000000..be66f3760b69820bffbdd178215ca7182a6ec5ce --- /dev/null +++ b/mindinsight/backend/datavisual/sysmetric_api.py @@ -0,0 +1,39 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""System metrics API."""

from flask import Blueprint, jsonify
from mindinsight.conf import settings
from mindinsight.sysmetric.collector import get_metrics

# All routes of this blueprint live under the configured URL prefix + API prefix.
BLUEPRINT = Blueprint("sysmetric", __name__, url_prefix=settings.URL_PATH_PREFIX + settings.API_PREFIX)


@BLUEPRINT.route("/sysmetric/current", methods=["GET"])
def query_sysmetric():
    """Serve the current metrics snapshot from `get_metrics()` as a JSON response."""
    snapshot = get_metrics()
    response = jsonify(snapshot)
    return response


def init_module(app):
    """
    Attach the system metrics blueprint to the application.

    Args:
        app: the application obj on which `register_blueprint` is called.
    """
    app.register_blueprint(BLUEPRINT)


# ---- new file: mindinsight/sysmetric/collector/__init__.py ----
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The metrics collector."""

from ._collect_cpu import collect_cpu
from ._collect_mem import collect_mem
from ._collect_npu import collect_npu

# `get_metrics` is imported by the sysmetric REST API, so it is part of the public surface.
__all__ = [
    'collect_cpu',
    'collect_mem',
    'collect_npu',
    'get_metrics',
]


def get_metrics():
    """
    Collect one snapshot of the NPU, CPU and virtual memory metrics.

    Returns:
        dict, the metrics:
            - npu: the value returned by `collect_npu()`.
            - cpu (dict): `overall` is one dict of CPU time percentages, `percpu` is a
              list of such dicts, one per logical CPU.
            - memory (dict): `virtual` holds the `available` and `used` values reported
              by `collect_mem()`.
    """
    mem = collect_mem()
    return {
        'npu': collect_npu(),
        'cpu': {
            'overall': collect_cpu(percent=True),
            'percpu': collect_cpu(percpu=True, percent=True)
        },
        'memory': {
            'virtual': {
                'available': mem.get('available'),
                'used': mem.get('used')
            }
        }
    }


# ---- new file: mindinsight/sysmetric/collector/_collect_cpu.py ----
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The cpu collector."""

import psutil


def collect_cpu(percpu=False, percent=False):
    """
    Collect the cpu info.

    Args:
        percpu (bool): To return a list of cpu info for each logical CPU on the system.
        percent (bool): Represent the CPU times in percentage (`psutil.cpu_times_percent`)
            instead of the values of `psutil.cpu_times`.

    Returns:
        Union[dict, List[dict]], the CPUs info: one dict when `percpu` is False,
        otherwise a list with one dict per logical CPU.
    """
    if percent:
        times = psutil.cpu_times_percent(percpu=percpu)
    else:
        times = psutil.cpu_times(percpu=percpu)
    if not percpu:
        return dict(times._asdict())
    # `core_times` rather than `time`, so the loop variable does not read like the stdlib module.
    return [dict(core_times._asdict()) for core_times in times]


# ---- new file: mindinsight/sysmetric/collector/_collect_mem.py ----
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The memory collector."""

import psutil
# NOTE(review): `psutil._common` is a private psutil module; `bytes2human` may move between releases.
from psutil._common import bytes2human


def collect_mem(readable=False):
    """
    Collect the virtual memory info.

    Args:
        readable (bool): Read the sizes like 1K, 234M, 2G etc. The `percent` entry is
            never converted.

    Returns:
        dict, the virtual memory info.
    """
    mem = psutil.virtual_memory()._asdict()
    if not readable:
        return dict(mem)
    return {k: v if k == 'percent' else bytes2human(v) for k, v in mem.items()}


# ---- new file: mindinsight/sysmetric/collector/_collect_npu.py ----
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The npu collector."""

import inspect
from ctypes import CDLL, Structure, byref, c_char, c_int, c_ubyte, c_uint, c_ulong, c_ushort
from functools import lru_cache

from mindinsight.sysmetric.common.log import logger

# The driver library is optional: when it cannot be loaded, `libsmi` stays None and
# `collect_npu` reports no NPU metrics instead of failing at import time.
try:
    libsmi = CDLL('libdrvdsmi_host.so')
except OSError:
    logger.info('Failed to load libdrvdsmi_host.so.')
    libsmi = None


def libsmicall(*args, **kwargs):
    """
    Call the libdrvdsmi_host function named after the calling Python function.

    Each `dsmi_*` wrapper in this module is named exactly like the library symbol it
    wraps, so the symbol is looked up from the caller's function name.

    Args:
        *args: Positional arguments forwarded to the library function.
        **kwargs: Keyword arguments forwarded to the library function.

    Returns:
        The value returned by the library function.
        NOTE(review): the wrappers in this module ignore this value - presumably a
        status code; confirm against the driver header.

    Raises:
        ValueError: If libdrvdsmi_host.so is not loaded.
    """
    if not libsmi:
        logger.error('Trying to call the libdrvdsmi_host which is not loaded.')
        raise ValueError('Trying to call the libdrvdsmi_host which is not loaded.')
    # Only the direct caller's name is needed. `inspect.stack()` builds a FrameInfo
    # (with source context) for every frame on the stack, which is wasteful on each call.
    frame = inspect.currentframe()
    try:
        if frame is not None and frame.f_back is not None:
            fname = frame.f_back.f_code.co_name
        else:
            # `currentframe()` may return None on interpreters without frame support.
            fname = inspect.stack()[1].function
    finally:
        # Drop the frame reference to avoid a reference cycle (see the `inspect` docs).
        del frame
    return getattr(libsmi, fname)(*args, **kwargs)


@lru_cache(maxsize=4)
def dsmi_get_device_count():
    """
    Get device count.

    The result is cached, so the library is only queried on the first call.

    Returns:
        int, the device count.
    """
    device_count = c_int()

    libsmicall(byref(device_count))

    return device_count.value


@lru_cache(maxsize=4)
def dsmi_list_device(count):
    """
    List the device IDs.

    The result is cached per `count`; callers share the returned list and must not
    mutate it.

    Args:
        count (int): The device count.

    Returns:
        List[int], the device IDs.
    """
    device_id_array = c_int * count
    device_id_list = device_id_array()
    count = c_int(count)

    libsmicall(device_id_list, count)

    return list(device_id_list)


@lru_cache(maxsize=8)
def dsmi_get_chip_info(device_id):
    """
    Get chip info.

    Args:
        device_id (int): The specific device id.

    Returns:
        dict, the chip info:
            - chip_type (str): The chip type.
            - chip_name (str): The chip name.
            - chip_ver (str): The chip version.
    """

    class ChipInfoStruct(Structure):
        _fields_ = [('chip_type', c_char * 32), ('chip_name', c_char * 32), ('chip_ver', c_char * 32)]

    device_id = c_int(device_id)
    chip_info = ChipInfoStruct()
    libsmicall(device_id, byref(chip_info))
    return {
        'chip_type': chip_info.chip_type.decode('utf-8'),
        'chip_name': chip_info.chip_name.decode('utf-8'),
        'chip_ver': chip_info.chip_ver.decode('utf-8')
    }


def dsmi_get_device_health(device_id):
    """
    Get device health.

    Args:
        device_id (int): The specific device id.

    Returns:
        int, 0 indicates normal, 1 minor alarm, 2 major alarm, 3 critical alarm, 0xffffffff device not found.
    """
    device_id = c_int(device_id)
    health = c_uint()

    libsmicall(device_id, byref(health))

    return health.value


@lru_cache(maxsize=8)
def dsmi_get_device_ip_address(device_id):
    """
    Get device IP address.

    The result is cached per device id.

    Args:
        device_id (int): The specific device ID.

    Returns:
        dict, the device IP address:
            - ip_address (str): the IP address, in dotted decimal notation.
            - mask_address (str): the mask address, in dotted decimal notation.
    """
    is_ipv6, port_type, port_id = False, 1, 0

    class Ipaddrstruct(Structure):
        # The address is declared as raw octets. A `c_char` array field is read back as
        # bytes truncated at the first NUL byte, which silently dropped every octet after
        # a zero one (e.g. 192.168.0.10 was reported as 192.168.0.0). `c_ubyte` has the
        # same size and alignment, so the struct layout is unchanged.
        _fields_ = [('u_addr', c_ubyte * (16 if is_ipv6 else 4)), ('ip_type', c_int)]

    ip_type = 1 if is_ipv6 else 0

    device_id = c_int(device_id)
    # The address bytes start zero-initialised; the library fills them in.
    ip_address = Ipaddrstruct(ip_type=ip_type)
    mask_address = Ipaddrstruct(ip_type=ip_type)

    libsmicall(device_id, port_type, port_id, byref(ip_address), byref(mask_address))

    def dotted(u_addr):
        """Format the first four octets as a dotted decimal string."""
        return '.'.join(str(octet) for octet in u_addr[:4])

    return {
        'ip_address': dotted(ip_address.u_addr),
        'mask_address': dotted(mask_address.u_addr)
    }


def dsmi_get_hbm_info(device_id):
    """
    Get the HBM info.

    Args:
        device_id (int): The specific device id.

    Returns:
        dict, the HBM info:
            - memory_size (int): The total HBM memory, in KB.
            - freq (int): The HBM frequency, in MHZ.
            - memory_usage (int): The used HBM memory, in KB.
            - temp (int): The HBM temperature, in °C.
            - bandwith_util_rate (int): The bandwidth util rate, in %.
    """

    class HbmInfoStruct(Structure):
        _fields_ = [('memory_size', c_ulong), ('freq', c_uint), ('memory_usage', c_ulong), ('temp', c_int),
                    ('bandwith_util_rate', c_uint)]

    device_id = c_int(device_id)
    hbm_info = HbmInfoStruct()

    libsmicall(device_id, byref(hbm_info))

    return {
        'memory_size': hbm_info.memory_size,
        'freq': hbm_info.freq,
        'memory_usage': hbm_info.memory_usage,
        'temp': hbm_info.temp,
        'bandwith_util_rate': hbm_info.bandwith_util_rate
    }


def dsmi_get_device_utilization_rate(device_id, device_type):
    """
    Get device utilization rate, %.

    Note: Query AI Core when profiling turns on will return failure.

    Args:
        device_id (int): The specific device id.
        device_type (int): The device type, 1 for memory, 2 AI Core, 5 memory bandwidth, 6 HBM, 10 HBM bandwidth.

    Returns:
        int, the utilization rate.
    """
    device_id = c_int(device_id)
    device_type = c_int(device_type)
    utilization_rate = c_uint()

    libsmicall(device_id, device_type, byref(utilization_rate))

    return utilization_rate.value


def dsmi_get_device_power_info(device_id):
    """
    Get the device power.

    Args:
        device_id (int): The specific device id.

    Returns:
        dict, the device power info.
            - power (float): the device power, in Watt.
    """

    class PowerInfoStruct(Structure):
        _fields_ = [('power', c_ushort)]

    power_info = PowerInfoStruct()
    device_id = c_int(device_id)

    libsmicall(device_id, byref(power_info))
    # The raw reading is scaled by 0.1 to get Watt, rounded to two decimals.
    return {'power': round(power_info.power * 0.1, 2)}


def dsmi_get_device_temperature(device_id):
    """
    Get the device temperature.

    Args:
        device_id (int): The specific device id.

    Returns:
        int, the device temperature, in °C.
    """
    device_id = c_int(device_id)
    # NOTE(review): read as unsigned here; a negative reading would wrap around -
    # confirm the pointer type against the driver header.
    temperature = c_uint()

    libsmicall(device_id, byref(temperature))

    return temperature.value


def collect_npu():
    """
    Collect the metrics for each NPU.

    Returns:
        Optional[List[dict]], the metrics of each NPU, or None when libdrvdsmi_host.so
        is not loaded. Each dict holds `chip_name`, `device_id`, `available`, `health`,
        `ip_address`, `aicore_rate`, `hbm_info` (`memory_size` and `memory_usage`
        converted from KB to MB), `power` and `temperature`.
    """
    if not libsmi:
        return None
    kb_to_mb, memory_threshold = 1024, 4
    count = dsmi_get_device_count()
    device_ids = dsmi_list_device(count)
    npus = []
    for device_id in device_ids:
        health = dsmi_get_device_health(device_id)
        hbm_info = dsmi_get_hbm_info(device_id)
        npus.append({
            'chip_name': dsmi_get_chip_info(device_id).get('chip_name'),
            'device_id': device_id,
            # A device counts as available when it is healthy and uses less than
            # `memory_threshold` MB of HBM.
            'available': health == 0 and hbm_info.get('memory_usage', 0) // kb_to_mb < memory_threshold,
            'health': health,
            'ip_address': dsmi_get_device_ip_address(device_id).get('ip_address'),
            # Device type 2 selects the AI Core utilization rate.
            'aicore_rate': dsmi_get_device_utilization_rate(device_id, 2),
            'hbm_info': {
                'memory_size': hbm_info.get('memory_size') // kb_to_mb,
                'memory_usage': hbm_info.get('memory_usage') // kb_to_mb
            },
            'power': dsmi_get_device_power_info(device_id).get('power'),
            'temperature': dsmi_get_device_temperature(device_id)
        })
    return npus


# ---- new file: mindinsight/sysmetric/common/__init__.py ----
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# ============================================================================ diff --git a/mindinsight/sysmetric/common/log.py b/mindinsight/sysmetric/common/log.py new file mode 100644 index 0000000000000000000000000000000000000000..e4e35e28f0ea0186ddcaea62f907f15afef131e2 --- /dev/null +++ b/mindinsight/sysmetric/common/log.py @@ -0,0 +1,18 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Resource logger.""" +from mindinsight.utils.log import setup_logger + +logger = setup_logger(sub_module='sysmetric', log_name='sysmetric') diff --git a/tests/ut/sysmetric/__init__.py b/tests/ut/sysmetric/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8783813b1bb37d36d1c2320f20e8919d6a1adf02 --- /dev/null +++ b/tests/ut/sysmetric/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test the system metrics."""

# ---- new file: tests/ut/sysmetric/metrics_collector.py ----
# NOTE(review): with pytest's default discovery patterns (test_*.py / *_test.py) a file
# named metrics_collector.py is not collected - confirm the project's pytest configuration.
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test the metrics collector."""
from os import cpu_count

from mindinsight.sysmetric.collector import collect_cpu, collect_mem, collect_npu


def test_collect_cpu():
    """Check the overall CPU percentages and the per-CPU list length."""
    overall = collect_cpu(percent=True)
    assert isinstance(overall, dict)
    for value in overall.values():
        assert 0 <= value <= 100
    for key in 'user', 'system', 'idle':
        assert key in overall
    cores = collect_cpu(percpu=True)
    assert isinstance(cores, list) and len(cores) == cpu_count()


def test_collect_mem():
    """Check that the memory info has a total larger than the available amount."""
    mem = collect_mem()
    assert 'total' in mem
    assert 'available' in mem
    assert mem['total'] > mem['available']


def test_collect_npu():
    """Check the shape of the NPU metrics without assuming how many NPUs the host has."""
    npus = collect_npu()
    # `collect_npu` returns None when the driver library could not be loaded.
    if npus is None:
        return
    # The previous hard-coded `len(npu) == 8` failed on any host with a different NPU count.
    assert isinstance(npus, list)
    for npu in npus:
        assert isinstance(npu, dict)
        for key in 'chip_name', 'device_id', 'available', 'health', 'hbm_info', 'power', 'temperature':
            assert key in npu