Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MindSpore
mindinsight
提交
3da4d71d
M
mindinsight
项目概览
MindSpore
/
mindinsight
通知
7
Star
3
Fork
2
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindinsight
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
3da4d71d
编写于
7月 02, 2020
作者:
L
Li Hongzhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add the resource monitor api
- collect_cpu - collect_mem - collect_npu
上级
f674ae3e
变更
11
隐藏空白更改
内联
并排
Showing
11 changed files
with
525 additions
and
0 deletions
+525
-0
mindinsight/backend/application.py
mindinsight/backend/application.py
+1
-0
mindinsight/backend/datavisual/__init__.py
mindinsight/backend/datavisual/__init__.py
+2
-0
mindinsight/backend/datavisual/sysmetric_api.py
mindinsight/backend/datavisual/sysmetric_api.py
+39
-0
mindinsight/sysmetric/collector/__init__.py
mindinsight/sysmetric/collector/__init__.py
+42
-0
mindinsight/sysmetric/collector/_collect_cpu.py
mindinsight/sysmetric/collector/_collect_cpu.py
+37
-0
mindinsight/sysmetric/collector/_collect_mem.py
mindinsight/sysmetric/collector/_collect_mem.py
+34
-0
mindinsight/sysmetric/collector/_collect_npu.py
mindinsight/sysmetric/collector/_collect_npu.py
+281
-0
mindinsight/sysmetric/common/__init__.py
mindinsight/sysmetric/common/__init__.py
+14
-0
mindinsight/sysmetric/common/log.py
mindinsight/sysmetric/common/log.py
+18
-0
tests/ut/sysmetric/__init__.py
tests/ut/sysmetric/__init__.py
+15
-0
tests/ut/sysmetric/metrics_collector.py
tests/ut/sysmetric/metrics_collector.py
+42
-0
未找到文件。
mindinsight/backend/application.py
浏览文件 @
3da4d71d
...
...
@@ -111,6 +111,7 @@ def create_app():
static_folder_path
=
os
.
path
.
realpath
(
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
os
.
pardir
,
'ui'
,
'dist'
,
'static'
))
app
=
Flask
(
__name__
,
static_url_path
=
static_url_path
,
static_folder
=
static_folder_path
)
app
.
config
[
'JSON_SORT_KEYS'
]
=
False
if
settings
.
ENABLE_CORS
:
CORS
(
app
,
supports_credentials
=
True
)
...
...
mindinsight/backend/datavisual/__init__.py
浏览文件 @
3da4d71d
...
...
@@ -17,6 +17,7 @@
from
mindinsight.backend.datavisual.static_resource_api
import
init_module
as
static_init_module
from
mindinsight.backend.datavisual.task_manager_api
import
init_module
as
task_init_module
from
mindinsight.backend.datavisual.train_visual_api
import
init_module
as
train_init_module
from
mindinsight.backend.datavisual.sysmetric_api
import
init_module
as
sysmetric_init_module
def
init_module
(
app
):
...
...
@@ -30,3 +31,4 @@ def init_module(app):
static_init_module
(
app
)
task_init_module
(
app
)
train_init_module
(
app
)
sysmetric_init_module
(
app
)
mindinsight/backend/datavisual/sysmetric_api.py
0 → 100644
浏览文件 @
3da4d71d
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""System metrics API."""
from
flask
import
Blueprint
,
jsonify
from
mindinsight.conf
import
settings
from
mindinsight.sysmetric.collector
import
get_metrics
# Blueprint holding the system-metric routes. Its URL prefix is the configured
# URL path prefix followed by the configured API prefix.
BLUEPRINT = Blueprint("sysmetric", __name__, url_prefix=settings.URL_PATH_PREFIX + settings.API_PREFIX)
@BLUEPRINT.route("/sysmetric/current", methods=["GET"])
def query_sysmetric():
    """
    Serve the current system metrics.

    Returns:
        Response, the result of `get_metrics()` serialized as JSON.
    """
    metrics = get_metrics()
    response = jsonify(metrics)
    return response
def init_module(app):
    """
    Register the sysmetric blueprint on the application.

    Args:
        app: The application object; it must provide `register_blueprint`.
    """
    register = app.register_blueprint
    register(BLUEPRINT)
mindinsight/sysmetric/collector/__init__.py
0 → 100644
浏览文件 @
3da4d71d
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The metrics collector."""
from
._collect_cpu
import
collect_cpu
from
._collect_mem
import
collect_mem
from
._collect_npu
import
collect_npu
# Collector functions exported by this package.
__all__ = [
    'collect_cpu',
    'collect_mem',
    'collect_npu',
]
def get_metrics():
    """
    Gather one snapshot of the NPU, CPU and memory metrics.

    Returns:
        dict, with the keys (in this order):
            - npu: the result of `collect_npu()`.
            - cpu (dict): `overall` and `percpu` CPU times, both collected with `percent=True`.
            - memory (dict): under `virtual`, the `available` and `used` entries of `collect_mem()`.
    """
    # The collectors are invoked in the same order as the keys appear in the result.
    virtual_memory = collect_mem()
    npu_metrics = collect_npu()
    cpu_overall = collect_cpu(percent=True)
    cpu_per_core = collect_cpu(percpu=True, percent=True)

    metrics = {}
    metrics['npu'] = npu_metrics
    metrics['cpu'] = {'overall': cpu_overall, 'percpu': cpu_per_core}
    metrics['memory'] = {
        'virtual': {key: virtual_memory.get(key) for key in ('available', 'used')}
    }
    return metrics
mindinsight/sysmetric/collector/_collect_cpu.py
0 → 100644
浏览文件 @
3da4d71d
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The cpu collector."""
import
psutil
def collect_cpu(percpu=False, percent=False):
    """
    Collect the CPU times.

    Args:
        percpu (bool): Return one entry per logical CPU instead of a single system-wide entry.
        percent (bool): Use `psutil.cpu_times_percent` instead of `psutil.cpu_times`,
            i.e. report the times as percentages.

    Returns:
        Union[dict, List[dict]], a dict of CPU times, or a list of such dicts when `percpu` is set.
    """
    sampler = psutil.cpu_times_percent if percent else psutil.cpu_times
    sample = sampler(percpu=percpu)
    if percpu:
        # One named tuple per CPU; convert each into a plain dict.
        return [dict(core._asdict()) for core in sample]
    return dict(sample._asdict())
mindinsight/sysmetric/collector/_collect_mem.py
0 → 100644
浏览文件 @
3da4d71d
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The memory collector."""
import
psutil
from
psutil._common
import
bytes2human
def collect_mem(readable=False):
    """
    Collect the virtual memory info reported by `psutil.virtual_memory()`.

    Args:
        readable (bool): Format the sizes with `bytes2human` (like 1K, 234M, 2G etc.).
            The `percent` entry is always left as a number.

    Returns:
        dict, the virtual memory info.
    """
    stats = dict(psutil.virtual_memory()._asdict())
    if not readable:
        return stats

    humanized = {}
    for field, amount in stats.items():
        # 'percent' is a ratio rather than a byte count, so it is not converted.
        humanized[field] = amount if field == 'percent' else bytes2human(amount)
    return humanized
mindinsight/sysmetric/collector/_collect_npu.py
0 → 100644
浏览文件 @
3da4d71d
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The npu collector."""
import
inspect
from
ctypes
import
CDLL
,
Structure
,
byref
,
c_char
,
c_int
,
c_uint
,
c_ulong
,
c_ushort
from
functools
import
lru_cache
from
mindinsight.sysmetric.common.log
import
logger
# Load the DSMI driver library once, when this module is imported. If CDLL
# raises OSError the failure is logged and `libsmi` is left as None.
try:
    libsmi = CDLL('libdrvdsmi_host.so')
except OSError:
    logger.info('Failed to load libdrvdsmi_host.so.')
    libsmi = None
def libsmicall(*args, **kwargs):
    """
    Call the DSMI library function named after the calling Python function.

    The library symbol is looked up by the name of the function that directly
    calls `libsmicall`, so it must be called from a function whose name matches
    the library symbol.

    Args:
        *args: Positional arguments forwarded to the library function.
        **kwargs: Keyword arguments forwarded to the library function.

    Returns:
        The value returned by the library function.

    Raises:
        ValueError: If libdrvdsmi_host.so is not loaded.
    """
    if not libsmi:
        logger.error('Trying to call the libdrvdsmi_host which is not loaded.')
        raise ValueError('Trying to call the libdrvdsmi_host which is not loaded.')

    # Only the direct caller's name is needed, so inspect a single frame rather
    # than building the whole stack with inspect.stack() on every call.
    frame = inspect.currentframe()
    try:
        if frame is not None and frame.f_back is not None:
            fname = frame.f_back.f_code.co_name
        else:
            # currentframe() may return None on interpreters without frame support.
            fname = inspect.stack()[1].function
    finally:
        # Drop the frame reference promptly so no reference cycle is kept alive.
        del frame
    return getattr(libsmi, fname)(*args, **kwargs)
@lru_cache(maxsize=4)
def dsmi_get_device_count():
    """
    Query how many devices the DSMI library reports.

    Returns:
        int, the device count.
    """
    total = c_int()
    out_param = byref(total)
    # libsmicall resolves the library symbol from the name of its direct caller,
    # so it has to be invoked from this function body.
    libsmicall(out_param)
    return total.value
@lru_cache(maxsize=4)
def dsmi_list_device(count):
    """
    List the device IDs.

    Args:
        count (int): The device count, i.e. the number of IDs to request.

    Returns:
        List[int], the device IDs.
    """
    # A C int array with `count` slots that the library call fills in.
    id_buffer = (c_int * count)()
    libsmicall(id_buffer, c_int(count))
    return list(id_buffer)
@lru_cache(maxsize=8)
def dsmi_get_chip_info(device_id):
    """
    Get chip info.

    Args:
        device_id (int): The specific device id.

    Returns:
        dict, the chip info, each value decoded as UTF-8:

        - chip_type (str): The chip type.
        - chip_name (str): The chip name.
        - chip_ver (str): The `chip_ver` field (presumably the chip version).
    """
    field_names = ('chip_type', 'chip_name', 'chip_ver')

    class ChipInfoStruct(Structure):
        # Three consecutive 32-byte character buffers.
        _fields_ = [(name, c_char * 32) for name in field_names]

    info = ChipInfoStruct()
    libsmicall(c_int(device_id), byref(info))
    return {name: getattr(info, name).decode('utf-8') for name in field_names}
def dsmi_get_device_health(device_id):
    """
    Get device health.

    Args:
        device_id (int): The specific device id.

    Returns:
        int, 0 indicates normal, 1 minor alarm, 2 major alarm, 3 critical alarm, 0xffffffff device not found.
    """
    status = c_uint()
    libsmicall(c_int(device_id), byref(status))
    return status.value
@lru_cache(maxsize=8)
def dsmi_get_device_ip_address(device_id):
    """
    Get device IP address.

    Args:
        device_id (int): The specific device ID.

    Returns:
        dict, the device IP address:

        - ip_address (str): the IP address, as four dot-separated decimal octets.
        - mask_address (str): the mask address, as four dot-separated decimal octets.
    """
    is_ipv6, port_type, port_id = False, 1, 0

    class Ipaddrstruct(Structure):
        _fields_ = [('u_addr', c_char * (16 if is_ipv6 else 4)), ('ip_type', c_int)]

    ip_type = c_int(1 if is_ipv6 else 0)
    device_id = c_int(device_id)
    ip_address = Ipaddrstruct(b'', ip_type)
    mask_address = Ipaddrstruct(b'', ip_type)
    libsmicall(device_id, port_type, port_id, byref(ip_address), byref(mask_address))

    def dotted_quad(addr):
        # Reading `addr.u_addr` directly yields bytes cut at the first NUL, which
        # loses every octet after a zero one (e.g. 192.168.0.10 -> 192.168.0.0).
        # Read the raw struct memory instead so all four octets are kept.
        start = Ipaddrstruct.u_addr.offset
        octets = bytes(addr)[start:start + 4]
        return '.'.join(str(octet) for octet in octets)

    return {
        'ip_address': dotted_quad(ip_address),
        'mask_address': dotted_quad(mask_address)
    }
def dsmi_get_hbm_info(device_id):
    """
    Get the HBM info.

    Args:
        device_id (int): The specific device id.

    Returns:
        dict, the HBM info:

        - memory_size (int): The total HBM memory, in KB.
        - freq (int): The HBM frequency, in MHZ.
        - memory_usage (int): The used HBM memory, in KB.
        - temp (int): The HBM temperature, in °C.
        - bandwith_util_rate (int): The bandwidth util rate, in %.
    """
    class HbmInfoStruct(Structure):
        _fields_ = [
            ('memory_size', c_ulong),
            ('freq', c_uint),
            ('memory_usage', c_ulong),
            ('temp', c_int),
            ('bandwith_util_rate', c_uint),
        ]

    info = HbmInfoStruct()
    libsmicall(c_int(device_id), byref(info))
    # Copy every struct field into the result, in declaration order.
    return {name: getattr(info, name) for name, _ in HbmInfoStruct._fields_}
def dsmi_get_device_utilization_rate(device_id, device_type):
    """
    Get device utilization rate, %.

    Note: Query AI Core when profiling turns on will return failure.

    Args:
        device_id (int): The specific device id.
        device_type (int): The device type, 1 for memory, 2 AI Core, 5 memory bandwidth, 6 HBM, 10 HBM bandwidth.

    Returns:
        int, the utilization rate.
    """
    rate = c_uint()
    libsmicall(c_int(device_id), c_int(device_type), byref(rate))
    return rate.value
def dsmi_get_device_power_info(device_id):
    """
    Get the device power.

    Args:
        device_id (int): The specific device id.

    Returns:
        dict, the device power info.

        - power (float): the device power, in Watt; the raw reading multiplied by 0.1
          and rounded to two decimals.
    """
    class PowerInfoStruct(Structure):
        _fields_ = [('power', c_ushort)]

    reading = PowerInfoStruct()
    libsmicall(c_int(device_id), byref(reading))
    watts = round(reading.power * 0.1, 2)
    return {'power': watts}
def dsmi_get_device_temperature(device_id):
    """
    Get the device temperature.

    Args:
        device_id (int): The specific device id.

    Returns:
        int, the device temperature, in °C.
    """
    reading = c_uint()
    libsmicall(c_int(device_id), byref(reading))
    return reading.value
def collect_npu():
    """
    Collect the metrics of every NPU.

    Returns:
        Union[List[dict], None], one dict per device, or None when the DSMI library is not loaded.
        Each dict holds `chip_name`, `device_id`, `available`, `health`, `ip_address`,
        `aicore_rate`, `hbm_info` (`memory_size` and `memory_usage`, each floor-divided by 1024),
        `power` and `temperature`.
    """
    if not libsmi:
        return None

    kb_per_mb = 1024
    busy_threshold = 4

    def snapshot(device_id):
        # Query order: health, HBM info, chip info, IP address, AI Core rate, power, temperature.
        health = dsmi_get_device_health(device_id)
        hbm = dsmi_get_hbm_info(device_id)
        chip_name = dsmi_get_chip_info(device_id).get('chip_name')
        # Available means: healthy and the scaled HBM usage is below the threshold.
        is_available = health == 0 and hbm.get('memory_usage', 0) // kb_per_mb < busy_threshold
        ip_address = dsmi_get_device_ip_address(device_id).get('ip_address')
        # Device type 2 selects the AI Core utilization.
        aicore_rate = dsmi_get_device_utilization_rate(device_id, 2)
        hbm_summary = {key: hbm.get(key) // kb_per_mb for key in ('memory_size', 'memory_usage')}
        power = dsmi_get_device_power_info(device_id).get('power')
        temperature = dsmi_get_device_temperature(device_id)
        return {
            'chip_name': chip_name,
            'device_id': device_id,
            'available': is_available,
            'health': health,
            'ip_address': ip_address,
            'aicore_rate': aicore_rate,
            'hbm_info': hbm_summary,
            'power': power,
            'temperature': temperature,
        }

    device_ids = dsmi_list_device(dsmi_get_device_count())
    return [snapshot(device_id) for device_id in device_ids]
mindinsight/sysmetric/common/__init__.py
0 → 100644
浏览文件 @
3da4d71d
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
mindinsight/sysmetric/common/log.py
0 → 100644
浏览文件 @
3da4d71d
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Resource logger."""
from
mindinsight.utils.log
import
setup_logger
# Logger for the sysmetric package, created via setup_logger with both the
# sub-module and the log name set to 'sysmetric'.
logger = setup_logger(sub_module='sysmetric', log_name='sysmetric')
tests/ut/sysmetric/__init__.py
0 → 100644
浏览文件 @
3da4d71d
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test the system metrics."""
tests/ut/sysmetric/metrics_collector.py
0 → 100644
浏览文件 @
3da4d71d
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test the metrics collector."""
from
os
import
cpu_count
from
mindinsight.sysmetric.collector
import
collect_cpu
,
collect_mem
,
collect_npu
def test_collect_cpu():
    """Check the shape of the overall and the per-CPU results of `collect_cpu`."""
    overall = collect_cpu(percent=True)
    assert isinstance(overall, dict)
    # Every percentage must lie within [0, 100].
    assert all(0 <= value <= 100 for value in overall.values())
    assert {'user', 'system', 'idle'} <= set(overall)

    per_core = collect_cpu(percpu=True)
    assert isinstance(per_core, list)
    assert len(per_core) == cpu_count()
def test_collect_mem():
    """Check that `collect_mem` reports `total` and `available`, with total being larger."""
    stats = collect_mem()
    for field in ('total', 'available'):
        assert field in stats
    total, available = stats['total'], stats['available']
    assert total > available
def test_collect_npu():
    """Check the number of NPUs reported by `collect_npu`, when it returns a result at all."""
    devices = collect_npu()
    if devices is None:
        # collect_npu returns None when the DSMI library is not loaded; nothing to check.
        return
    # NOTE(review): the expected count of 8 is hard-coded - confirm it matches the test hosts.
    assert len(devices) == 8
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录