Commit 008bdf56
Authored Jul 20, 2020 by mindspore-ci-bot; committed via Gitee on Jul 20, 2020.
!437 Use multiple processes to calc events
Merge pull request !437 from wangshuide/wsd_multiple_processes_for_file_parsing
Parents: 4211e0ec, 7877f33b
Showing 9 changed files with 148 additions and 81 deletions.
mindinsight/backend/run.py                                   +5    -6
mindinsight/conf/constants.py                                +2    -0
mindinsight/datavisual/data_transform/data_loader.py         +8    -3
mindinsight/datavisual/data_transform/data_manager.py        +4    -4
mindinsight/datavisual/data_transform/events_data.py         +1    -0
mindinsight/datavisual/data_transform/ms_data_loader.py      +101  -48
mindinsight/datavisual/data_transform/tensor_container.py    +2    -3
mindinsight/datavisual/processors/tensor_processor.py        +3    -3
mindinsight/scripts/stop.py                                  +22   -14
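Taken together, the changes replace the serial event-parsing loop with a process pool whose in-flight work is bounded by a semaphore, so memory stays flat while parsing runs on multiple cores. Before reading the per-file diffs, here is a minimal, self-contained sketch of that pattern; the function and variable names are illustrative, not taken from the diff:

```python
import concurrent.futures as futures
import threading

def parse_event(event_str):
    """Stand-in for the CPU-bound parse work done in a worker process."""
    return event_str.upper()

def load_events(event_strings, concurrency=2):
    """Parse events in worker processes, keeping at most `concurrency` tasks in flight."""
    results = []
    semaphore = threading.Semaphore(value=concurrency)

    def on_done(future):
        try:
            results.append(future.result())
        finally:
            semaphore.release()  # Free a slot so the producer loop can submit more work.

    with futures.ProcessPoolExecutor(max_workers=concurrency) as executor:
        for event_str in event_strings:
            semaphore.acquire()  # Block when `concurrency` tasks are already pending.
            executor.submit(parse_event, event_str).add_done_callback(on_done)
    return results

if __name__ == '__main__':
    print(load_events(["scalar", "image", "graph"]))
```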
mindinsight/backend/run.py

@@ -236,9 +236,10 @@ def start():
     process = subprocess.Popen(
         shlex.split(cmd),
         shell=False,
-        stdin=subprocess.PIPE,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE)
+        # Change stdout to DEVNULL to prevent broken pipe error when creating new processes.
+        stdin=subprocess.DEVNULL,
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.STDOUT)

     # sleep 1 second for gunicorn application to load modules
@@ -246,9 +247,7 @@ def start():
     # check if gunicorn application is running
     if process.poll() is not None:
-        _, stderr = process.communicate()
-        for line in stderr.decode().split('\n'):
-            console.error(line)
+        console.error("Start MindInsight failed. See log for details.")
     else:
         state_result = _check_server_start_stat(errorlog_abspath, log_size)
         # print gunicorn start state to stdout
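For context on this hunk: `subprocess.PIPE` hands the parent pipe ends that it may never drain, and per the diff's own comment the inherited pipes can raise broken-pipe errors once the daemonized child starts forking worker processes. Redirecting to `DEVNULL` sidesteps both. A hedged illustration of the two configurations (the command string is a placeholder, not the real gunicorn invocation):

```python
import shlex
import subprocess
import sys

cmd = f'{sys.executable} -c "print(123)"'  # placeholder for the real server command

# Before: the parent owns pipe ends it may never drain.
piped = subprocess.Popen(shlex.split(cmd), shell=False,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
piped.communicate()  # Must be drained, or a verbose child eventually blocks.

# After: discard child output entirely; no pipe ends survive into forked workers.
detached = subprocess.Popen(shlex.split(cmd), shell=False,
                            stdin=subprocess.DEVNULL,
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.STDOUT)  # stderr follows stdout
detached.wait()
```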
mindinsight/conf/constants.py

@@ -14,6 +14,7 @@
 # ============================================================================
 """Constants module for mindinsight settings."""
 import logging
+import os

 ####################################
 # Global default settings.
@@ -48,6 +49,7 @@ API_PREFIX = '/v1/mindinsight'
 # Datavisual default settings.
 ####################################
 MAX_THREADS_COUNT = 15
+MAX_PROCESSES_COUNT = max(os.cpu_count() or 0, 15)

 MAX_TAG_SIZE_PER_EVENTS_DATA = 300
 DEFAULT_STEP_SIZES_PER_TAG = 500
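A note on the new constant: `os.cpu_count()` is documented to return `None` when the CPU count cannot be determined, so the `or 0` fallback keeps `max()` well-defined, and the floor of 15 guarantees a usable process budget on small machines. Roughly:

```python
import os

cpu_count = os.cpu_count()              # May be None on exotic platforms.
MAX_PROCESSES_COUNT = max(cpu_count or 0, 15)

# cpu_count = None -> max(0, 15)  == 15
# cpu_count = 8    -> max(8, 15)  == 15
# cpu_count = 64   -> max(64, 15) == 64
print(MAX_PROCESSES_COUNT)
```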
mindinsight/datavisual/data_transform/data_loader.py

@@ -34,8 +34,13 @@ class DataLoader:
         self._summary_dir = summary_dir
         self._loader = None

-    def load(self):
-        """Load the data when the loader exists."""
+    def load(self, workers_count=1):
+        """Load the data when the loader exists.
+
+        Args:
+            workers_count (int): The count of workers. Default value is 1.
+        """
         if self._loader is None:
             ms_dataloader = MSDataLoader(self._summary_dir)
             loaders = [ms_dataloader]
@@ -48,7 +53,7 @@ class DataLoader:
             logger.warning("No valid files can be loaded, summary_dir: %s.", self._summary_dir)
             raise exceptions.SummaryLogPathInvalid()

-        self._loader.load()
+        self._loader.load(workers_count)

     def get_events_data(self):
         """
mindinsight/datavisual/data_transform/data_manager.py

@@ -510,7 +510,7 @@ class _DetailCacheManager(_BaseCacheManager):
             logger.debug("delete loader %s", loader_id)
             self._loader_pool.pop(loader_id)

-    def _execute_loader(self, loader_id):
+    def _execute_loader(self, loader_id, workers_count):
         """
         Load data from data_loader.
@@ -518,7 +518,7 @@ class _DetailCacheManager(_BaseCacheManager):
         Args:
             loader_id (str): An ID for `Loader`.
+            workers_count (int): The count of workers.
         """
         try:
             with self._loader_pool_mutex:
@@ -527,7 +527,7 @@ class _DetailCacheManager(_BaseCacheManager):
                 logger.debug("Loader %r has been deleted, will not load data.", loader_id)
                 return
-        loader.data_loader.load()
+        loader.data_loader.load(workers_count)

         # Update loader cache status to CACHED.
         # Loader with cache status CACHED should remain the same cache status.
@@ -584,7 +584,7 @@ class _DetailCacheManager(_BaseCacheManager):
             futures = []
             loader_pool = self._get_snapshot_loader_pool()
             for loader_id in loader_pool:
-                future = executor.submit(self._execute_loader, loader_id)
+                future = executor.submit(self._execute_loader, loader_id, threads_count)
                 futures.append(future)
             wait(futures, return_when=ALL_COMPLETED)
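Each loader still runs on its own thread; note that the surrounding method's `threads_count` is what arrives in `_execute_loader` as `workers_count`, so the per-file process budget computed later in `ms_data_loader` is divided by the number of concurrently loading threads. A reduced sketch of the fan-out, with hypothetical stand-ins for the loader pool:

```python
from concurrent.futures import ALL_COMPLETED, ThreadPoolExecutor, wait

def execute_loader(loader_id, workers_count):
    # Stand-in for _DetailCacheManager._execute_loader: each call may itself
    # spawn up to MAX_PROCESSES_COUNT / workers_count parser processes.
    print(f"loading {loader_id} with workers_count={workers_count}")

loader_pool = ["run-a", "run-b", "run-c"]   # hypothetical loader IDs
threads_count = 2

with ThreadPoolExecutor(max_workers=threads_count) as executor:
    pending = [executor.submit(execute_loader, loader_id, threads_count)
               for loader_id in loader_pool]
    wait(pending, return_when=ALL_COMPLETED)
```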
mindinsight/datavisual/data_transform/events_data.py

@@ -85,6 +85,7 @@ class EventsData:
             deleted_tag = self._check_tag_out_of_spec(plugin_name)
             if deleted_tag is not None:
                 if tag in self._deleted_tags:
+                    logger.debug("Tag is in deleted tags: %s.", tag)
                     return
                 self.delete_tensor_event(deleted_tag)
mindinsight/datavisual/data_transform/ms_data_loader.py

@@ -19,12 +19,17 @@ This module is used to load the MindSpore training log file.
 Each instance will read an entire run, and a run can contain one or
 more log files.
 """
+import concurrent.futures as futures
+import math
+import os
 import re
 import struct
+import threading

 from google.protobuf.message import DecodeError
 from google.protobuf.text_format import ParseError

+from mindinsight.conf import settings
 from mindinsight.datavisual.common import exceptions
 from mindinsight.datavisual.common.enums import PluginNameEnum
 from mindinsight.datavisual.common.log import logger
@@ -32,13 +37,13 @@ from mindinsight.datavisual.data_access.file_handler import FileHandler
 from mindinsight.datavisual.data_transform.events_data import EventsData
 from mindinsight.datavisual.data_transform.events_data import TensorEvent
 from mindinsight.datavisual.data_transform.graph import MSGraph
-from mindinsight.datavisual.proto_files import mindinsight_summary_pb2 as summary_pb2
-from mindinsight.datavisual.proto_files import mindinsight_anf_ir_pb2 as anf_ir_pb2
-from mindinsight.datavisual.utils import crc32
-from mindinsight.utils.exceptions import UnknownError
 from mindinsight.datavisual.data_transform.histogram import Histogram
 from mindinsight.datavisual.data_transform.histogram_container import HistogramContainer
 from mindinsight.datavisual.data_transform.tensor_container import TensorContainer
+from mindinsight.datavisual.proto_files import mindinsight_anf_ir_pb2 as anf_ir_pb2
+from mindinsight.datavisual.proto_files import mindinsight_summary_pb2 as summary_pb2
+from mindinsight.datavisual.utils import crc32
+from mindinsight.utils.exceptions import UnknownError

 HEADER_SIZE = 8
 CRC_STR_SIZE = 4
@@ -79,11 +84,14 @@ class MSDataLoader:
                            "we will reload all files in path %s.", self._summary_dir)
             self.__init__(self._summary_dir)

-    def load(self):
+    def load(self, workers_count=1):
         """
         Load all valid log files.

         When the file is reloaded, it will continue to load from where it left off.
+
+        Args:
+            workers_count (int): The count of workers. Default value is 1.
         """
         logger.debug("Start to load data in ms data loader.")
         filenames = self.filter_valid_files()
@@ -95,7 +103,7 @@ class MSDataLoader:
         self._check_files_deleted(filenames, old_filenames)

         for parser in self._parser_list:
-            parser.parse_files(filenames, events_data=self._events_data)
+            parser.parse_files(workers_count, filenames, events_data=self._events_data)

     def filter_valid_files(self):
         """
@@ -125,11 +133,12 @@ class _Parser:
         self._latest_mtime = 0
         self._summary_dir = summary_dir

-    def parse_files(self, filenames, events_data):
+    def parse_files(self, workers_count, filenames, events_data):
         """
         Load files and parse files content.

         Args:
+            workers_count (int): The count of workers.
             filenames (list[str]): File name list.
             events_data (EventsData): The container of event data.
         """
@@ -177,7 +186,7 @@
 class _PbParser(_Parser):
     """This class is used to parse pb file."""

-    def parse_files(self, filenames, events_data):
+    def parse_files(self, workers_count, filenames, events_data):
         pb_filenames = self.filter_files(filenames)
         pb_filenames = self.sort_files(pb_filenames)
         for filename in pb_filenames:
@@ -255,11 +264,12 @@ class _SummaryParser(_Parser):
         self._summary_file_handler = None
         self._events_data = None

-    def parse_files(self, filenames, events_data):
+    def parse_files(self, workers_count, filenames, events_data):
         """
         Load summary file and parse file content.

         Args:
+            workers_count (int): The count of workers.
             filenames (list[str]): File name list.
             events_data (EventsData): The container of event data.
         """
@@ -285,7 +295,7 @@ class _SummaryParser(_Parser):
                 self._latest_file_size = new_size
                 try:
-                    self._load_single_file(self._summary_file_handler)
+                    self._load_single_file(self._summary_file_handler, workers_count)
                 except UnknownError as ex:
                     logger.warning("Parse summary file failed, detail: %r,"
                                    "file path: %s.", str(ex), file_path)
@@ -304,36 +314,75 @@ class _SummaryParser(_Parser):
                 lambda filename:
                 (re.search(r'summary\.\d+', filename)
                  and not filename.endswith("_lineage")), filenames))

-    def _load_single_file(self, file_handler):
+    def _load_single_file(self, file_handler, workers_count):
         """
         Load a log file data.

         Args:
             file_handler (FileHandler): A file handler.
+            workers_count (int): The count of workers.
         """
-        logger.debug("Load single summary file, file path: %s.", file_handler.file_path)
-        while True:
-            start_offset = file_handler.offset
-            try:
-                event_str = self._event_load(file_handler)
-                if event_str is None:
-                    file_handler.reset_offset(start_offset)
-                    break
-                event = summary_pb2.Event.FromString(event_str)
-                self._event_parse(event)
-            except exceptions.CRCFailedError:
-                file_handler.reset_offset(start_offset)
-                logger.warning("Check crc failed and ignore this file, file_path=%s, "
-                               "offset=%s.", file_handler.file_path, file_handler.offset)
-                break
-            except (OSError, DecodeError, exceptions.MindInsightException) as ex:
-                logger.warning("Parse log file fail, and ignore this file, detail: %r,"
-                               "file path: %s.", str(ex), file_handler.file_path)
-                break
-            except Exception as ex:
-                logger.exception(ex)
-                raise UnknownError(str(ex))
+        default_concurrency = 1
+        cpu_count = os.cpu_count()
+        if cpu_count is None:
+            concurrency = default_concurrency
+        else:
+            concurrency = min(math.floor(cpu_count / workers_count),
+                              math.floor(settings.MAX_PROCESSES_COUNT / workers_count))
+        if concurrency <= 0:
+            concurrency = default_concurrency
+
+        logger.debug("Load single summary file, file path: %s, concurrency: %s.",
+                     file_handler.file_path, concurrency)
+        semaphore = threading.Semaphore(value=concurrency)
+        with futures.ProcessPoolExecutor(max_workers=concurrency) as executor:
+            while True:
+                start_offset = file_handler.offset
+                try:
+                    event_str = self._event_load(file_handler)
+                    if event_str is None:
+                        file_handler.reset_offset(start_offset)
+                        break
+
+                    # Make sure we have at most `concurrency` tasks not finished to save memory.
+                    semaphore.acquire()
+                    future = executor.submit(self._event_parse, event_str, self._latest_filename)
+
+                    def _add_tensor_event_callback(future_value):
+                        try:
+                            tensor_values = future_value.result()
+                            for tensor_value in tensor_values:
+                                if tensor_value.plugin_name == PluginNameEnum.GRAPH.value:
+                                    try:
+                                        graph_tags = self._events_data.list_tags_by_plugin(PluginNameEnum.GRAPH.value)
+                                    except KeyError:
+                                        graph_tags = []
+                                    summary_tags = self.filter_files(graph_tags)
+                                    for tag in summary_tags:
+                                        self._events_data.delete_tensor_event(tag)
+                                self._events_data.add_tensor_event(tensor_value)
+                        except Exception as exc:
+                            # Log exception for debugging.
+                            logger.exception(exc)
+                            raise
+                        finally:
+                            semaphore.release()
+
+                    future.add_done_callback(_add_tensor_event_callback)
+                except exceptions.CRCFailedError:
+                    file_handler.reset_offset(start_offset)
+                    logger.warning("Check crc failed and ignore this file, file_path=%s, "
+                                   "offset=%s.", file_handler.file_path, file_handler.offset)
+                    break
+                except (OSError, DecodeError, exceptions.MindInsightException) as ex:
+                    logger.warning("Parse log file fail, and ignore this file, detail: %r,"
+                                   "file path: %s.", str(ex), file_handler.file_path)
+                    break
+                except Exception as ex:
+                    logger.exception(ex)
+                    raise UnknownError(str(ex))

     def _event_load(self, file_handler):
         """
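The concurrency cap in the hunk above divides the machine between loader threads: each of `workers_count` threads may spawn at most `min(floor(cpu_count / workers_count), floor(MAX_PROCESSES_COUNT / workers_count))` processes, never fewer than one. A worked example mirroring that arithmetic (function name is mine, not the diff's):

```python
import math

def parse_concurrency(cpu_count, max_processes_count, workers_count):
    """Mirror of the cap computed in _load_single_file."""
    default_concurrency = 1
    if cpu_count is None:
        return default_concurrency
    concurrency = min(math.floor(cpu_count / workers_count),
                      math.floor(max_processes_count / workers_count))
    return concurrency if concurrency > 0 else default_concurrency

# 16 CPUs, cap 16, 3 loader threads -> floor(16/3) = 5 processes per thread.
print(parse_concurrency(16, 16, 3))    # 5
# 2 CPUs, 4 threads -> floor(2/4) = 0, so fall back to 1.
print(parse_concurrency(2, 16, 4))     # 1
# Undeterminable CPU count -> 1.
print(parse_concurrency(None, 16, 2))  # 1
```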
@@ -381,20 +430,29 @@ class _SummaryParser(_Parser):
             return event_str

-    def _event_parse(self, event):
+    @staticmethod
+    def _event_parse(event_str, latest_file_name):
         """
         Transform `Event` data to tensor_event and update it to EventsData.

+        This method is static to avoid sending unnecessary objects to other processes.
+
         Args:
-            event (Event): Message event in summary proto, data read from file handler.
+            event_str (str): Message event string in summary proto, data read from file handler.
+            latest_file_name (str): Latest file name.
         """
         plugins = {
             'scalar_value': PluginNameEnum.SCALAR,
             'image': PluginNameEnum.IMAGE,
             'histogram': PluginNameEnum.HISTOGRAM,
             'tensor': PluginNameEnum.TENSOR
         }
+        logger.debug("Start to parse event string. Event string len: %s.", len(event_str))
+        event = summary_pb2.Event.FromString(event_str)
+        logger.debug("Deserialize event string completed.")
+
+        ret_tensor_events = []
         if event.HasField('summary'):
             for value in event.summary.value:
                 for plugin in plugins:
@@ -402,6 +460,7 @@ class _SummaryParser(_Parser):
                         continue

                     plugin_name_enum = plugins[plugin]
                     tensor_event_value = getattr(value, plugin)
+                    logger.debug("Processing plugin value: %s.", plugin_name_enum)

                     if plugin == 'histogram':
                         tensor_event_value = HistogramContainer(tensor_event_value)
@@ -419,29 +478,23 @@ class _SummaryParser(_Parser):
                     tensor_event = TensorEvent(wall_time=event.wall_time,
                                                step=event.step,
                                                tag='{}/{}'.format(value.tag, plugin_name_enum.value),
                                                plugin_name=plugin_name_enum.value,
                                                value=tensor_event_value,
-                                               filename=self._latest_filename)
-                    self._events_data.add_tensor_event(tensor_event)
+                                               filename=latest_file_name)
+                    logger.debug("Tensor event generated, plugin is %s, tag is %s, step is %s.",
+                                 plugin_name_enum, value.tag, event.step)
+                    ret_tensor_events.append(tensor_event)

         elif event.HasField('graph_def'):
             graph = MSGraph()
             graph.build_graph(event.graph_def)
             tensor_event = TensorEvent(wall_time=event.wall_time,
                                        step=event.step,
-                                       tag=self._latest_filename,
+                                       tag=latest_file_name,
                                        plugin_name=PluginNameEnum.GRAPH.value,
                                        value=graph,
-                                       filename=self._latest_filename)
-
-            try:
-                graph_tags = self._events_data.list_tags_by_plugin(PluginNameEnum.GRAPH.value)
-            except KeyError:
-                graph_tags = []
-            summary_tags = self.filter_files(graph_tags)
-            for tag in summary_tags:
-                self._events_data.delete_tensor_event(tag)
-
-            self._events_data.add_tensor_event(tensor_event)
+                                       filename=latest_file_name)
+            ret_tensor_events.append(tensor_event)
+
+        return ret_tensor_events

     @staticmethod
     def _compare_summary_file(current_file, dst_file):
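Turning `_event_parse` into a `@staticmethod` that takes raw bytes matters for `ProcessPoolExecutor`: submitting a bound method would pickle `self` along with it (file handlers, the events cache, locks), much of which cannot cross a process boundary. A small demonstration of the difference, using a deliberately unpicklable attribute; the class here is illustrative, not the project's:

```python
import pickle
import threading

class Parser:
    def __init__(self):
        self._lock = threading.Lock()   # Locks cannot be pickled.

    def parse_bound(self, event_str):
        return len(event_str)

    @staticmethod
    def parse_static(event_str):
        return len(event_str)

parser = Parser()
pickle.dumps(Parser.parse_static)       # Fine: pickled as a module-level reference.
try:
    pickle.dumps(parser.parse_bound)    # Drags `self` along, and with it the lock.
except TypeError as err:
    print("bound method not picklable:", err)
```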
mindinsight/datavisual/data_transform/tensor_container.py

@@ -199,8 +199,8 @@ class TensorContainer:
     def __init__(self, tensor_message):
         self._lock = threading.Lock
-        self._msg = tensor_message
-        self._dims = tensor_message.dims
+        # Original dims can not be pickled to transfer to other process, so tuple is used.
+        self._dims = tuple(tensor_message.dims)
         self._data_type = tensor_message.data_type
         self._np_array = None
         self._data = _get_data_from_tensor(tensor_message)
@@ -265,5 +265,4 @@ class TensorContainer:
             logger.error("Reshape array fail, detail: %r", str(ex))
             return

-        self._msg = None
         self._np_array = ndarray
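The `tuple(...)` conversion exists because a protobuf repeated-field container keeps a back-reference to its parent message and does not survive pickling to another process, while a plain tuple does; it is also why the `tensor_processor.py` hunks below can drop their own `tuple(value.dims)` calls, since `dims` now arrives as a tuple. A hedged sketch, using a stand-in class instead of a real proto message so it stays self-contained:

```python
import pickle

class FakeRepeatedField:
    """Stand-in for a protobuf repeated field: tied to its parent message."""
    def __init__(self, values, parent):
        self._values = list(values)
        self._parent = parent          # Back-reference that drags the whole message along.

    def __iter__(self):
        return iter(self._values)

    def __reduce__(self):
        raise TypeError("can't pickle repeated field directly")

class FakeTensorMessage:
    def __init__(self, dims):
        self.dims = FakeRepeatedField(dims, parent=self)

msg = FakeTensorMessage([2, 3, 4])
try:
    pickle.dumps(msg.dims)               # Mirrors pickling the pre-change _dims.
except TypeError as err:
    print("repeated field:", err)

dims = tuple(msg.dims)                   # The fix: snapshot into a plain tuple.
print(pickle.loads(pickle.dumps(dims)))  # (2, 3, 4)
```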
mindinsight/datavisual/processors/tensor_processor.py

@@ -245,7 +245,7 @@ class TensorProcessor(BaseProcessor):
             # This value is an instance of TensorContainer
             value = tensor.value
             value_dict = {
-                "dims": tuple(value.dims),
+                "dims": value.dims,
                 "data_type": anf_ir_pb2.DataType.Name(value.data_type)
             }
             if detail and detail == 'stats':
@@ -313,7 +313,7 @@ class TensorProcessor(BaseProcessor):
                 "wall_time": tensor.wall_time,
                 "step": tensor.step,
                 "value": {
-                    "dims": tuple(value.dims),
+                    "dims": value.dims,
                     "data_type": anf_ir_pb2.DataType.Name(value.data_type),
                     "data": res_data.tolist(),
                     "statistics": get_statistics_dict(value, flatten_data)
@@ -362,7 +362,7 @@ class TensorProcessor(BaseProcessor):
                 "wall_time": tensor.wall_time,
                 "step": tensor.step,
                 "value": {
-                    "dims": tuple(value.dims),
+                    "dims": value.dims,
                     "data_type": anf_ir_pb2.DataType.Name(value.data_type),
                     "histogram_buckets": buckets,
                     "statistics": get_statistics_dict(value, None)
mindinsight/scripts/stop.py

@@ -103,21 +103,17 @@ class Command(BaseCommand):
         self.logfile.info('Stop mindinsight with port %s and pid %s.', port, pid)

         process = psutil.Process(pid)
-        child_pids = [child.pid for child in process.children()]
-
-        # kill gunicorn master process
-        try:
-            os.kill(pid, signal.SIGKILL)
-        except PermissionError:
-            self.console.info('kill pid %s failed due to permission error', pid)
-            sys.exit(1)
-
-        # cleanup gunicorn worker processes
-        for child_pid in child_pids:
-            try:
-                os.kill(child_pid, signal.SIGKILL)
-            except ProcessLookupError:
-                pass
+        processes_to_kill = [process]
+        # Set recursive to True to kill grandchildren processes.
+        for child in process.children(recursive=True):
+            processes_to_kill.append(child)
+
+        for proc in processes_to_kill:
+            self.logfile.info('Stopping mindinsight process %s.', proc.pid)
+            try:
+                proc.send_signal(signal.SIGKILL)
+            except psutil.Error as ex:
+                self.logfile.warning("Stop process %s failed. Detail: %s.", proc.pid, str(ex))

         for hook in HookUtils.instance().hooks():
             hook.on_shutdown(self.logfile)
@@ -154,7 +150,19 @@ class Command(BaseCommand):
                 if user != process.username():
                     continue
-                pid = process.pid if process.ppid() == 1 else process.ppid()
+                gunicorn_master_process = process
+                # The gunicorn master process might have grandchildren (e.g. forked by process pool).
+                while True:
+                    parent_process = gunicorn_master_process.parent()
+                    if parent_process is None or parent_process.pid == 1:
+                        break
+                    parent_cmd = parent_process.cmdline()
+                    if ' '.join(parent_cmd).find(self.cmd_regex) == -1:
+                        break
+                    gunicorn_master_process = parent_process
+
+                pid = gunicorn_master_process.pid
                 for open_file in process.open_files():
                     if open_file.path.endswith(self.access_log_path):
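The rewritten stop path leans on two psutil idioms: `children(recursive=True)` to collect grandchildren (the newly introduced parser processes), and walking `parent()` upward to locate the real gunicorn master. A condensed, POSIX-only sketch of the kill side, with a helper name of my own choosing:

```python
import signal

import psutil

def kill_tree(pid):
    """Send SIGKILL to a process and all of its descendants, tolerating races."""
    try:
        root = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return
    processes_to_kill = [root] + root.children(recursive=True)
    for proc in processes_to_kill:
        try:
            proc.send_signal(signal.SIGKILL)
        except psutil.Error:
            pass  # Already gone or not permitted; nothing more to do.
```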