MindSpore / mindinsight — commit cce05461

Authored on Jun 04, 2020 by yelihua
Parent: e569b96b

implementation of step trace profiler

Showing 12 changed files with 863 additions and 27 deletions (+863, -27)
Changed files:

mindinsight/backend/profiler/profile_api.py               +63   -5
mindinsight/profiler/analyser/__init__.py                  +3   -0
mindinsight/profiler/analyser/analyser_factory.py          +4   -3
mindinsight/profiler/analyser/step_trace_analyser.py     +266   -0
mindinsight/profiler/common/_utils.py                     +12  -10
mindinsight/profiler/common/exceptions/error_code.py       +7   -0
mindinsight/profiler/common/exceptions/exceptions.py      +22   -0
mindinsight/profiler/common/util.py                       +98   -0
mindinsight/profiler/common/validator/validate.py         +48   -1
mindinsight/profiler/common/validator/validate_path.py    +33   -0
mindinsight/profiler/parser/step_trace_parser.py         +264   -0
mindinsight/profiler/profiling.py                         +43   -8

mindinsight/backend/profiler/profile_api.py

...
@@ -21,17 +21,17 @@ import json
import os

from flask import Blueprint
from flask import jsonify
from flask import request
from marshmallow import ValidationError

from mindinsight.conf import settings
from mindinsight.datavisual.utils.tools import get_train_id, get_profiler_dir, to_int
from mindinsight.lineagemgr.common.validator.validate_path import validate_and_normalize_path
from mindinsight.profiler.analyser.analyser_factory import AnalyserFactory
from mindinsight.profiler.common.util import analyse_device_list_from_profiler_dir
from mindinsight.profiler.common.validator.validate import validate_condition, validate_ui_proc
from mindinsight.profiler.common.validator.validate_path import validate_and_normalize_profiler_path
from mindinsight.utils.exceptions import ParamValueError

BLUEPRINT = Blueprint("profile", __name__, url_prefix=settings.URL_PREFIX)
...
@@ -109,6 +109,64 @@ def get_profile_device_list():
    return jsonify(device_list)


@BLUEPRINT.route("/profile/training-trace/graph", methods=["GET"])
def get_training_trace_graph():
    """
    Get training trace info of one step.

    Returns:
        Response, the training trace info of one step.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/training-trace/graph
    """
    summary_dir = request.args.get("dir")
    profiler_dir = validate_and_normalize_profiler_path(summary_dir)
    graph_type = request.args.get("type", default='0')
    graph_type = to_int(graph_type, 'graph_type')
    device_id = request.args.get("device_id", default='0')
    _ = to_int(device_id, 'device_id')
    analyser = AnalyserFactory.instance().get_analyser(
        'step_trace', profiler_dir, device_id)
    graph_info = analyser.query({'filter_condition': {'mode': 'step', 'step_id': graph_type}})
    return jsonify(graph_info)

@BLUEPRINT.route("/profile/training-trace/target-time-info", methods=["GET"])
def get_target_time_info():
    """
    Get all the time information of the specified column.

    Returns:
        Response, all the time information of the specified column.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/training-trace/target-time-info
    """
    summary_dir = request.args.get("dir")
    profiler_dir = validate_and_normalize_profiler_path(summary_dir)
    proc_name = request.args.get("type")
    validate_ui_proc(proc_name)
    device_id = request.args.get("device_id", default='0')
    _ = to_int(device_id, 'device_id')
    analyser = AnalyserFactory.instance().get_analyser(
        'step_trace', profiler_dir, device_id)
    target_time_info = analyser.query({'filter_condition': {'mode': 'proc', 'proc_name': proc_name}})
    target_time_info['summary'] = analyser.summary
    return jsonify(target_time_info)

def init_module(app):
    """
    Init module entry.
    ...
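
For illustration, the two new endpoints can be exercised with any HTTP client. A minimal sketch using `requests`, assuming a MindInsight instance at 127.0.0.1:8080 and a summary directory `./summary` (both are made-up placeholders, not part of the commit):

import requests

BASE = 'http://127.0.0.1:8080/v1/mindinsight'  # host and port are assumptions

# Trace graph of step 1 on device 0.
graph = requests.get(f'{BASE}/profile/training-trace/graph',
                     params={'dir': './summary', 'type': 1, 'device_id': 0}).json()

# All per-step values of the fp_and_bp column, plus the summary.
times = requests.get(f'{BASE}/profile/training-trace/target-time-info',
                     params={'dir': './summary', 'type': 'fp_and_bp', 'device_id': 0}).json()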

mindinsight/profiler/analyser/__init__.py

@@ -12,3 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Import analyser."""
from .analyser import *
from .step_trace_analyser import StepTraceAnalyser

mindinsight/profiler/analyser/analyser_factory.py

@@ -15,7 +15,7 @@
"""The analyser factory."""
import threading

import mindinsight.profiler.analyser as analyser_module
from mindinsight.profiler.common.exceptions.exceptions import \
    ProfilerAnalyserNotExistException
...
@@ -25,11 +25,12 @@ class AnalyserFactory:
    The analyser factory is used to create special analyser instances.
    Currently the factory supports creating `AicoreTypeAnalyser`,
    `AicoreDetailAnalyser`, `AicpuAnalyser` and `StepTraceAnalyser`.
    The `AicoreTypeAnalyser` is used to analyze execution time according to
    the AICORE operator type. The `AicoreDetailAnalyser` is used to analyze
    execution time according to each specific AICORE operator. The
    `AicpuAnalyser` is used to analyze execution time according to each
    specific AICPU operator. The `StepTraceAnalyser` is used to analyze
    execution time according to the different procs within a step.

    Examples:
        >>> analyser = AnalyserFactory.instance().get_analyser(
    ...
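
The `Examples` block above is truncated in this view. Judging from the route handlers in profile_api.py, a complete call for the new analyser presumably looks like the sketch below; the profiler directory and device id are made-up placeholders:

from mindinsight.profiler.analyser.analyser_factory import AnalyserFactory

# 'step_trace' selects the StepTraceAnalyser added by this commit.
analyser = AnalyserFactory.instance().get_analyser(
    'step_trace', '/path/to/summary/profiler', '0')
print(analyser.summary)  # total time plus per-proc percentages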

mindinsight/profiler/analyser/step_trace_analyser.py
new file (0 → 100644)

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The StepTraceAnalyser analyser class."""
import csv

from mindinsight.profiler.analyser.base_analyser import BaseAnalyser
from mindinsight.profiler.common.exceptions.exceptions import ProfilerParamValueErrorException, \
    ProfilerFileNotFoundException, StepNumNotSupportedException
from mindinsight.profiler.common.log import logger as log
from mindinsight.profiler.common.util import query_latest_trace_time_file, get_field_value, \
    get_summary_for_step_trace


class StepTraceAnalyser(BaseAnalyser):
    """The analyser for analyzing training steps."""
    __col_names__ = []
    _attr_ui_name = 'name'
    _attr_ui_start = 'start'
    _attr_ui_duration = 'duration'

    @property
    def summary(self):
        """The property of summary info."""
        summary = get_summary_for_step_trace(self._data[-1], self.__column__)
        summary['total_steps'] = self._size
        return summary

    def query(self, condition=None):
        """
        Query data according to the condition.

        Args:
            condition (dict): The search condition, only contains the `filter_condition`
                parameter. Default: None.

        Returns:
            dict, the result after being filtered, sorted and grouped.
        """
        if condition is None:
            condition = {}
        filter_condition = condition.get('filter_condition', {})
        self._validate_filter_condition(filter_condition)
        self._result = {'size': self._size}
        self._filter(filter_condition)
        return self._result
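
To illustrate the condition dict that `query` accepts (values invented; `analyser` as obtained from the factory above):

# Trace graph for step 2, values converted to milliseconds (the default):
result = analyser.query({'filter_condition': {'mode': 'step', 'step_id': 2}})
# -> {'size': ..., 'training_trace_graph': [...]}

# All per-step values of one column, kept in raw syscnt:
result = analyser.query(
    {'filter_condition': {'mode': 'proc', 'proc_name': 'tail', 'time_type': 'systime'}})
# -> {'size': ..., 'info': {'tail': [...]}}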

    def query_for_all_reduce(self):
        """
        Query for all reduce info.

        Returns:
            list[dict], each item is the reduce info for one step; the reduce info is
                formatted like: {stream_id: List[Tuple(start_point, end_point, duration, field_name)]}.
        """
        reduce_infos = []
        for row_info in self._data[:-1]:
            reduce_info = self._get_reduce_time_in_order(row_info, 'systime')
            reduce_infos.append(reduce_info)
        return reduce_infos

    def _load(self):
        """Load data from the latest parsed step trace time file."""
        file_path = query_latest_trace_time_file(self._profiling_dir, self._device_id)
        if not file_path:
            log.error("Failed to find parsed trace time file.")
            raise ProfilerFileNotFoundException('parsed step trace time file')
        with open(file_path, 'r') as handle:
            csv_reader = csv.reader(handle)
            self.__column__ = next(csv_reader)
            self._data = list(csv_reader)
            self._size = len(self._data) - 1
        self._display_col_names = self.__col_names__[:]

    def _filter(self, filter_condition):
        """
        Filter the profiling data according to the filter condition.

        Args:
            filter_condition (dict): The filter condition.

                - mode (str): The kind of information. `step` returns the info about a
                  specific step. `proc` returns the info about a specific field in the
                  parsed trace file.
                - step_id (int): The selected step_id. If not given, it means all steps
                  are required. If the value is 0, it means the average info for all
                  steps except the first is required.
                - proc_name (str): The selected field name.
                - time_type (str): The value type. `systime` keeps the original value.
                  `realtime` transforms the value into milliseconds. Default: `realtime`.
        """
        mode = filter_condition.get('mode', 'step')
        if mode == 'step':
            self._get_step_details(
                step_id=filter_condition.get('step_id'),
                time_type=filter_condition.get('time_type', 'realtime'))
        else:
            self._get_proc_details(
                step_id=filter_condition.get('step_id'),
                proc_name=filter_condition.get('proc_name'),
                time_type=filter_condition.get('time_type', 'realtime'))

    def _construct_time_point(self, name, start, duration):
        """Construct time point."""
        point = {
            self._attr_ui_name: name,
            self._attr_ui_start: round(start, 4),
            self._attr_ui_duration: round(duration, 4)
        }
        return point

    def _get_step_details(self, step_id, time_type='realtime'):
        """
        Get step trace info for the selected step and save the result.

        Args:
            step_id (int): The selected step_id. If the value is 0, it means the average
                info for all steps except the first is required.
            time_type (str): The value type. `systime` keeps the original value.
                `realtime` transforms the value into milliseconds. Default: `realtime`.
        """
        if step_id is None:
            step_id = 0
        row_info = self._data[step_id - 1]
        start_point = get_field_value(row_info, 'start_point', self.__column__, time_type)
        total = get_field_value(row_info, 'total', self.__column__, time_type)
        iteration_interval = get_field_value(row_info, 'iteration_interval',
                                             self.__column__, time_type)
        fp_point = get_field_value(row_info, 'fp_point', self.__column__, time_type)
        fp_and_bp = get_field_value(row_info, 'fp_and_bp', self.__column__, time_type)
        bp_point = get_field_value(row_info, 'bp_point', self.__column__, time_type)
        tail = get_field_value(row_info, 'tail', self.__column__, time_type)
        # first line only contains total time
        first_line = [self._construct_time_point('', 0, total)]
        # second line contains iteration_interval, fp_and_bp and tail
        second_line = [
            self._construct_time_point('', 0, iteration_interval),
            self._construct_time_point('fp_and_bp', fp_point - start_point, fp_and_bp),
            self._construct_time_point('', bp_point - start_point, tail),
        ]
        # construct reduce lines
        reduce_lines = self._construct_reduce_lines(row_info, time_type)
        graph = [first_line, second_line]
        graph.extend(reduce_lines)
        self._result['training_trace_graph'] = graph
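
The resulting `training_trace_graph` is a list of timeline rows, each row a list of `{name, start, duration}` points. Schematically, with invented numbers:

[
    [{'name': '', 'start': 0, 'duration': 120.5}],             # row 1: total
    [{'name': '', 'start': 0, 'duration': 2.1},                # iteration_interval
     {'name': 'fp_and_bp', 'start': 2.1, 'duration': 110.0},   # forward + backward
     {'name': '', 'start': 112.1, 'duration': 8.4}],           # tail
    # ...one further row per communication stream (see _construct_reduce_lines)
]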

    def _get_reduce_time_in_order(self, row_info, time_type):
        """Get reduce time in order."""
        reduce_info = {}
        reduce_fields = [field_name for field_name in self.__column__
                         if field_name.startswith('stream_')
                         and not field_name.endswith('point')]
        for reduce_field in reduce_fields:
            cur_stream_id = reduce_field.split('_', 2)[1]
            cur_stream = reduce_info.get(cur_stream_id)
            if not cur_stream:
                cur_stream = []
                reduce_info[cur_stream_id] = cur_stream
            reduce_start = get_field_value(row_info, reduce_field + '_start_point',
                                           self.__column__, time_type)
            reduce_end = get_field_value(row_info, reduce_field + '_end_point',
                                         self.__column__, time_type)
            reduce_duration = get_field_value(row_info, reduce_field,
                                              self.__column__, time_type)
            cur_stream.append((reduce_start, reduce_end, reduce_duration, reduce_field))
        for _, reduce_events in reduce_info.items():
            reduce_events.sort(key=lambda elem: elem[1])
        return reduce_info

    def _construct_reduce_lines(self, row_info, time_type):
        """Construct the reduce lines in the detailed graph."""
        reduce_lines = []
        start_point = get_field_value(row_info, 'start_point', self.__column__, time_type)
        fp_point = get_field_value(row_info, 'fp_point', self.__column__, time_type)
        end_point = get_field_value(row_info, 'end_point', self.__column__, time_type)
        reduce_info = self._get_reduce_time_in_order(row_info, time_type)
        # construct time point for each line
        for _, reduce_events in reduce_info.items():
            current_line = self._construct_reduce_line(
                start_point, end_point, fp_point, reduce_events)
            reduce_lines.append(current_line)
        return reduce_lines

    def _construct_reduce_line(self, start_point, end_point, fp_point, reduce_events):
        """Construct list of time points for reduce line."""
        current_line = []
        previous_start = fp_point
        for start, end, duration, field_name in reduce_events:
            current_line.extend([
                self._construct_time_point('', previous_start - start_point,
                                           start - previous_start),
                self._construct_time_point(field_name, start - start_point, duration)
            ])
            previous_start = end
        current_line.append(
            self._construct_time_point('', previous_start - start_point,
                                       end_point - previous_start))
        return current_line
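
A small worked example of the gap filling above, with invented numbers: start_point=0, fp_point=10, end_point=50 and a single reduce event (start=20, end=30, duration=10, field_name='stream_5_parallel_0'). previous_start begins at fp_point, so the line becomes:

# ('', 10, 10)                      gap from fp to the reduce start
# ('stream_5_parallel_0', 20, 10)   the reduce itself
# ('', 30, 20)                      trailing gap up to end_point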

    def _get_proc_details(self, proc_name, step_id=None, time_type='realtime'):
        """
        Get the time info of the selected field and save the result.

        Args:
            proc_name (str): The selected field name.
            step_id (int): The selected step_id. If not given, it means all steps are
                required. If the value is 0, it means the average info for all steps
                except the first is required. Default: None.
            time_type (str): The value type. `systime` keeps the original value.
                `realtime` transforms the value into milliseconds. Default: `realtime`.
        """
        if step_id is None:
            rows_info = self._data[:-1]
        else:
            rows_info = [self._data[step_id - 1]]
        proc_info = [get_field_value(row_info, proc_name, self.__column__, time_type)
                     for row_info in rows_info]
        self._result['info'] = {proc_name: proc_info}

    def _validate_filter_condition(self, filter_condition):
        """Validate step trace filter_condition."""
        mode = filter_condition.get('mode', 'step')
        self._validate_str_param(mode, ['step', 'proc'], 'mode')
        step_id = filter_condition.get('step_id', 0)
        self._validate_step_id(step_id)
        proc_name = filter_condition.get('proc_name')
        self._validate_str_param(proc_name, self.__column__, 'proc_name')
        time_type = filter_condition.get('time_type', 'realtime')
        self._validate_str_param(time_type, ['realtime', 'systime'], 'time_type')

    def _validate_step_id(self, step_id):
        """Validate step_id."""
        if isinstance(step_id, int) and 0 <= step_id <= self._size:
            return
        log.error("Invalid step_id in request. step_id should be in [0, %d].", self._size)
        raise StepNumNotSupportedException([0, self._size])

    @staticmethod
    def _validate_str_param(proc_name, accept_param, error_name=''):
        """Validate a string parameter against the accepted values."""
        if proc_name is None or isinstance(proc_name, str) and proc_name in accept_param:
            return
        log.error("Invalid param %s in request. Acceptable value is %s.",
                  error_name, accept_param)
        raise ProfilerParamValueErrorException("Invalid proc_name.")

mindinsight/profiler/common/_utils.py

@@ -16,15 +16,16 @@
import os
import re


def fwrite_format(output_data_path, data_source=None, is_print=False, is_start=False):
    """
    Write data to the output file.

    Args:
        output_data_path (str): The output file path of the data.
        data_source (list): The data to write.
        is_print (bool): Whether to print the data to stdout.
        is_start (bool): Whether this is the first line of the output file; the old
            file will be removed if True.
    """
    if is_start is True and os.path.exists(output_data_path):
    ...
@@ -48,16 +49,17 @@ def get_log_slice_id(file_name):
    index = re.findall(r'\d+', slice_list[0])
    return int(index[0])


def get_file_join_name(input_path, file_name):
    """
    Search files under the special path, and join all the files into one file.

    Args:
        input_path (str): The source path, will search files under it.
        file_name (str): The target of the filename, such as 'hwts.log.data.45.dev'.

    Returns:
        str, the join file name.
    """
    name_list = []
    file_join_name = ''
    ...
@@ -90,11 +92,11 @@ def get_file_names(input_path, file_name):
    Search files under the special path.

    Args:
        input_path (str): The source path, will search files under it.
        file_name (str): The target of the filename, such as 'host_start_log'.

    Returns:
        list, file name list.
    """
    input_path = os.path.realpath(input_path)
    ...

mindinsight/profiler/common/exceptions/error_code.py

@@ -37,6 +37,8 @@ class ProfilerErrors(ProfilerMgrErrors):
    # parser error code
    DEVICE_ID_MISMATCH_ERROR = 0 | _PARSER_MASK
    RAW_FILE_ERROR = 1 | _PARSER_MASK
    STEP_NUM_NOT_SUPPORTED_ERROR = 2 | _PARSER_MASK
    JOB_ID_MISMATCH_ERROR = 3 | _PARSER_MASK

    # analyser error code
    COLUMN_NOT_EXIST_ERROR = 0 | _ANALYSER_MASK
    ...
@@ -49,6 +51,8 @@ class ProfilerErrors(ProfilerMgrErrors):
    COLUMN_NOT_SUPPORT_SORT_ERROR = 7 | _ANALYSER_MASK


@unique
class ProfilerErrorMsg(Enum):
    """Profiler error messages."""
    ...
@@ -63,6 +67,9 @@ class ProfilerErrorMsg(Enum):
    # parser error msg
    DEVICE_ID_MISMATCH_ERROR = 'The device ID mismatch.'
    RAW_FILE_ERROR = 'Raw file error. {}'
    STEP_NUM_NOT_SUPPORTED_ERROR = 'The step num must be in {}'
    JOB_ID_MISMATCH_ERROR = 'The job id in the parameter is not the same as ' \
                            'in the training trace file. '

    # analyser error msg
    COLUMN_NOT_EXIST_ERROR = 'The column {} does not exist.'
    ...

mindinsight/profiler/common/exceptions/exceptions.py

@@ -192,3 +192,25 @@ class ProfilerColumnNotSupportSortException(MindInsightException):
            message=ProfilerErrorMsg.COLUMN_NOT_SUPPORT_SORT_ERROR.value.format(msg),
            http_code=400)


class StepNumNotSupportedException(MindInsightException):
    """The step number error in profiler module."""
    def __init__(self, msg):
        super(StepNumNotSupportedException, self).__init__(
            error=ProfilerErrors.STEP_NUM_NOT_SUPPORTED_ERROR,
            message=ProfilerErrorMsg.STEP_NUM_NOT_SUPPORTED_ERROR.value.format(msg),
            http_code=400)


class JobIdMismatchException(MindInsightException):
    """The Job ID mismatch error in profiler module."""
    def __init__(self):
        super(JobIdMismatchException, self).__init__(
            error=ProfilerErrors.JOB_ID_MISMATCH_ERROR,
            message=ProfilerErrorMsg.JOB_ID_MISMATCH_ERROR.value,
            http_code=400)

mindinsight/profiler/common/util.py

@@ -19,6 +19,8 @@
"""
This module provides the utils.
"""
import os

from mindinsight.datavisual.utils.tools import to_int


def analyse_device_list_from_profiler_dir(profiler_dir):
    """
    ...
@@ -40,3 +42,99 @@ def analyse_device_list_from_profiler_dir(profiler_dir):
        device_id_list.add(device_num)
    return list(device_id_list)


def query_latest_trace_time_file(profiler_dir, device_id=0):
    """
    Query the latest trace time file.

    Args:
        profiler_dir (str): The profiler directory.
        device_id (int): The id of device.

    Returns:
        str, the latest trace time file path.
    """
    files = os.listdir(profiler_dir)
    target_file = f'step_trace_raw_{device_id}_detail_time.csv'
    try:
        latest_file = max(
            filter(lambda file: file == target_file, files),
            key=lambda file: os.stat(os.path.join(profiler_dir, file)).st_mtime)
    except ValueError:
        return None
    return os.path.join(profiler_dir, latest_file)


def query_step_trace_file(profiler_dir):
    """
    Query for the step trace file.

    Args:
        profiler_dir (str): The directory that contains all step trace files.

    Returns:
        str, the file path of step trace time.
    """
    files = os.listdir(profiler_dir)
    training_trace_file = list(
        filter(lambda file: file.startswith('training_trace') and not file.endswith('.done'),
               files))
    if training_trace_file:
        return os.path.join(profiler_dir, training_trace_file[0])
    return None


def get_summary_for_step_trace(average_info, header):
    """Get the summary info for step trace."""
    if not average_info or not header:
        return {}
    total_time = get_field_value(average_info, 'total', header)
    iteration_interval = get_field_value(average_info, 'iteration_interval', header)
    fp_and_bp = get_field_value(average_info, 'fp_and_bp', header)
    tail = get_field_value(average_info, 'tail', header)
    summary = {
        'total_time': total_time,
        'iteration_interval': calculate_percent(iteration_interval, total_time),
        'fp_and_bp': calculate_percent(fp_and_bp, total_time),
        'tail': calculate_percent(tail, total_time)
    }
    return summary


def calculate_percent(partial, total):
    """Calculate percent value."""
    percent = round(partial / total * 100, 2)
    return f'{percent}%'


def get_field_value(row_info, field_name, header, time_type='realtime'):
    """
    Extract basic info through row_info.

    Args:
        row_info (list): The list of data info in one row.
        field_name (str): The name in header.
        header (list[str]): The list of field names.
        time_type (str): The type of value, `realtime` or `systime`. Default: `realtime`.

    Returns:
        int or float, the value of the field.
    """
    # one sys count takes 10 ns, so 1 ms equals 100000 syscnt
    per_ms_syscnt = 100000
    field_index = header.index(field_name)
    value = row_info[field_index]
    value = to_int(value, field_name)
    if time_type == 'realtime':
        value = value / per_ms_syscnt
    return value
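
A quick sanity check of the conversion: at 10 ns per sys count, 100000 syscnt make up 1 ms, so a raw field value of 250000 syscnt comes back as 2.5 (ms) when time_type='realtime':

assert 250000 / 100000 == 2.5  # 250000 syscnt * 10 ns = 2500000 ns = 2.5 ms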

mindinsight/profiler/common/validator/validate.py

@@ -13,9 +13,14 @@
# limitations under the License.
# ============================================================================
"""Validate the profiler parameters."""
import os
import sys

from mindinsight.datavisual.utils.tools import to_int
from mindinsight.profiler.common.exceptions.exceptions import ProfilerParamTypeErrorException, \
    ProfilerDeviceIdException, ProfilerOpTypeException, \
    ProfilerSortConditionException, ProfilerFilterConditionException, \
    ProfilerGroupConditionException, ProfilerParamValueErrorException
from mindinsight.profiler.common.log import logger as log

AICORE_TYPE_COL = ["op_type", "execution_time", "execution_frequency", "precent"]
...
@@ -71,6 +76,7 @@ def validate_condition(search_condition):
    if "filter_condition" in search_condition:
        validate_filter_condition(search_condition)


def validata_group_condition(search_condition):
    """
    Verify the group_condition in search_condition is valid or not.
    ...
@@ -185,3 +191,44 @@ def validate_filter_condition(search_condition):
        validate_op_filter_condition(op_name_condition)
    if "op_type" not in filter_condition and "op_name" not in filter_condition:
        raise ProfilerFilterConditionException("The key of filter_condition is not supported")


def validate_and_set_job_id_env(job_id_env):
    """
    Validate the job id and set it in the environment.

    Args:
        job_id_env (str): The id to be set in the environment parameter `JOB_ID`.

    Returns:
        int, the valid job id env.
    """
    if job_id_env is None:
        return job_id_env
    # get job_id_env in int type
    valid_id = to_int(job_id_env, 'job_id_env')
    # check the range of valid_id
    if valid_id and 255 < valid_id < sys.maxsize:
        os.environ['JOB_ID'] = job_id_env
    else:
        log.warning("Invalid job_id_env %s. The value should be int and between 255 and %s. Use "
                    "default job id env instead.",
                    job_id_env, sys.maxsize)
    return valid_id


def validate_ui_proc(proc_name):
    """
    Validate proc name in restful request.

    Args:
        proc_name (str): The proc name to query. Acceptable value is in
            [`iteration_interval`, `fp_and_bp`, `tail`].

    Raises:
        ProfilerParamValueErrorException: If the proc_name is invalid.
    """
    accept_names = ['iteration_interval', 'fp_and_bp', 'tail']
    if proc_name not in accept_names:
        log.error("Invalid proc_name. The proc_name for restful api is in %s", accept_names)
        raise ProfilerParamValueErrorException(f'proc_name should be in {accept_names}.')

mindinsight/profiler/common/validator/validate_path.py

@@ -15,8 +15,14 @@
"""Validate the input path."""
import os
from typing import Union, List
from urllib.parse import unquote

from marshmallow import ValidationError

from mindinsight.profiler.common.exceptions.exceptions import \
    ProfilerParamValueErrorException
from mindinsight.profiler.common.log import logger as log


def safe_normalize_path(
        path,
    ...
@@ -117,3 +123,30 @@ def validate_and_normalize_path(
        raise ValidationError({raise_key: {"The path is invalid!"}})
    return normalized_path


def validate_and_normalize_profiler_path(path):
    """
    Validate and normalize profiler path.

    Args:
        path (str): The path of summary directory.

    Returns:
        str, normalized path of profiler directory.
    """
    if not path:
        raise ProfilerParamValueErrorException('The file dir does not exist.')
    try:
        unquote_path = unquote(path, errors='strict')
    except UnicodeDecodeError:
        raise ProfilerParamValueErrorException('Unquote error with strict mode')
    profiler_dir = os.path.join(unquote_path, 'profiler')
    try:
        profiler_dir = validate_and_normalize_path(profiler_dir, 'profiler')
    except ValidationError:
        log.error('profiler dir <%s> is invalid', unquote_path)
        raise ProfilerParamValueErrorException('Profiler dir is invalid.')
    return profiler_dir

mindinsight/profiler/parser/step_trace_parser.py
new file (0 → 100644)

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The parser for step trace data."""
import csv
import os
import stat
import struct
from collections import namedtuple
from decimal import Decimal

from mindinsight.profiler.common.exceptions.exceptions import ProfilerPathErrorException, \
    JobIdMismatchException
from mindinsight.profiler.common.log import logger as log
from mindinsight.profiler.common.util import get_summary_for_step_trace
from mindinsight.utils.exceptions import MindInsightException

StepTraceStruct = namedtuple(
    'TrainingTraceStruct', ['tag_id', 'task_id', 'stream_id', 'sys_count'])
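
Each raw trace event is a fixed 20-byte record; the '=QHHQ' unpack format used below maps onto the namedtuple fields as Q (8 bytes) tag_id, H (2) task_id, H (2) stream_id, Q (8) sys_count, which is exactly the parser's `_event_size`:

import struct
assert struct.calcsize('=QHHQ') == 20  # matches StepTraceParser._event_size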

class StepTraceParser:
    """
    The parser for step trace data.

    Args:
        input_dir (str): The directory that contains original step trace data.
        output_file_path (str): The output file path.
        job_id (int): The job id set by the user, which should match the job id
            recorded in the training trace file.
        skip_first_step (bool): Whether to skip the first step or not.
    """
    _event_size = 20

    def __init__(self, input_dir, output_file_path, job_id, skip_first_step=False):
        self._input_dir = input_dir
        self._output_path = output_file_path
        self._job_id = job_id
        self._skip_first_step = skip_first_step
        self._result = []
        self._header = []
        self._step_num = 0

    @property
    def output_file(self):
        """The property of the step trace output file name."""
        file_name = self._output_path.rsplit('/', 2)
        return file_name

    def show(self):
        """Show the summary of step trace info."""
        summary_info = {}
        if self._result:
            summary_info = get_summary_for_step_trace(self._result[-1], self._header)
            summary_info['total_steps'] = len(self._result) - 1
        print('\nStep trace summary info (unit: syscnt):')
        print(summary_info)
        print('\nThe step trace parse result is saved under {summary_dir}/%s'
              % self.output_file)

    def parse_and_save(self):
        """Parse step trace files and save the result."""
        try:
            source_file = self._get_step_trace_file()
            self._parse(source_file)
            self._save()
        except MindInsightException as err:
            log.error("Failed to parse and save step trace files.")
            log.exception(err)
        log.info("Finish to save intermediate result for step trace file.")

    def _get_step_trace_file(self):
        """Get step trace file."""
        profiling_path = self._input_dir
        # validate input_dir
        if not os.path.isdir(profiling_path):
            raise ProfilerPathErrorException(
                '{} does not exist or is not a dir'.format(profiling_path))
        # get step trace files
        files = os.listdir(profiling_path)
        step_trace_files = list(
            filter(lambda file: file.startswith('training_trace') and not file.endswith('.done'),
                   files))
        # validate result
        if not step_trace_files:
            raise ProfilerPathErrorException('training trace file does not exist')
        if len(step_trace_files) > 1:
            log.warning("Not able to parse multiple step trace files yet.")
        step_trace_file = os.path.join(profiling_path, step_trace_files[0])
        return step_trace_file

    def _parse(self, source_file):
        """Parse source step trace file."""
        log.info("Start to parse step trace file.")
        with open(source_file, 'rb') as handler:
            content = handler.read()
            for step_trace in self._get_next_step_trace(content):
                if self._skip_first_step:
                    self._skip_first_step = False
                else:
                    self._record_trace_event(step_trace)
            self._record_average_info()
        log.info("Finish to parse step trace file.")

    def _get_next_step_trace(self, content):
        """
        Get next step trace info.

        Args:
            content (bytes): The input step trace info.

        Returns:
            Generator, return the step trace one by one.
        """
        event_info = {}
        for pos in range(0, len(content), 20):
            next_event = self._get_trace_struct(content[pos:pos + self._event_size])
            self._construct_event_info(next_event, event_info)
            if event_info.get('end'):
                yield event_info
                event_info = {'start': event_info.get('end'), 'reduce': {}}

    def _get_trace_struct(self, bin_info):
        """Translate event info to StepTraceStruct."""
        if len(bin_info) == self._event_size:
            parsed_info = struct.unpack('=QHHQ', bin_info)
            return StepTraceStruct(*parsed_info)
        return None

    def _construct_event_info(self, next_event, event_info):
        """Construct event info according to next_event."""
        min_job_id = 255
        step_flag: bool = lambda tag: tag > min_job_id
        end_flag: bool = lambda tag: tag == min_job_id
        fp_flag: bool = lambda tag: tag == 1
        bp_flag: bool = lambda tag: tag == 2

        def _on_step_event():
            """Handle step event."""
            self._validate_tag_id(tag_id)
            if event_info.get('start'):
                event_info['end'] = sys_count
            else:
                event_info['start'] = sys_count
                event_info['reduce'] = {}

        def _on_reduce_event():
            """Handle reduce event."""
            stream_id = next_event.stream_id
            if event_info['reduce'].get(stream_id):
                event_info['reduce'][stream_id].append(sys_count)
            else:
                event_info['reduce'][stream_id] = [sys_count]

        tag_id = next_event.tag_id
        sys_count = next_event.sys_count
        if end_flag(tag_id):
            event_info['end'] = sys_count
        elif step_flag(tag_id):
            _on_step_event()
        elif fp_flag(tag_id):
            event_info['fp'] = sys_count
        elif bp_flag(tag_id):
            event_info['bp'] = sys_count
        else:
            _on_reduce_event()

    def _validate_tag_id(self, job_id):
        """Check whether the job id in the source step trace file is the same as the user set."""
        if not self._job_id:
            self._job_id = job_id
        elif self._job_id != job_id:
            raise JobIdMismatchException()

    def _record_trace_event(self, step_trace):
        """Record trace event."""
        self._step_num += 1
        start_time = step_trace.get('start')
        end_time = step_trace.get('end')
        fp_time = step_trace.get('fp')
        bp_time = step_trace.get('bp')
        if not (start_time and end_time and fp_time and bp_time):
            log.warning("The step %d is missing basic time.", self._step_num)
            return
        row_data = {
            'step_num': self._step_num,
            'start_point': start_time,
            'end_point': end_time,
            'total': end_time - start_time,
            'fp_point': fp_time,
            'bp_point': bp_time,
            'iteration_interval': fp_time - start_time,
            'fp_and_bp': bp_time - fp_time,
            'tail': end_time - bp_time
        }
        # update reduce info
        self._update_reduce_info(step_trace, row_data)
        # save the row data
        if not self._header:
            self._header = list(row_data.keys())
        row_data_list = [row_data[header_name] for header_name in self._header]
        self._result.append(row_data_list)
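
With invented sys counts, the derived columns work out as follows:

start, fp, bp, end = 1000, 1200, 9200, 9700  # invented sys counts
assert end - start == 8700  # total
assert fp - start == 200    # iteration_interval: before the forward pass starts
assert bp - fp == 8000      # fp_and_bp: forward plus backward
assert end - bp == 500      # tail: after backward until the step ends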

    @staticmethod
    def _update_reduce_info(step_trace, row_data):
        """Extract reduce info."""
        reduce_time = step_trace.get('reduce', {})
        for stream_id, time_points in reduce_time.items():
            time_point_num = len(time_points)
            if time_point_num % 2:
                log.warning("Stream %d has %d reduce time points.",
                            stream_id, time_point_num)
                continue
            for index, point_id in enumerate(range(0, time_point_num, 2)):
                field_name = f'stream_{stream_id}_parallel_{index}'
                row_data[field_name + '_start_point'] = time_points[point_id]
                row_data[field_name + '_end_point'] = time_points[point_id + 1]
                row_data[field_name] = time_points[point_id + 1] - time_points[point_id]
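
So a stream whose time points are [t0, t1, t2, t3] is split into the pairs (t0, t1) and (t2, t3) and, assuming stream id 5, contributes the columns:

# stream_5_parallel_0_start_point = t0
# stream_5_parallel_0_end_point   = t1
# stream_5_parallel_0             = t1 - t0
# stream_5_parallel_1_start_point = t2, and so on.

These are exactly the columns the analyser later picks up via the startswith('stream_') filter in _get_reduce_time_in_order.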

    def _record_average_info(self):
        """Calculate average info."""
        result_size = len(self._result)
        if result_size < 2:
            return
        # calculate average data for each column, over all recorded steps but the first
        average_data = [0] * len(self._header)
        for row_info in self._result[1:]:
            average_data = [
                Decimal(i) + Decimal(j) for i, j in zip(row_info, average_data)
            ]
        average_data = [
            round((item / (result_size - 1))) for item in average_data
        ]
        # change step num info in average_data to '-'
        step_num_index = self._header.index('step_num')
        average_data[step_num_index] = '-'
        self._result.append(average_data)
        log.info("Finish add average info for step trace.")

    def _save(self):
        log.info("Start to save step trace file.")
        if not self._header:
            return
        with open(self._output_path, 'w') as file_handle:
            csv_writer = csv.writer(file_handle)
            csv_writer.writerow(self._header)
            for row_data in self._result:
                csv_writer.writerow(row_data)
        os.chmod(self._output_path, stat.S_IREAD | stat.S_IWRITE)

mindinsight/profiler/profiling.py

@@ -15,22 +15,26 @@
"""Profiling api file."""
import os
import time

from tabulate import tabulate

from mindinsight.profiler.analyser.analyser_factory import AnalyserFactory
from mindinsight.profiler.analyser.integrator import Integrator
from mindinsight.profiler.common._utils import get_file_names, fwrite_format
from mindinsight.profiler.common.log import logger
from mindinsight.profiler.common.validator.checkparam import \
    check_bool, check_subgraph
from mindinsight.profiler.common.validator.validate_path import \
    validate_and_normalize_path
from mindinsight.profiler.parser.aicpu_data_parser import DataPreProcessParser
from mindinsight.profiler.parser.framework_parser import FrameworkParser
from mindinsight.profiler.parser.hwts_log_parser import HWTSLogParser
from mindinsight.profiler.parser.optime_parser import OPComputeTimeParser
from mindinsight.profiler.parser.step_trace_parser import StepTraceParser
from mindinsight.utils.exceptions import MindInsightException

PROFILING_LOG_BASE_PATH = "/var/log/npu/profiling"
INIT_OP_NAME = 'Default/InitDataSetQueue'


class Profiler:
    ...
@@ -87,7 +91,7 @@ class Profiler:
        if device_target and device_target != "Davinci" \
                and device_target != "Ascend":
            msg = ("Profiling: unsupported backend: %s" % device_target)
            raise RuntimeError(msg)
        self._dev_id = dev_id
    ...
@@ -120,6 +124,8 @@ class Profiler:
        self._detail = check_bool(is_detail, 'is_detail')
        self._withfullpath = check_bool(is_show_op_path, 'is_show_op_path')
        self._profiling_job_id = job_id
        # add job id env through user input later
        self._job_id_env = None
        self._start_time = int(time.time() * 10000000)
        logger.info("Profiling: profiling start time: %d", self._start_time)
    ...
@@ -193,6 +199,35 @@ class Profiler:
        except MindInsightException as err:
            logger.error(err.message)

        # analyse step trace info
        self._analyse_step_trace(source_path, framework_parser)

    def _analyse_step_trace(self, source_path, framework_parser):
        """
        Analyse step trace data and save the result.

        Args:
            source_path (str): The directory that contains the step trace original data.
            framework_parser (FrameworkParser): The framework parser instance.
        """
        logger.info("Begin to parse step trace.")
        # construct output path
        step_trace_intermediate_file_path = os.path.join(
            self._output_path,
            f'step_trace_raw_{self._dev_id}_detail_time.csv')
        # whether to keep the first step
        skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME)
        # parse the step trace files and save the result to disk
        parser = StepTraceParser(input_dir=source_path,
                                 output_file_path=step_trace_intermediate_file_path,
                                 job_id=self._job_id_env,
                                 skip_first_step=skip_first_step_flag)
        parser.parse_and_save()
        # print parser result
        parser.show()
        logger.info("Finish save the intermediate result %s",
                    step_trace_intermediate_file_path)
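
Outside of `Profiler`, the parser can presumably also be driven directly. A minimal sketch with hypothetical paths; passing job_id=None lets the parser adopt the job id found in the trace file (see _validate_tag_id):

from mindinsight.profiler.parser.step_trace_parser import StepTraceParser

parser = StepTraceParser(input_dir='/var/log/npu/profiling/JOB123',  # hypothetical
                         output_file_path='./step_trace_raw_0_detail_time.csv',
                         job_id=None,
                         skip_first_step=False)
parser.parse_and_save()
parser.show()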

    def __del__(self):
        """Disable the profiling collection service, called after training."""
    ...