提交 82ed137d 编写于 作者: Y yelihua

Change the method used to find step trace files

上级 615c8e61
...@@ -31,6 +31,7 @@ from mindinsight.datavisual.utils.tools import get_train_id, get_profiler_dir, \ ...@@ -31,6 +31,7 @@ from mindinsight.datavisual.utils.tools import get_train_id, get_profiler_dir, \
unquote_args, to_int, get_device_id unquote_args, to_int, get_device_id
from mindinsight.profiler.analyser.analyser_factory import AnalyserFactory from mindinsight.profiler.analyser.analyser_factory import AnalyserFactory
from mindinsight.profiler.analyser.minddata_analyser import MinddataAnalyser from mindinsight.profiler.analyser.minddata_analyser import MinddataAnalyser
from mindinsight.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException
from mindinsight.profiler.proposer.compose_proposer import ComposeProposal from mindinsight.profiler.proposer.compose_proposer import ComposeProposal
from mindinsight.profiler.common.util import analyse_device_list_from_profiler_dir from mindinsight.profiler.common.util import analyse_device_list_from_profiler_dir
from mindinsight.profiler.common.validator.validate import validate_condition, \ from mindinsight.profiler.common.validator.validate import validate_condition, \
...@@ -131,9 +132,13 @@ def get_training_trace_graph(): ...@@ -131,9 +132,13 @@ def get_training_trace_graph():
graph_type = to_int(graph_type, 'graph_type') graph_type = to_int(graph_type, 'graph_type')
device_id = request.args.get("device_id", default='0') device_id = request.args.get("device_id", default='0')
_ = to_int(device_id, 'device_id') _ = to_int(device_id, 'device_id')
graph_info = {}
try:
analyser = AnalyserFactory.instance().get_analyser( analyser = AnalyserFactory.instance().get_analyser(
'step_trace', profiler_dir, device_id) 'step_trace', profiler_dir, device_id)
except ProfilerFileNotFoundException:
return jsonify(graph_info)
graph_info = analyser.query({ graph_info = analyser.query({
'filter_condition': { 'filter_condition': {
'mode': 'step', 'mode': 'step',
......
...@@ -106,9 +106,12 @@ def get_summary_for_step_trace(average_info, header): ...@@ -106,9 +106,12 @@ def get_summary_for_step_trace(average_info, header):
tail = get_field_value(average_info, 'tail', header) tail = get_field_value(average_info, 'tail', header)
summary = { summary = {
'total_time': total_time, 'total_time': total_time,
'iteration_interval': calculate_percent(iteration_interval, total_time), 'iteration_interval': iteration_interval,
'fp_and_bp': calculate_percent(fp_and_bp, total_time), 'iteration_interval_percent': calculate_percent(iteration_interval, total_time),
'tail': calculate_percent(tail, total_time) 'fp_and_bp': fp_and_bp,
'fp_and_bp_percent': calculate_percent(fp_and_bp, total_time),
'tail': tail,
'tail_percent': calculate_percent(tail, total_time)
} }
return summary return summary
......
...@@ -21,10 +21,9 @@ from collections import namedtuple ...@@ -21,10 +21,9 @@ from collections import namedtuple
from decimal import Decimal from decimal import Decimal
from mindinsight.profiler.common.exceptions.exceptions import ProfilerPathErrorException, \ from mindinsight.profiler.common.exceptions.exceptions import ProfilerPathErrorException, \
JobIdMismatchException JobIdMismatchException, ProfilerIOException
from mindinsight.profiler.common.log import logger as log from mindinsight.profiler.common.log import logger as log
from mindinsight.profiler.common.util import get_summary_for_step_trace from mindinsight.profiler.common.util import get_summary_for_step_trace
from mindinsight.utils.exceptions import MindInsightException
StepTraceStruct = namedtuple( StepTraceStruct = namedtuple(
'TrainingTraceStruct', ['tag_id', 'task_id', 'stream_id', 'sys_count'] 'TrainingTraceStruct', ['tag_id', 'task_id', 'stream_id', 'sys_count']
...@@ -72,25 +71,39 @@ class StepTraceParser: ...@@ -72,25 +71,39 @@ class StepTraceParser:
def parse_and_save(self): def parse_and_save(self):
"""Parse step trace files and save the result.""" """Parse step trace files and save the result."""
try: try:
source_file = self._get_step_trace_file() source_files = self._get_step_trace_files()
self._parse(source_file) self._parse(source_files)
self._save() self._save()
except MindInsightException as err: except IOError as err:
log.error("Failed to parse and save step trace files.")
log.exception(err) log.exception(err)
raise ProfilerIOException()
else: else:
log.info("Finish to save intermediate result for step trace file.") log.info("Finish to save intermediate result for step trace file.")
def _get_step_trace_file(self): def _get_step_trace_files(self):
"""Get step trace file.""" """Get step trace files."""
profiling_path = self._input_dir # step trace files may be under $profiler_dir or $profiler_dir/data
profiler_dir = self._input_dir
step_trace_files = self._search_file(profiler_dir)
if not step_trace_files:
# try to find step trace files under $profiler_dir/data
profiler_dir = os.path.join(profiler_dir, 'data')
step_trace_files = self._search_file(profiler_dir)
if not step_trace_files:
raise ProfilerPathErrorException('Training trace file does not exist.')
return step_trace_files
@staticmethod
def _search_file(input_dir):
"""Search for step trace files under the specified input directory."""
# validate input_dir # validate input_dir
if not os.path.isdir(profiling_path): if not os.path.isdir(input_dir):
raise ProfilerPathErrorException( raise ProfilerPathErrorException(
'{} does not exist or is not a dir'.format(profiling_path) '{} does not exist or is not a dir'.format(input_dir)
) )
# get step trace files # get step trace files
files = os.listdir(profiling_path) files = os.listdir(input_dir)
step_trace_files = list( step_trace_files = list(
filter( filter(
lambda file: file.startswith('training_trace') and not file.endswith('.done'), lambda file: file.startswith('training_trace') and not file.endswith('.done'),
...@@ -98,36 +111,46 @@ class StepTraceParser: ...@@ -98,36 +111,46 @@ class StepTraceParser:
) )
) )
# validate result # validate result
if not step_trace_files:
raise ProfilerPathErrorException('training trace file does not exist')
if len(step_trace_files) > 1: if len(step_trace_files) > 1:
log.warning("Not enable to parse multiple step trace files yet.") # the format of file name is like
step_trace_file = os.path.join(profiling_path, step_trace_files[0]) # `training_trace.46.dev.profiler_default_tag.$id.slice_$number`
return step_trace_file # use the $number as the sorted key
try:
step_trace_files.sort(key=lambda path: int(path.rsplit('_', 1)[-1]))
except ValueError as err:
log.warning("Unable to parse file names: %s. %s", step_trace_files, err)
step_trace_files = []
def _parse(self, source_file): file_paths = [os.path.join(input_dir, file) for file in step_trace_files]
"""Parse source step trace file.""" log.info("Find %d step trace files.", len(file_paths))
return file_paths
def _parse(self, source_files):
"""Parse source step trace files."""
log.info("Start to parse step trace file.") log.info("Start to parse step trace file.")
event_info = {}
for source_file in source_files:
with open(source_file, 'rb') as handler: with open(source_file, 'rb') as handler:
content = handler.read() content = handler.read()
for step_trace in self._get_next_step_trace(content): for step_trace in self._get_next_step_trace(content, event_info):
if self._skip_first_step: if self._skip_first_step:
self._skip_first_step = False self._skip_first_step = False
else: continue
self._record_trace_event(step_trace) self._record_trace_event(step_trace)
self._record_average_info() self._record_average_info()
log.info("Finish to parse step trace file.") log.info("Finish to parse step trace file.")
def _get_next_step_trace(self, content): def _get_next_step_trace(self, content, event_info):
""" """
Get next step trace info. Get next step trace info.
Args: Args:
content (bytes): The input step trace info content (bytes): The input step trace info.
event_info (dict): The event info.
Returns: Returns:
Generator, return the step trace one by one. Generator, return the step trace one by one.
""" """
event_info = {}
for pos in range(0, len(content), 20): for pos in range(0, len(content), 20):
next_event = self._get_trace_struct(content[pos:pos + self._event_size]) next_event = self._get_trace_struct(content[pos:pos + self._event_size])
self._construct_event_info(next_event, event_info) self._construct_event_info(next_event, event_info)
......
...@@ -214,7 +214,10 @@ class Profiler: ...@@ -214,7 +214,10 @@ class Profiler:
logger.warning(err.message) logger.warning(err.message)
# analyse step trace info # analyse step trace info
try:
self._analyse_step_trace(source_path, framework_parser) self._analyse_step_trace(source_path, framework_parser)
except MindInsightException as err:
logger.warning(err.message)
def _analyse_step_trace(self, source_path, framework_parser): def _analyse_step_trace(self, source_path, framework_parser):
""" """
......
...@@ -150,9 +150,12 @@ class TestProfilerAnalyse(TestCase): ...@@ -150,9 +150,12 @@ class TestProfilerAnalyse(TestCase):
summary = analyser.summary summary = analyser.summary
assert summary == { assert summary == {
'total_time': 205.3809, 'total_time': 205.3809,
'iteration_interval': '0.1%', 'iteration_interval': 0.2038,
'fp_and_bp': '57.48%', 'iteration_interval_percent': '0.1%',
'tail': '42.42%', 'fp_and_bp': 118.054,
'fp_and_bp_percent': '57.48%',
'tail': 87.1231,
'tail_percent': '42.42%',
'total_steps': 322} 'total_steps': 322}
@pytest.mark.level0 @pytest.mark.level0
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册