diff --git a/mindinsight/backend/profiler/profile_api.py b/mindinsight/backend/profiler/profile_api.py index 54a8bcf7b7dea3c2f32591808593a82fd56138b2..a538868b08a5b5badd3c57743a67aa3c9af7cd31 100644 --- a/mindinsight/backend/profiler/profile_api.py +++ b/mindinsight/backend/profiler/profile_api.py @@ -31,6 +31,7 @@ from mindinsight.datavisual.utils.tools import get_train_id, get_profiler_dir, \ unquote_args, to_int, get_device_id from mindinsight.profiler.analyser.analyser_factory import AnalyserFactory from mindinsight.profiler.analyser.minddata_analyser import MinddataAnalyser +from mindinsight.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException from mindinsight.profiler.proposer.compose_proposer import ComposeProposal from mindinsight.profiler.common.util import analyse_device_list_from_profiler_dir from mindinsight.profiler.common.validator.validate import validate_condition, \ @@ -131,9 +132,13 @@ def get_training_trace_graph(): graph_type = to_int(graph_type, 'graph_type') device_id = request.args.get("device_id", default='0') _ = to_int(device_id, 'device_id') + graph_info = {} + try: + analyser = AnalyserFactory.instance().get_analyser( + 'step_trace', profiler_dir, device_id) + except ProfilerFileNotFoundException: + return jsonify(graph_info) - analyser = AnalyserFactory.instance().get_analyser( - 'step_trace', profiler_dir, device_id) graph_info = analyser.query({ 'filter_condition': { 'mode': 'step', diff --git a/mindinsight/profiler/common/util.py b/mindinsight/profiler/common/util.py index d7910ba1948889bdf8abccde835a0ecdcaafd700..ee25bfc525efadf5e465ede71bc65b18f4ce310c 100644 --- a/mindinsight/profiler/common/util.py +++ b/mindinsight/profiler/common/util.py @@ -106,9 +106,12 @@ def get_summary_for_step_trace(average_info, header): tail = get_field_value(average_info, 'tail', header) summary = { 'total_time': total_time, - 'iteration_interval': calculate_percent(iteration_interval, total_time), - 'fp_and_bp': calculate_percent(fp_and_bp, total_time), - 'tail': calculate_percent(tail, total_time) + 'iteration_interval': iteration_interval, + 'iteration_interval_percent': calculate_percent(iteration_interval, total_time), + 'fp_and_bp': fp_and_bp, + 'fp_and_bp_percent': calculate_percent(fp_and_bp, total_time), + 'tail': tail, + 'tail_percent': calculate_percent(tail, total_time) } return summary diff --git a/mindinsight/profiler/parser/step_trace_parser.py b/mindinsight/profiler/parser/step_trace_parser.py index f683081772439e1c08157ff4c2061c4212e26951..dddb4448edff758a28e836f0fbc6dabac4a66400 100644 --- a/mindinsight/profiler/parser/step_trace_parser.py +++ b/mindinsight/profiler/parser/step_trace_parser.py @@ -21,10 +21,9 @@ from collections import namedtuple from decimal import Decimal from mindinsight.profiler.common.exceptions.exceptions import ProfilerPathErrorException, \ - JobIdMismatchException + JobIdMismatchException, ProfilerIOException from mindinsight.profiler.common.log import logger as log from mindinsight.profiler.common.util import get_summary_for_step_trace -from mindinsight.utils.exceptions import MindInsightException StepTraceStruct = namedtuple( 'TrainingTraceStruct', ['tag_id', 'task_id', 'stream_id', 'sys_count'] @@ -72,25 +71,39 @@ class StepTraceParser: def parse_and_save(self): """Parse step trace files and save the result.""" try: - source_file = self._get_step_trace_file() - self._parse(source_file) + source_files = self._get_step_trace_files() + self._parse(source_files) self._save() - except MindInsightException as err: - log.error("Failed to parse and save step trace files.") + except IOError as err: log.exception(err) + raise ProfilerIOException() else: log.info("Finish to save intermediate result for step trace file.") - def _get_step_trace_file(self): - """Get step trace file.""" - profiling_path = self._input_dir + def _get_step_trace_files(self): + """Get step trace files.""" + # step trace files may under $profiler_dir or $profiler_dir/data + profiler_dir = self._input_dir + step_trace_files = self._search_file(profiler_dir) + if not step_trace_files: + # try to find step trace files under $profiler_dir/data + profiler_dir = os.path.join(profiler_dir, 'data') + step_trace_files = self._search_file(profiler_dir) + if not step_trace_files: + raise ProfilerPathErrorException('Training trace file does not exist.') + + return step_trace_files + + @staticmethod + def _search_file(input_dir): + """Search step trace file under specific input directory.""" # validate input_dir - if not os.path.isdir(profiling_path): + if not os.path.isdir(input_dir): raise ProfilerPathErrorException( - '{} does not exist or is not a dir'.format(profiling_path) + '{} does not exist or is not a dir'.format(input_dir) ) # get step trace files - files = os.listdir(profiling_path) + files = os.listdir(input_dir) step_trace_files = list( filter( lambda file: file.startswith('training_trace') and not file.endswith('.done'), @@ -98,36 +111,46 @@ class StepTraceParser: ) ) # validate result - if not step_trace_files: - raise ProfilerPathErrorException('training trace file does not exist') if len(step_trace_files) > 1: - log.warning("Not enable to parse multiple step trace files yet.") - step_trace_file = os.path.join(profiling_path, step_trace_files[0]) - return step_trace_file + # the format of file name is like + # `training_trace.46.dev.profiler_default_tag.$id.slice_$number` + # use the $number as the sorted key + try: + step_trace_files.sort(key=lambda path: int(path.rsplit('_', 1)[-1])) + except ValueError as err: + log.warning("Unable to parse file names: %s. %s", step_trace_files, err) + step_trace_files = [] + + file_paths = [os.path.join(input_dir, file) for file in step_trace_files] + log.info("Find %d step trace files.", len(file_paths)) + return file_paths - def _parse(self, source_file): - """Parse source step trace file.""" - log.info("Start to parse step trace file.") - with open(source_file, 'rb') as handler: - content = handler.read() - for step_trace in self._get_next_step_trace(content): - if self._skip_first_step: - self._skip_first_step = False - else: + def _parse(self, source_files): + """Parse source step trace files.""" + log.info("Start to parse step trace file.") + event_info = {} + for source_file in source_files: + with open(source_file, 'rb') as handler: + content = handler.read() + for step_trace in self._get_next_step_trace(content, event_info): + if self._skip_first_step: + self._skip_first_step = False + continue self._record_trace_event(step_trace) self._record_average_info() log.info("Finish to parse step trace file.") - def _get_next_step_trace(self, content): + def _get_next_step_trace(self, content, event_info): """ Get next step trace info. Args: - content (bytes): The input step trace info + content (bytes): The input step trace info. + event_info (dict): The event info. + Returns: Generator, return the step trace one by one. """ - event_info = {} for pos in range(0, len(content), 20): next_event = self._get_trace_struct(content[pos:pos + self._event_size]) self._construct_event_info(next_event, event_info) @@ -251,7 +274,7 @@ class StepTraceParser: log.info("Finish add average info for step trace.") def _save(self): - log.info("Start to save step trace file.") + log.info("Start to save step trace file.") if not self._header: return with open(self._output_path, 'w') as file_handle: diff --git a/mindinsight/profiler/profiling.py b/mindinsight/profiler/profiling.py index 24af157379933430d6bf07de1e76bf7679103a4f..48fe0aa6e70d8cf9a4d6af9b454f7ec4ae939056 100644 --- a/mindinsight/profiler/profiling.py +++ b/mindinsight/profiler/profiling.py @@ -221,7 +221,10 @@ class Profiler: logger.warning(err.message) # analyse step trace info - self._analyse_step_trace(source_path, framework_parser) + try: + self._analyse_step_trace(source_path, framework_parser) + except MindInsightException as err: + logger.warning(err.message) # analyse timeline info self._analyse_timeline() diff --git a/tests/st/func/profiler/test_analyse.py b/tests/st/func/profiler/test_analyse.py index 61a29dbf08ef26eb64392b2e4a98fe9b78f419f2..092bbdafda131c7fedde788ee1b70341044bfd01 100644 --- a/tests/st/func/profiler/test_analyse.py +++ b/tests/st/func/profiler/test_analyse.py @@ -149,9 +149,12 @@ class TestProfilerAnalyse(TestCase): summary = analyser.summary assert summary == { 'total_time': 205.3809, - 'iteration_interval': '0.1%', - 'fp_and_bp': '57.48%', - 'tail': '42.42%', + 'iteration_interval': 0.2038, + 'iteration_interval_percent': '0.1%', + 'fp_and_bp': 118.054, + 'fp_and_bp_percent': '57.48%', + 'tail': 87.1231, + 'tail_percent': '42.42%', 'total_steps': 322} @pytest.mark.level0