From 1460ab4ab18aeb4c6dccd83b9fc016c12b9abe01 Mon Sep 17 00:00:00 2001 From: yuximiao Date: Thu, 13 Aug 2020 18:10:33 +0800 Subject: [PATCH] gpu profiler --- mindinsight/backend/profiler/profile_api.py | 2 +- .../datavisual/data_transform/data_manager.py | 11 +- .../data_transform/summary_watcher.py | 12 +- .../processors/train_task_manager.py | 1 + mindinsight/profiler/analyser/__init__.py | 2 +- mindinsight/profiler/analyser/gpu_analyser.py | 129 ++++++++++++++++++ mindinsight/profiler/common/util.py | 17 ++- .../profiler/common/validator/validate.py | 23 +++- 8 files changed, 184 insertions(+), 13 deletions(-) create mode 100644 mindinsight/profiler/analyser/gpu_analyser.py diff --git a/mindinsight/backend/profiler/profile_api.py b/mindinsight/backend/profiler/profile_api.py index 6eebd74..7bfec3d 100644 --- a/mindinsight/backend/profiler/profile_api.py +++ b/mindinsight/backend/profiler/profile_api.py @@ -114,7 +114,7 @@ def get_profile_device_list(): except ValidationError: raise ParamValueError("Invalid profiler dir") - device_list = analyse_device_list_from_profiler_dir(profiler_dir_abs) + device_list, _ = analyse_device_list_from_profiler_dir(profiler_dir_abs) return jsonify(device_list) diff --git a/mindinsight/datavisual/data_transform/data_manager.py b/mindinsight/datavisual/data_transform/data_manager.py index 4bfb8fd..292066b 100644 --- a/mindinsight/datavisual/data_transform/data_manager.py +++ b/mindinsight/datavisual/data_transform/data_manager.py @@ -59,14 +59,17 @@ class _BasicTrainJob: create_time (DateTime): The create time of summary directory. update_time (DateTime): The latest modify time of summary files directly in the summary directory. profiler_dir (str): The relative path of profiler directory. + profiler_type (str): The profiler device type. """ - def __init__(self, train_id, abs_summary_base_dir, abs_summary_dir, create_time, update_time, profiler_dir): + def __init__(self, train_id, abs_summary_base_dir, abs_summary_dir, create_time, update_time, profiler_dir, + profiler_type=""): self._train_id = train_id self._abs_summary_base_dir = abs_summary_base_dir self._abs_summary_dir = abs_summary_dir self._create_time = create_time self._update_time = update_time self._profiler_dir = profiler_dir + self._profiler_type = profiler_type @property def abs_summary_dir(self): @@ -98,6 +101,11 @@ class _BasicTrainJob: """Get update time.""" return self._update_time + @property + def profiler_type(self): + """Get profiler type""" + return self._profiler_type + class CachedTrainJob: """ @@ -952,6 +960,7 @@ class DataManager: create_time=info['create_time'], update_time=info['update_time'], profiler_dir=None if profiler is None else profiler['directory'], + profiler_type="" if profiler is None else profiler['profiler_type'], )) self._brief_cache.update_cache(basic_train_jobs) diff --git a/mindinsight/datavisual/data_transform/summary_watcher.py b/mindinsight/datavisual/data_transform/summary_watcher.py index 0798f5c..cc04022 100644 --- a/mindinsight/datavisual/data_transform/summary_watcher.py +++ b/mindinsight/datavisual/data_transform/summary_watcher.py @@ -109,6 +109,7 @@ class SummaryWatcher: 'directory': profiler['directory'], 'create_time': profiler['ctime'], 'update_time': profiler['mtime'], + 'profiler_type': profiler['profiler_type'] } directories.append(directory) @@ -226,13 +227,15 @@ class SummaryWatcher: elif entry.is_dir(): profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX, entry.name) full_dir_path = os.path.join(summary_base_dir, relative_path, entry.name) - if profiler_pattern is None or not self._is_valid_profiler_directory(full_dir_path): + is_valid_profiler_dir, profiler_type = self._is_valid_profiler_directory(full_dir_path) + if profiler_pattern is None or not is_valid_profiler_dir: return profiler = { 'directory': os.path.join('.', entry.name), 'ctime': ctime, 'mtime': mtime, + "profiler_type": profiler_type } summary_dict[relative_path] = { @@ -286,19 +289,20 @@ class SummaryWatcher: profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX, entry.name) if profiler_pattern is not None and entry.is_dir(): full_path = os.path.realpath(os.path.join(summary_directory, entry.name)) - if self._is_valid_profiler_directory(full_path): + if self._is_valid_profiler_directory(full_path)[0]: return True return False def _is_valid_profiler_directory(self, directory): + profiler_type = "" try: from mindinsight.profiler.common.util import analyse_device_list_from_profiler_dir - device_list = analyse_device_list_from_profiler_dir(directory) + device_list, profiler_type = analyse_device_list_from_profiler_dir(directory) except ImportError: device_list = [] - return bool(device_list) + return bool(device_list), profiler_type def list_summary_directories_by_pagination(self, summary_base_dir, offset=0, limit=10): """ diff --git a/mindinsight/datavisual/processors/train_task_manager.py b/mindinsight/datavisual/processors/train_task_manager.py index f0e780f..178739b 100644 --- a/mindinsight/datavisual/processors/train_task_manager.py +++ b/mindinsight/datavisual/processors/train_task_manager.py @@ -144,6 +144,7 @@ class TrainTaskManager(BaseProcessor): update_time=basic_info.update_time.strftime('%Y-%m-%d %H:%M:%S'), profiler_dir=basic_info.profiler_dir, cache_status=train_job.cache_status.value, + profiler_type=basic_info.profiler_type, ) if train_job.cache_status == CacheStatus.CACHED: diff --git a/mindinsight/profiler/analyser/__init__.py b/mindinsight/profiler/analyser/__init__.py index bc2cf23..00613da 100644 --- a/mindinsight/profiler/analyser/__init__.py +++ b/mindinsight/profiler/analyser/__init__.py @@ -14,4 +14,4 @@ # ============================================================================ """The analyser module.""" from . import analyser, minddata_pipeline_analyser, step_trace_analyser, \ - minddata_analyser, timeline_analyser + minddata_analyser, timeline_analyser, gpu_analyser diff --git a/mindinsight/profiler/analyser/gpu_analyser.py b/mindinsight/profiler/analyser/gpu_analyser.py new file mode 100644 index 0000000..66aa94a --- /dev/null +++ b/mindinsight/profiler/analyser/gpu_analyser.py @@ -0,0 +1,129 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""The gpu base analyser.""" +import csv +import os + +from mindinsight.profiler.analyser.base_analyser import BaseAnalyser +from mindinsight.profiler.common.log import logger + + +class GpuAnalyser(BaseAnalyser): + """Gpu base analyser.""" + _csv_file_to_analyse = "" + + def _load(self): + """Load data according to the parsed AICORE operator types file.""" + op_type_file_path = os.path.join( + self._profiling_dir, + self._csv_file_to_analyse.format(self._device_id) + ) + if not os.path.isfile(op_type_file_path): + logger.warning('The file <%s> does not exist.', op_type_file_path) + return + + with open(op_type_file_path, 'r') as file: + csv_reader = csv.reader(file) + _ = next(csv_reader) + for info in csv_reader: + self._data.append(self._convert_field_type(info)) + + @staticmethod + def _convert_field_type(row): + """ + Convert the field type to the specific type. + + Args: + row (list): One row data from parsed data. + + Returns: + list, the converted data. + """ + return row + + def _filter(self, filter_condition): + """ + Filter the profiling data according to the filter condition. + + Args: + filter_condition (dict): The filter condition. + """ + def _inner_filter(item: list): + return self._default_filter(item, filter_condition) + + self._result = list(filter(_inner_filter, self._data)) + + +class GpuOpTypeAnalyser(GpuAnalyser): + """Gpu operation type analyser.""" + _col_names = ["op_type", "type_occurrences", "total_time", "proportion", "avg_time"] + _csv_file_to_analyse = 'gpu_op_type_info_{}.csv' + + @staticmethod + def _convert_field_type(row): + """ + Convert the field type to the specific type. + + Args: + row (list): One row data from parsed data. + + Returns: + list, the converted data. + """ + return [row[0], int(row[1]), float(row[2]), float(row[3]), float(row[4])] + + +class GpuOpInfoAnalyser(GpuAnalyser): + """Gpu operation detail info analyser.""" + _col_names = ["op_side", "op_type", "op_name", "op_full_name", + "op_occurrences", "op_total_time", "op_avg_time", + "proportion", "cuda_activity_cost_time", "cuda_activity_call_count"] + _csv_file_to_analyse = 'gpu_op_detail_info_{}.csv' + + @staticmethod + def _convert_field_type(row): + """ + Convert the field type to the specific type. + + Args: + row (list): One row data from parsed data. + + Returns: + list, the converted data. + """ + return [row[0], row[1], row[2], row[3], int(row[4]), float(row[5]), + float(row[6]), float(row[7]), float(row[8]), int(row[9])] + + +class GpuCudaActivityAnalyser(GpuAnalyser): + """Gpu activity type analyser.""" + _col_names = ["name", "type", "op_full_name", "stream_id", + "block_dim", "grid_dim", "occurrences", "total_duration", + "avg_duration", "max_duration", "min_duration"] + _csv_file_to_analyse = 'gpu_activity_data_{}.csv' + + @staticmethod + def _convert_field_type(row): + """ + Convert the field type to the specific type. + + Args: + row (list): One row data from parsed data. + + Returns: + list, the converted data. + """ + return [row[0], row[1], row[2], row[3], row[4], row[5], int(row[6]), + float(row[7]), float(row[8]), float(row[9]), float(row[10])] diff --git a/mindinsight/profiler/common/util.py b/mindinsight/profiler/common/util.py index ea4963e..4612280 100644 --- a/mindinsight/profiler/common/util.py +++ b/mindinsight/profiler/common/util.py @@ -36,8 +36,10 @@ def analyse_device_list_from_profiler_dir(profiler_dir): list, the device_id list. """ profiler_file_prefix = ["timeline_display", "output_op_compute_time"] + gpu_profiler_file_prefix = ["gpu_op_detail_info", "gpu_activity_data", "gpu_op_type_info"] device_id_list = set() + gpu_device_id_list = set() for _, _, filenames in os.walk(profiler_dir): for filename in filenames: if filename.startswith("step_trace_raw"): @@ -51,8 +53,19 @@ def analyse_device_list_from_profiler_dir(profiler_dir): if device_num.isdigit() and '_'.join(items[:-1]) in profiler_file_prefix: device_id_list.add(device_num) - - return sorted(list(device_id_list)) + elif device_num.isdigit() and '_'.join(items[:-1]) in gpu_profiler_file_prefix: + gpu_device_id_list.add(device_num) + + if device_id_list: + result_list = sorted(list(device_id_list)) + profiler_type = "ascend" + elif gpu_device_id_list: + result_list = sorted(list(gpu_device_id_list)) + profiler_type = "gpu" + else: + result_list = [] + profiler_type = "" + return result_list, profiler_type def query_latest_trace_time_file(profiler_dir, device_id=0): diff --git a/mindinsight/profiler/common/validator/validate.py b/mindinsight/profiler/common/validator/validate.py index 90fdc16..84f5c15 100644 --- a/mindinsight/profiler/common/validator/validate.py +++ b/mindinsight/profiler/common/validator/validate.py @@ -27,6 +27,13 @@ AICORE_TYPE_COL = ["op_type", "execution_time", "execution_frequency", "precent" AICORE_DETAIL_COL = ["op_name", "op_type", "avg_execution_time", "subgraph", "full_op_name"] AICPU_COL = ["serial_number", "op_type", "total_time", "dispatch_time", "run_start", "run_end"] +GPU_TYPE_COL = ["op_type", "type_occurrences", "total_time", "proportion", "avg_time"] +GPU_ACTIVITY_COL = ["name", "type", "op_full_name", "stream_id", + "block_dim", "grid_dim", "occurrences", "total_duration", + "avg_duration", "max_duration", "min_duration"] +GPU_DETAIL_COL = ["op_side", "op_type", "op_name", "op_full_name", + "op_occurrences", "op_total_time", "op_avg_time", + "proportion", "cuda_activity_cost_time", "cuda_activity_call_count"] MINDDATA_PIPELINE_COL = [ 'op_id', 'op_type', 'num_workers', 'output_queue_average_size', 'output_queue_length', 'output_queue_usage_rate', 'sample_interval', @@ -67,10 +74,20 @@ def validate_condition(search_condition): search_scope = AICORE_TYPE_COL elif op_type == "aicore_detail": search_scope = AICORE_DETAIL_COL + elif op_type == "gpu_op_type": + search_scope = GPU_TYPE_COL + elif op_type == "gpu_op_info": + search_scope = GPU_DETAIL_COL + elif op_type == "gpu_cuda_activity": + search_scope = GPU_ACTIVITY_COL else: - raise ProfilerOpTypeException("The op_type must in ['aicpu', 'aicore_type', 'aicore_detail']") + raise ProfilerOpTypeException( + "The op_type must in ['aicpu', 'aicore_type', 'aicore_detail', " + "'gpu_op_type', 'gpu_op_info', 'gpu_cuda_activity']") else: - raise ProfilerOpTypeException("The op_type must in ['aicpu', 'aicore_type', 'aicore_detail']") + raise ProfilerOpTypeException( + "The op_type must in ['aicpu', 'aicore_type', 'aicore_detail', " + "'gpu_op_type', 'gpu_op_info', 'gpu_cuda_activity']") if "group_condition" in search_condition: validate_group_condition(search_condition) @@ -199,8 +216,6 @@ def validate_filter_condition(search_condition): if "op_name" in filter_condition: op_name_condition = filter_condition.get("op_name") validate_op_filter_condition(op_name_condition) - if "op_type" not in filter_condition and "op_name" not in filter_condition: - raise ProfilerFilterConditionException("The key of filter_condition is not support") def validate_and_set_job_id_env(job_id_env): -- GitLab