From 1460ab4ab18aeb4c6dccd83b9fc016c12b9abe01 Mon Sep 17 00:00:00 2001
From: yuximiao <yuximiao@huawei.com>
Date: Thu, 13 Aug 2020 18:10:33 +0800
Subject: [PATCH] gpu profiler

---
 mindinsight/backend/profiler/profile_api.py   |   2 +-
 .../datavisual/data_transform/data_manager.py |  11 +-
 .../data_transform/summary_watcher.py         |  12 +-
 .../processors/train_task_manager.py          |   1 +
 mindinsight/profiler/analyser/__init__.py     |   2 +-
 mindinsight/profiler/analyser/gpu_analyser.py | 129 ++++++++++++++++++
 mindinsight/profiler/common/util.py           |  17 ++-
 .../profiler/common/validator/validate.py     |  23 +++-
 8 files changed, 184 insertions(+), 13 deletions(-)
 create mode 100644 mindinsight/profiler/analyser/gpu_analyser.py

diff --git a/mindinsight/backend/profiler/profile_api.py b/mindinsight/backend/profiler/profile_api.py
index 6eebd74..7bfec3d 100644
--- a/mindinsight/backend/profiler/profile_api.py
+++ b/mindinsight/backend/profiler/profile_api.py
@@ -114,7 +114,7 @@ def get_profile_device_list():
     except ValidationError:
         raise ParamValueError("Invalid profiler dir")
 
-    device_list = analyse_device_list_from_profiler_dir(profiler_dir_abs)
+    device_list, _ = analyse_device_list_from_profiler_dir(profiler_dir_abs)
     return jsonify(device_list)
 
 
diff --git a/mindinsight/datavisual/data_transform/data_manager.py b/mindinsight/datavisual/data_transform/data_manager.py
index 4bfb8fd..292066b 100644
--- a/mindinsight/datavisual/data_transform/data_manager.py
+++ b/mindinsight/datavisual/data_transform/data_manager.py
@@ -59,14 +59,17 @@ class _BasicTrainJob:
         create_time (DateTime): The create time of summary directory.
         update_time (DateTime): The latest modify time of summary files directly in the summary directory.
         profiler_dir (str): The relative path of profiler directory.
+        profiler_type (str): The profiler device type.
     """
-    def __init__(self, train_id, abs_summary_base_dir, abs_summary_dir, create_time, update_time, profiler_dir):
+    def __init__(self, train_id, abs_summary_base_dir, abs_summary_dir, create_time, update_time, profiler_dir,
+                 profiler_type=""):
         self._train_id = train_id
         self._abs_summary_base_dir = abs_summary_base_dir
         self._abs_summary_dir = abs_summary_dir
         self._create_time = create_time
         self._update_time = update_time
         self._profiler_dir = profiler_dir
+        self._profiler_type = profiler_type  # Empty string when the caller does not pass a type.
 
     @property
     def abs_summary_dir(self):
@@ -98,6 +101,11 @@ class _BasicTrainJob:
         """Get update time."""
         return self._update_time
 
+    @property
+    def profiler_type(self):
+        """Get the profiler type given to the constructor."""
+        return self._profiler_type
+
 
 class CachedTrainJob:
     """
@@ -952,6 +960,7 @@ class DataManager:
                 create_time=info['create_time'],
                 update_time=info['update_time'],
                 profiler_dir=None if profiler is None else profiler['directory'],
+                profiler_type="" if profiler is None else profiler['profiler_type'],
             ))
 
         self._brief_cache.update_cache(basic_train_jobs)
diff --git a/mindinsight/datavisual/data_transform/summary_watcher.py b/mindinsight/datavisual/data_transform/summary_watcher.py
index 0798f5c..cc04022 100644
--- a/mindinsight/datavisual/data_transform/summary_watcher.py
+++ b/mindinsight/datavisual/data_transform/summary_watcher.py
@@ -109,6 +109,7 @@ class SummaryWatcher:
                     'directory': profiler['directory'],
                     'create_time': profiler['ctime'],
                     'update_time': profiler['mtime'],
+                    'profiler_type': profiler['profiler_type']
                 }
             directories.append(directory)
 
@@ -226,13 +227,15 @@ class SummaryWatcher:
         elif entry.is_dir():
             profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX, entry.name)
             full_dir_path = os.path.join(summary_base_dir, relative_path, entry.name)
-            if profiler_pattern is None or not self._is_valid_profiler_directory(full_dir_path):
+            is_valid_profiler_dir, profiler_type = self._is_valid_profiler_directory(full_dir_path)
+            if profiler_pattern is None or not is_valid_profiler_dir:
                 return
 
             profiler = {
                 'directory': os.path.join('.', entry.name),
                 'ctime': ctime,
                 'mtime': mtime,
+                "profiler_type": profiler_type
             }
 
             summary_dict[relative_path] = {
@@ -286,19 +289,20 @@ class SummaryWatcher:
             profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX, entry.name)
             if profiler_pattern is not None and entry.is_dir():
                 full_path = os.path.realpath(os.path.join(summary_directory, entry.name))
-                if self._is_valid_profiler_directory(full_path):
+                if self._is_valid_profiler_directory(full_path)[0]:
                     return True
 
         return False
 
     def _is_valid_profiler_directory(self, directory):
+        profiler_type = ""  # Stays empty when the profiler util import fails.
         try:
             from mindinsight.profiler.common.util import analyse_device_list_from_profiler_dir
-            device_list = analyse_device_list_from_profiler_dir(directory)
+            device_list, profiler_type = analyse_device_list_from_profiler_dir(directory)
         except ImportError:
             device_list = []
 
-        return bool(device_list)
+        return bool(device_list), profiler_type  # (has devices, profiler type)
 
     def list_summary_directories_by_pagination(self, summary_base_dir, offset=0, limit=10):
         """
diff --git a/mindinsight/datavisual/processors/train_task_manager.py b/mindinsight/datavisual/processors/train_task_manager.py
index f0e780f..178739b 100644
--- a/mindinsight/datavisual/processors/train_task_manager.py
+++ b/mindinsight/datavisual/processors/train_task_manager.py
@@ -144,6 +144,7 @@ class TrainTaskManager(BaseProcessor):
             update_time=basic_info.update_time.strftime('%Y-%m-%d %H:%M:%S'),
             profiler_dir=basic_info.profiler_dir,
             cache_status=train_job.cache_status.value,
+            profiler_type=basic_info.profiler_type,
         )
 
         if train_job.cache_status == CacheStatus.CACHED:
diff --git a/mindinsight/profiler/analyser/__init__.py b/mindinsight/profiler/analyser/__init__.py
index bc2cf23..00613da 100644
--- a/mindinsight/profiler/analyser/__init__.py
+++ b/mindinsight/profiler/analyser/__init__.py
@@ -14,4 +14,4 @@
 # ============================================================================
 """The analyser module."""
 from . import analyser, minddata_pipeline_analyser, step_trace_analyser, \
-    minddata_analyser, timeline_analyser
+    minddata_analyser, timeline_analyser, gpu_analyser
diff --git a/mindinsight/profiler/analyser/gpu_analyser.py b/mindinsight/profiler/analyser/gpu_analyser.py
new file mode 100644
index 0000000..66aa94a
--- /dev/null
+++ b/mindinsight/profiler/analyser/gpu_analyser.py
@@ -0,0 +1,129 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""The gpu base analyser."""
+import csv
+import os
+
+from mindinsight.profiler.analyser.base_analyser import BaseAnalyser
+from mindinsight.profiler.common.log import logger
+
+
+class GpuAnalyser(BaseAnalyser):
+    """Base analyser that loads one per-device GPU profiling CSV file."""
+    # File name template; `{}` is replaced by the device id. Set by subclasses.
+    _csv_file_to_analyse = ""
+
+    def _load(self):
+        """Load the rows of the per-device CSV file into `self._data`."""
+        csv_file_name = self._csv_file_to_analyse.format(self._device_id)
+        csv_file_path = os.path.join(self._profiling_dir, csv_file_name)
+        if not os.path.isfile(csv_file_path):
+            logger.warning('The file <%s> does not exist.', csv_file_path)
+            return
+
+        # `newline=''` is what the csv module expects of files passed to a reader.
+        with open(csv_file_path, 'r', newline='') as file:
+            csv_reader = csv.reader(file)
+            # Skip the header row; the default avoids StopIteration on an empty file.
+            next(csv_reader, None)
+            for info in csv_reader:
+                self._data.append(self._convert_field_type(info))
+
+    @staticmethod
+    def _convert_field_type(row):
+        """
+        Convert the fields of one CSV row; the base class returns them as is.
+
+        Args:
+            row (list): One row data from parsed data.
+
+        Returns:
+            list, the converted data.
+        """
+        return row
+
+    def _filter(self, filter_condition):
+        """
+        Filter the loaded rows and store the matches in `self._result`.
+
+        Args:
+            filter_condition (dict): The filter condition.
+        """
+        # `_default_filter` is not defined in this class; presumably inherited.
+        self._result = [item for item in self._data
+                        if self._default_filter(item, filter_condition)]
+
+
+class GpuOpTypeAnalyser(GpuAnalyser):
+    """Analyser for the `gpu_op_type_info_{}.csv` file."""
+    _csv_file_to_analyse = 'gpu_op_type_info_{}.csv'
+    _col_names = ["op_type", "type_occurrences", "total_time", "proportion", "avg_time"]
+
+    @staticmethod
+    def _convert_field_type(row):
+        """
+        Cast the string fields of one row to their column types.
+
+        Args:
+            row (list): One row read from the CSV file.
+
+        Returns:
+            list, field 0 unchanged, field 1 as int, fields 2-4 as float.
+        """
+        return [row[0], int(row[1])] + [float(row[idx]) for idx in (2, 3, 4)]
+
+
+class GpuOpInfoAnalyser(GpuAnalyser):
+    """Analyser for the `gpu_op_detail_info_{}.csv` file."""
+    _csv_file_to_analyse = 'gpu_op_detail_info_{}.csv'
+    _col_names = ["op_side", "op_type", "op_name", "op_full_name",
+                  "op_occurrences", "op_total_time", "op_avg_time",
+                  "proportion", "cuda_activity_cost_time", "cuda_activity_call_count"]
+
+    @staticmethod
+    def _convert_field_type(row):
+        """
+        Cast the string fields of one row to their column types.
+
+        Args:
+            row (list): One row read from the CSV file.
+
+        Returns:
+            list, fields 0-3 unchanged, 4 and 9 as int, 5-8 as float.
+        """
+        return ([row[0], row[1], row[2], row[3], int(row[4])]
+                + [float(row[idx]) for idx in range(5, 9)] + [int(row[9])])
+
+
+class GpuCudaActivityAnalyser(GpuAnalyser):
+    """Analyser for the `gpu_activity_data_{}.csv` file."""
+    _csv_file_to_analyse = 'gpu_activity_data_{}.csv'
+    _col_names = ["name", "type", "op_full_name", "stream_id",
+                  "block_dim", "grid_dim", "occurrences", "total_duration",
+                  "avg_duration", "max_duration", "min_duration"]
+
+    @staticmethod
+    def _convert_field_type(row):
+        """
+        Cast the string fields of one row to their column types.
+
+        Args:
+            row (list): One row read from the CSV file.
+
+        Returns:
+            list, fields 0-5 unchanged, field 6 as int, fields 7-10 as float.
+        """
+        return ([row[0], row[1], row[2], row[3], row[4], row[5], int(row[6])]
+                + [float(row[idx]) for idx in range(7, 11)])
diff --git a/mindinsight/profiler/common/util.py b/mindinsight/profiler/common/util.py
index ea4963e..4612280 100644
--- a/mindinsight/profiler/common/util.py
+++ b/mindinsight/profiler/common/util.py
@@ -36,8 +36,10 @@ def analyse_device_list_from_profiler_dir(profiler_dir):
         list, the device_id list.
     """
     profiler_file_prefix = ["timeline_display", "output_op_compute_time"]
+    gpu_profiler_file_prefix = ["gpu_op_detail_info", "gpu_activity_data", "gpu_op_type_info"]
 
     device_id_list = set()
+    gpu_device_id_list = set()
     for _, _, filenames in os.walk(profiler_dir):
         for filename in filenames:
             if filename.startswith("step_trace_raw"):
@@ -51,8 +53,19 @@ def analyse_device_list_from_profiler_dir(profiler_dir):
 
             if device_num.isdigit() and '_'.join(items[:-1]) in profiler_file_prefix:
                 device_id_list.add(device_num)
-
-    return sorted(list(device_id_list))
+            elif device_num.isdigit() and '_'.join(items[:-1]) in gpu_profiler_file_prefix:
+                gpu_device_id_list.add(device_num)
+
+    if device_id_list:
+        result_list = sorted(list(device_id_list))
+        profiler_type = "ascend"
+    elif gpu_device_id_list:
+        result_list = sorted(list(gpu_device_id_list))
+        profiler_type = "gpu"
+    else:
+        result_list = []
+        profiler_type = ""
+    return result_list, profiler_type
 
 
 def query_latest_trace_time_file(profiler_dir, device_id=0):
diff --git a/mindinsight/profiler/common/validator/validate.py b/mindinsight/profiler/common/validator/validate.py
index 90fdc16..84f5c15 100644
--- a/mindinsight/profiler/common/validator/validate.py
+++ b/mindinsight/profiler/common/validator/validate.py
@@ -27,6 +27,13 @@ AICORE_TYPE_COL = ["op_type", "execution_time", "execution_frequency", "precent"
 AICORE_DETAIL_COL = ["op_name", "op_type", "avg_execution_time", "subgraph", "full_op_name"]
 AICPU_COL = ["serial_number", "op_type", "total_time", "dispatch_time", "run_start",
              "run_end"]
+GPU_TYPE_COL = ["op_type", "type_occurrences", "total_time", "proportion", "avg_time"]
+GPU_ACTIVITY_COL = ["name", "type", "op_full_name", "stream_id",
+                    "block_dim", "grid_dim", "occurrences", "total_duration",
+                    "avg_duration", "max_duration", "min_duration"]
+GPU_DETAIL_COL = ["op_side", "op_type", "op_name", "op_full_name",
+                  "op_occurrences", "op_total_time", "op_avg_time",
+                  "proportion", "cuda_activity_cost_time", "cuda_activity_call_count"]
 MINDDATA_PIPELINE_COL = [
     'op_id', 'op_type', 'num_workers', 'output_queue_average_size',
     'output_queue_length', 'output_queue_usage_rate', 'sample_interval',
@@ -67,10 +74,20 @@ def validate_condition(search_condition):
             search_scope = AICORE_TYPE_COL
         elif op_type == "aicore_detail":
             search_scope = AICORE_DETAIL_COL
+        elif op_type == "gpu_op_type":
+            search_scope = GPU_TYPE_COL
+        elif op_type == "gpu_op_info":
+            search_scope = GPU_DETAIL_COL
+        elif op_type == "gpu_cuda_activity":
+            search_scope = GPU_ACTIVITY_COL
         else:
-            raise ProfilerOpTypeException("The op_type must in ['aicpu', 'aicore_type', 'aicore_detail']")
+            raise ProfilerOpTypeException(
+                "The op_type must in ['aicpu', 'aicore_type', 'aicore_detail', "
+                "'gpu_op_type', 'gpu_op_info', 'gpu_cuda_activity']")
     else:
-        raise ProfilerOpTypeException("The op_type must in ['aicpu', 'aicore_type', 'aicore_detail']")
+        raise ProfilerOpTypeException(
+            "The op_type must in ['aicpu', 'aicore_type', 'aicore_detail', "
+            "'gpu_op_type', 'gpu_op_info', 'gpu_cuda_activity']")
 
     if "group_condition" in search_condition:
         validate_group_condition(search_condition)
@@ -199,8 +216,6 @@ def validate_filter_condition(search_condition):
         if "op_name" in filter_condition:
             op_name_condition = filter_condition.get("op_name")
             validate_op_filter_condition(op_name_condition)
-        if "op_type" not in filter_condition and "op_name" not in filter_condition:
-            raise ProfilerFilterConditionException("The key of filter_condition is not support")
 
 
 def validate_and_set_job_id_env(job_id_env):
-- 
GitLab