提交 d8cbf988 编写于 作者: M mindspore-ci-bot 提交者: Gitee

!525 Add gpu profiler.

Merge pull request !525 from yuximiao/yuximiao_gpu_profiler
...@@ -114,7 +114,7 @@ def get_profile_device_list(): ...@@ -114,7 +114,7 @@ def get_profile_device_list():
except ValidationError: except ValidationError:
raise ParamValueError("Invalid profiler dir") raise ParamValueError("Invalid profiler dir")
device_list = analyse_device_list_from_profiler_dir(profiler_dir_abs) device_list, _ = analyse_device_list_from_profiler_dir(profiler_dir_abs)
return jsonify(device_list) return jsonify(device_list)
......
...@@ -59,14 +59,17 @@ class _BasicTrainJob: ...@@ -59,14 +59,17 @@ class _BasicTrainJob:
create_time (DateTime): The create time of summary directory. create_time (DateTime): The create time of summary directory.
update_time (DateTime): The latest modify time of summary files directly in the summary directory. update_time (DateTime): The latest modify time of summary files directly in the summary directory.
profiler_dir (str): The relative path of profiler directory. profiler_dir (str): The relative path of profiler directory.
profiler_type (str): The profiler device type.
""" """
def __init__(self, train_id, abs_summary_base_dir, abs_summary_dir, create_time, update_time, profiler_dir): def __init__(self, train_id, abs_summary_base_dir, abs_summary_dir, create_time, update_time, profiler_dir,
profiler_type=""):
self._train_id = train_id self._train_id = train_id
self._abs_summary_base_dir = abs_summary_base_dir self._abs_summary_base_dir = abs_summary_base_dir
self._abs_summary_dir = abs_summary_dir self._abs_summary_dir = abs_summary_dir
self._create_time = create_time self._create_time = create_time
self._update_time = update_time self._update_time = update_time
self._profiler_dir = profiler_dir self._profiler_dir = profiler_dir
self._profiler_type = profiler_type
@property @property
def abs_summary_dir(self): def abs_summary_dir(self):
...@@ -98,6 +101,11 @@ class _BasicTrainJob: ...@@ -98,6 +101,11 @@ class _BasicTrainJob:
"""Get update time.""" """Get update time."""
return self._update_time return self._update_time
@property
def profiler_type(self):
    """Get the profiler device type recorded for this train job."""
    return self._profiler_type
class CachedTrainJob: class CachedTrainJob:
""" """
...@@ -952,6 +960,7 @@ class DataManager: ...@@ -952,6 +960,7 @@ class DataManager:
create_time=info['create_time'], create_time=info['create_time'],
update_time=info['update_time'], update_time=info['update_time'],
profiler_dir=None if profiler is None else profiler['directory'], profiler_dir=None if profiler is None else profiler['directory'],
profiler_type="" if profiler is None else profiler['profiler_type'],
)) ))
self._brief_cache.update_cache(basic_train_jobs) self._brief_cache.update_cache(basic_train_jobs)
......
...@@ -109,6 +109,7 @@ class SummaryWatcher: ...@@ -109,6 +109,7 @@ class SummaryWatcher:
'directory': profiler['directory'], 'directory': profiler['directory'],
'create_time': profiler['ctime'], 'create_time': profiler['ctime'],
'update_time': profiler['mtime'], 'update_time': profiler['mtime'],
'profiler_type': profiler['profiler_type']
} }
directories.append(directory) directories.append(directory)
...@@ -226,13 +227,15 @@ class SummaryWatcher: ...@@ -226,13 +227,15 @@ class SummaryWatcher:
elif entry.is_dir(): elif entry.is_dir():
profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX, entry.name) profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX, entry.name)
full_dir_path = os.path.join(summary_base_dir, relative_path, entry.name) full_dir_path = os.path.join(summary_base_dir, relative_path, entry.name)
if profiler_pattern is None or not self._is_valid_profiler_directory(full_dir_path): is_valid_profiler_dir, profiler_type = self._is_valid_profiler_directory(full_dir_path)
if profiler_pattern is None or not is_valid_profiler_dir:
return return
profiler = { profiler = {
'directory': os.path.join('.', entry.name), 'directory': os.path.join('.', entry.name),
'ctime': ctime, 'ctime': ctime,
'mtime': mtime, 'mtime': mtime,
"profiler_type": profiler_type
} }
summary_dict[relative_path] = { summary_dict[relative_path] = {
...@@ -286,19 +289,20 @@ class SummaryWatcher: ...@@ -286,19 +289,20 @@ class SummaryWatcher:
profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX, entry.name) profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX, entry.name)
if profiler_pattern is not None and entry.is_dir(): if profiler_pattern is not None and entry.is_dir():
full_path = os.path.realpath(os.path.join(summary_directory, entry.name)) full_path = os.path.realpath(os.path.join(summary_directory, entry.name))
if self._is_valid_profiler_directory(full_path): if self._is_valid_profiler_directory(full_path)[0]:
return True return True
return False return False
def _is_valid_profiler_directory(self, directory): def _is_valid_profiler_directory(self, directory):
profiler_type = ""
try: try:
from mindinsight.profiler.common.util import analyse_device_list_from_profiler_dir from mindinsight.profiler.common.util import analyse_device_list_from_profiler_dir
device_list = analyse_device_list_from_profiler_dir(directory) device_list, profiler_type = analyse_device_list_from_profiler_dir(directory)
except ImportError: except ImportError:
device_list = [] device_list = []
return bool(device_list) return bool(device_list), profiler_type
def list_summary_directories_by_pagination(self, summary_base_dir, offset=0, limit=10): def list_summary_directories_by_pagination(self, summary_base_dir, offset=0, limit=10):
""" """
......
...@@ -144,6 +144,7 @@ class TrainTaskManager(BaseProcessor): ...@@ -144,6 +144,7 @@ class TrainTaskManager(BaseProcessor):
update_time=basic_info.update_time.strftime('%Y-%m-%d %H:%M:%S'), update_time=basic_info.update_time.strftime('%Y-%m-%d %H:%M:%S'),
profiler_dir=basic_info.profiler_dir, profiler_dir=basic_info.profiler_dir,
cache_status=train_job.cache_status.value, cache_status=train_job.cache_status.value,
profiler_type=basic_info.profiler_type,
) )
if train_job.cache_status == CacheStatus.CACHED: if train_job.cache_status == CacheStatus.CACHED:
......
...@@ -14,4 +14,4 @@ ...@@ -14,4 +14,4 @@
# ============================================================================ # ============================================================================
"""The analyser module.""" """The analyser module."""
from . import analyser, minddata_pipeline_analyser, step_trace_analyser, \ from . import analyser, minddata_pipeline_analyser, step_trace_analyser, \
minddata_analyser, timeline_analyser minddata_analyser, timeline_analyser, gpu_analyser
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The gpu base analyser."""
import csv
import os
from mindinsight.profiler.analyser.base_analyser import BaseAnalyser
from mindinsight.profiler.common.log import logger
class GpuAnalyser(BaseAnalyser):
    """
    Gpu base analyser.

    Loads one per-device CSV file of parsed GPU profiling data. Subclasses set
    ``_csv_file_to_analyse`` to a file name template containing one ``{}``
    placeholder (filled with the device id) and override
    ``_convert_field_type`` to cast the CSV string fields.
    """
    # File name template of the CSV file to load; empty here, set by subclasses.
    _csv_file_to_analyse = ""

    def _load(self):
        """
        Load the parsed GPU profiling CSV file into ``self._data``.

        The file is looked up in ``self._profiling_dir``. When it does not
        exist a warning is logged and nothing is loaded. The first row is
        treated as the header and skipped; an empty file loads no rows
        instead of raising ``StopIteration``. Blank lines are ignored.
        """
        csv_file_path = os.path.join(
            self._profiling_dir,
            self._csv_file_to_analyse.format(self._device_id)
        )
        if not os.path.isfile(csv_file_path):
            logger.warning('The file <%s> does not exist.', csv_file_path)
            return

        # newline='' leaves line-ending handling to the csv module, as its
        # documentation requires for file objects passed to csv.reader.
        with open(csv_file_path, 'r', newline='') as file:
            csv_reader = csv.reader(file)
            # Skip the header row; the default keeps an empty file from
            # raising StopIteration.
            next(csv_reader, None)
            for info in csv_reader:
                # csv.reader yields [] for a blank line; such a row carries
                # no fields for the converters to cast.
                if not info:
                    continue
                self._data.append(self._convert_field_type(info))

    @staticmethod
    def _convert_field_type(row):
        """
        Convert the field type to the specific type.

        The base implementation returns the row unchanged.

        Args:
            row (list): One row data from parsed data.

        Returns:
            list, the converted data.
        """
        return row

    def _filter(self, filter_condition):
        """
        Filter the profiling data according to the filter condition.

        Rows of ``self._data`` accepted by ``self._default_filter`` are stored
        in ``self._result``.

        Args:
            filter_condition (dict): The filter condition.
        """
        def _inner_filter(item: list):
            return self._default_filter(item, filter_condition)

        self._result = list(filter(_inner_filter, self._data))
class GpuOpTypeAnalyser(GpuAnalyser):
    """Analyser for the GPU operator type CSV file."""
    _col_names = [
        "op_type",
        "type_occurrences",
        "total_time",
        "proportion",
        "avg_time",
    ]
    _csv_file_to_analyse = 'gpu_op_type_info_{}.csv'

    @staticmethod
    def _convert_field_type(row):
        """
        Cast the fields of one parsed row to their column types.

        Args:
            row (list): One row data from parsed data.

        Returns:
            list, the first field unchanged, the second as int and the
            following three as float.
        """
        # One entry per output column; None keeps the field as it is.
        casts = (None, int, float, float, float)
        return [row[idx] if cast is None else cast(row[idx])
                for idx, cast in enumerate(casts)]
class GpuOpInfoAnalyser(GpuAnalyser):
    """Analyser for the GPU operator detail info CSV file."""
    _col_names = [
        "op_side",
        "op_type",
        "op_name",
        "op_full_name",
        "op_occurrences",
        "op_total_time",
        "op_avg_time",
        "proportion",
        "cuda_activity_cost_time",
        "cuda_activity_call_count",
    ]
    _csv_file_to_analyse = 'gpu_op_detail_info_{}.csv'

    @staticmethod
    def _convert_field_type(row):
        """
        Cast the fields of one parsed row to their column types.

        Args:
            row (list): One row data from parsed data.

        Returns:
            list, fields 0-3 unchanged, field 4 as int, fields 5-8 as float
            and field 9 as int.
        """
        # One entry per output column; None keeps the field as it is.
        casts = (None,) * 4 + (int,) + (float,) * 4 + (int,)
        return [row[idx] if cast is None else cast(row[idx])
                for idx, cast in enumerate(casts)]
class GpuCudaActivityAnalyser(GpuAnalyser):
    """Analyser for the GPU CUDA activity CSV file."""
    _col_names = [
        "name",
        "type",
        "op_full_name",
        "stream_id",
        "block_dim",
        "grid_dim",
        "occurrences",
        "total_duration",
        "avg_duration",
        "max_duration",
        "min_duration",
    ]
    _csv_file_to_analyse = 'gpu_activity_data_{}.csv'

    @staticmethod
    def _convert_field_type(row):
        """
        Cast the fields of one parsed row to their column types.

        Args:
            row (list): One row data from parsed data.

        Returns:
            list, fields 0-5 unchanged, field 6 as int and fields 7-10 as
            float.
        """
        # One entry per output column; None keeps the field as it is.
        casts = (None,) * 6 + (int,) + (float,) * 4
        return [row[idx] if cast is None else cast(row[idx])
                for idx, cast in enumerate(casts)]
...@@ -36,8 +36,10 @@ def analyse_device_list_from_profiler_dir(profiler_dir): ...@@ -36,8 +36,10 @@ def analyse_device_list_from_profiler_dir(profiler_dir):
list, the device_id list. list, the device_id list.
""" """
profiler_file_prefix = ["timeline_display", "output_op_compute_time"] profiler_file_prefix = ["timeline_display", "output_op_compute_time"]
gpu_profiler_file_prefix = ["gpu_op_detail_info", "gpu_activity_data", "gpu_op_type_info"]
device_id_list = set() device_id_list = set()
gpu_device_id_list = set()
for _, _, filenames in os.walk(profiler_dir): for _, _, filenames in os.walk(profiler_dir):
for filename in filenames: for filename in filenames:
if filename.startswith("step_trace_raw"): if filename.startswith("step_trace_raw"):
...@@ -51,8 +53,19 @@ def analyse_device_list_from_profiler_dir(profiler_dir): ...@@ -51,8 +53,19 @@ def analyse_device_list_from_profiler_dir(profiler_dir):
if device_num.isdigit() and '_'.join(items[:-1]) in profiler_file_prefix: if device_num.isdigit() and '_'.join(items[:-1]) in profiler_file_prefix:
device_id_list.add(device_num) device_id_list.add(device_num)
elif device_num.isdigit() and '_'.join(items[:-1]) in gpu_profiler_file_prefix:
return sorted(list(device_id_list)) gpu_device_id_list.add(device_num)
if device_id_list:
result_list = sorted(list(device_id_list))
profiler_type = "ascend"
elif gpu_device_id_list:
result_list = sorted(list(gpu_device_id_list))
profiler_type = "gpu"
else:
result_list = []
profiler_type = ""
return result_list, profiler_type
def query_latest_trace_time_file(profiler_dir, device_id=0): def query_latest_trace_time_file(profiler_dir, device_id=0):
......
...@@ -27,6 +27,13 @@ AICORE_TYPE_COL = ["op_type", "execution_time", "execution_frequency", "precent" ...@@ -27,6 +27,13 @@ AICORE_TYPE_COL = ["op_type", "execution_time", "execution_frequency", "precent"
AICORE_DETAIL_COL = ["op_name", "op_type", "avg_execution_time", "subgraph", "full_op_name"] AICORE_DETAIL_COL = ["op_name", "op_type", "avg_execution_time", "subgraph", "full_op_name"]
AICPU_COL = ["serial_number", "op_type", "total_time", "dispatch_time", "run_start", AICPU_COL = ["serial_number", "op_type", "total_time", "dispatch_time", "run_start",
"run_end"] "run_end"]
GPU_TYPE_COL = ["op_type", "type_occurrences", "total_time", "proportion", "avg_time"]
GPU_ACTIVITY_COL = ["name", "type", "op_full_name", "stream_id",
"block_dim", "grid_dim", "occurrences", "total_duration",
"avg_duration", "max_duration", "min_duration"]
GPU_DETAIL_COL = ["op_side", "op_type", "op_name", "op_full_name",
"op_occurrences", "op_total_time", "op_avg_time",
"proportion", "cuda_activity_cost_time", "cuda_activity_call_count"]
MINDDATA_PIPELINE_COL = [ MINDDATA_PIPELINE_COL = [
'op_id', 'op_type', 'num_workers', 'output_queue_average_size', 'op_id', 'op_type', 'num_workers', 'output_queue_average_size',
'output_queue_length', 'output_queue_usage_rate', 'sample_interval', 'output_queue_length', 'output_queue_usage_rate', 'sample_interval',
...@@ -67,10 +74,20 @@ def validate_condition(search_condition): ...@@ -67,10 +74,20 @@ def validate_condition(search_condition):
search_scope = AICORE_TYPE_COL search_scope = AICORE_TYPE_COL
elif op_type == "aicore_detail": elif op_type == "aicore_detail":
search_scope = AICORE_DETAIL_COL search_scope = AICORE_DETAIL_COL
elif op_type == "gpu_op_type":
search_scope = GPU_TYPE_COL
elif op_type == "gpu_op_info":
search_scope = GPU_DETAIL_COL
elif op_type == "gpu_cuda_activity":
search_scope = GPU_ACTIVITY_COL
else: else:
raise ProfilerOpTypeException("The op_type must in ['aicpu', 'aicore_type', 'aicore_detail']") raise ProfilerOpTypeException(
"The op_type must in ['aicpu', 'aicore_type', 'aicore_detail', "
"'gpu_op_type', 'gpu_op_info', 'gpu_cuda_activity']")
else: else:
raise ProfilerOpTypeException("The op_type must in ['aicpu', 'aicore_type', 'aicore_detail']") raise ProfilerOpTypeException(
"The op_type must in ['aicpu', 'aicore_type', 'aicore_detail', "
"'gpu_op_type', 'gpu_op_info', 'gpu_cuda_activity']")
if "group_condition" in search_condition: if "group_condition" in search_condition:
validate_group_condition(search_condition) validate_group_condition(search_condition)
...@@ -199,8 +216,6 @@ def validate_filter_condition(search_condition): ...@@ -199,8 +216,6 @@ def validate_filter_condition(search_condition):
if "op_name" in filter_condition: if "op_name" in filter_condition:
op_name_condition = filter_condition.get("op_name") op_name_condition = filter_condition.get("op_name")
validate_op_filter_condition(op_name_condition) validate_op_filter_condition(op_name_condition)
if "op_type" not in filter_condition and "op_name" not in filter_condition:
raise ProfilerFilterConditionException("The key of filter_condition is not support")
def validate_and_set_job_id_env(job_id_env): def validate_and_set_job_id_env(job_id_env):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册