diff --git a/mindinsight/profiler/README.md b/mindinsight/profiler/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c5eb20f09425f15284fe1b4f78515f78f19f1ce9 --- /dev/null +++ b/mindinsight/profiler/README.md @@ -0,0 +1,68 @@ +# MindInsight Profiler Introduction + +MindInsight Profiler is a performance analysis tool for MindSpore. It can help to analyse and optimize the performance of the neural networks. + +The Profiler enables users to: + +* Start/finish profiling the neural networks by adding two simple Profiler apis to the script. +* Analyse the performance of the operators in the neural network. + +## Add profiling code to MindSpore script + +To enable profiling on MindSpore, the MindInsight Profiler apis should be added to the script: + +1. Import MindInsight Profiler + + from mindinsight.profiler import Profiler + +2. Initialize the Profiler before training + + Example: + + profiler = Profiler(output_path="./data", is_detail=True, is_show_op_path=False, subgraph='All') + + Parameters including: + + subgraph (str): Defines which subgraph to monitor and analyse, can be 'all', 'Default', 'Gradients'. + is_detail (bool): Whether to show profiling data for op_instance level, only show optype level if False. + is_show_op_path (bool): Whether to save the full path for each op instance. + output_path (str): Output data path. + optypes_to_deal (list): Op type names, the data of which optype should be collected and analysed, + will deal with all op if null. + optypes_not_deal (list): Op type names, the data of which optype will not be collected and analysed. + +3. Call Profiler.analyse() at the end of the program + + Profiler.analyse() will collect profiling data and generate the analysis results. + +After training, we can open MindInsight UI to analyse the performance. + +## Operator Performance Analysis + +The operator performance analysis component is used to display the execution time of the operators during MindSpore run. 
+ + ![op_type_statistics.png](./images/op_type_statistics.PNG) + +Figure 17: Statistics for Operator Types + +Figure 17 displays the statistics for the operator types, including: + +- Choose pie or bar graph to show the proportion of time occupied by each operator type. The time of one operator type is calculated by accumulating the execution time of operators belonging to this type. +- Display the top 20 operator types with the longest execution time, and show the proportion and execution time (ms) of each operator type. + +![op_statistics.png](./images/op_statistics.PNG) + +Figure 18: Statistics for Operators + +Figure 18 displays the statistics table for the operators, including: + +- Choose All: Display statistics for the operators, including operator name, type, execution time, full scope time, information, etc. The table will be sorted by execution time by default. +- Choose Type: Display statistics for the operator types, including operator type name, execution time, execution frequency and proportion of total time. Users can click on each line, querying for all the operators belonging to this type. +- Search: There is a search box on the right, which supports fuzzy search for operators/operator types. + +## Limitations + +The Profiler currently has the following limitations: + +* Only programs running on Ascend chips are supported. +* To limit the data size generated by the Profiler, MindInsight suggests that for large neural networks, the number of profiled steps should be kept below 10. diff --git a/mindinsight/profiler/__init__.py b/mindinsight/profiler/__init__.py index e30774307ca2107b3a81c071ad33c042ef924790..e7b3674b19fcfe7508392f5accbfa452e3a37007 --- a/mindinsight/profiler/__init__.py +++ b/mindinsight/profiler/__init__.py @@ -12,3 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================ +""" +Profiler Module Introduction + +This module provides Python APIs to enable the profiling of MindSpore neural networks. +Users can import the mindinsight.profiler.Profiler, initialize the Profiler object to start profiling, +and use Profiler.analyse() to stop profiling and analyse the results. +To visualize the profiling results, users can open MindInsight Web, find the corresponding run +and click the profile link. +Now, Profiler supports the AICore operator analysis. +""" +from mindinsight.profiler.profiling import Profiler + +__all__ = ["Profiler"] diff --git a/mindinsight/profiler/images/op_statistics.PNG b/mindinsight/profiler/images/op_statistics.PNG new file mode 100644 index 0000000000000000000000000000000000000000..05a146e1ffd5f732ad0fb8c80bd9abe81fb65ab4 Binary files /dev/null and b/mindinsight/profiler/images/op_statistics.PNG differ diff --git a/mindinsight/profiler/images/op_type_statistics.PNG b/mindinsight/profiler/images/op_type_statistics.PNG new file mode 100644 index 0000000000000000000000000000000000000000..6d18ccaa0f393938c8f89ca7c20e21e5ff496b4a Binary files /dev/null and b/mindinsight/profiler/images/op_type_statistics.PNG differ diff --git a/mindinsight/profiler/parser/hwts_log_parser.py b/mindinsight/profiler/parser/hwts_log_parser.py index 1b1d14ee52310c6cf4c801006740e8ef7404ed97..64413c5b594d34e602d259c74ffd0f7488a7e0b1 100755 --- a/mindinsight/profiler/parser/hwts_log_parser.py +++ b/mindinsight/profiler/parser/hwts_log_parser.py @@ -18,13 +18,14 @@ from tabulate import tabulate from mindinsight.profiler.common._utils import fwrite_format, get_file_join_name from mindinsight.profiler.common.log import logger + class HWTSLogParser: """ The Parser for hwts log files. Args: - _input_path(str): The profiling job path. Such as: '/var/log/npu/profiling/JOBAIFGJEJFEDCBAEADIFJAAAAAAAAAA". - output_filename(str): The output data path and name. 
Such as: './output_format_data_hwts_0.txt'. + _input_path (str): The profiling job path. Such as: '/var/log/npu/profiling/JOBAIFGJEJFEDCBAEADIFJAAAAAAAAAA". + output_filename (str): The output data path and name. Such as: './output_format_data_hwts_0.txt'. """ _source_file_target = 'hwts.log.data.45.dev.profiler_default_tag' @@ -53,7 +54,7 @@ class HWTSLogParser: Execute the parser, get result data, and write it to the output file. Returns: - bool: whether succeed to analyse hwts log. + bool, whether succeed to analyse hwts log. """ content_format = ['QIIIIIIIIIIII', 'QIIQIIIIIIII', 'IIIIQIIIIIIII'] diff --git a/mindinsight/profiler/parser/optime_parser.py b/mindinsight/profiler/parser/optime_parser.py index fe1830528d5fdc9a8ab18afa8525b73f49fa2f9d..6b2436b423a437d2c8567128ec546db477ed271b 100755 --- a/mindinsight/profiler/parser/optime_parser.py +++ b/mindinsight/profiler/parser/optime_parser.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -"""op compute time files parser""" +"""Op compute time files parser.""" from tabulate import tabulate from mindinsight.profiler.common._utils import fwrite_format @@ -21,9 +21,9 @@ class OPComputeTimeParser: Join hwts info and framework info, get op time info, and output to the result file. Args: - hwts_output_file(str): The file path of hwts_output_file. Such as: './output_format_data_hwts_0.txt". - output_filename(str): The output data file path and name. Such as: './output_op_compute_time_0.txt'. - op_task_info(dict): The task and op relation info. format as: {taskid, [opname, streamid, block dim]}. + hwts_output_file (str): The file path of hwts_output_file. Such as: './output_format_data_hwts_0.txt". + output_filename (str): The output data file path and name. Such as: './output_op_compute_time_0.txt'. + op_task_info (dict): The task and op relation info. 
The format: {task_id, [opname, stream_id, block dim]}. """ _dst_file_title = 'title:op compute time' @@ -79,7 +79,7 @@ class OPComputeTimeParser: if op_start[1] == "Start" and op_end[1] == "End"\ and op_start[0] == op_end[0]: - # op_name, taskId, cycle counter, streamId + # op_name, task_id, cycle counter, stream_id tmp_result_data.append([op_start[0], op_start[2], int(op_end[3]) - int(op_start[3]), op_start[4]]) cur_index += 2 else: @@ -103,7 +103,6 @@ class OPComputeTimeParser: op_name_task_dict[item[0]] = item[1] op_name_count_dict[item[0]] = 1 - for op_name, time in op_name_time_dict.items(): if op_name in op_name_steamid_dict.keys(): stream_id = op_name_steamid_dict[op_name] diff --git a/mindinsight/profiler/profiling.py b/mindinsight/profiler/profiling.py index 3d0fd99db181c504b61b3fd93c46cc30f37a609e..741517558c7161df35e6fae182ebbd00b5323f0c 100644 --- a/mindinsight/profiler/profiling.py +++ b/mindinsight/profiler/profiling.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -"""profiling api file.""" +"""Profiling api file.""" import os import time from tabulate import tabulate @@ -30,10 +30,33 @@ from mindinsight.profiler.common.validator.checkparam import \ from mindinsight.profiler.common.log import logger from mindinsight.utils.exceptions import MindInsightException -profiling_log_base_path = "/var/log/npu/profiling" +PROFILING_LOG_BASE_PATH = "/var/log/npu/profiling" + class Profiler: - """Performance profiling tool.""" + """ + Performance profiling API. + + Enable MindSpore users to profile the neural network. + + Args: + subgraph (str): Defines which subgraph to monitor and analyse, can be 'all', 'Default', 'Gradients'. + is_detail (bool): Whether to show profiling data for op_instance level, only show optype level if False. + is_show_op_path (bool): Whether to save the full path for each op instance. 
+ output_path (str): Output data path. + optypes_to_deal (list): Op type names, the data of which optype should be collected and analysed, + will deal with all op if null. + optypes_not_deal (list): Op type names, the data of which optype will not be collected and analysed. + + Examples: + >>> from mindinsight.profiler import Profiler + >>> profiler = Profiler(subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data') + >>> model = Model(train_network) + >>> dataset = get_dataset() + >>> model.train(2, dataset) + >>> profiler.analyse() + """ + _base_profiling_container_path = "/var/log/npu/profiling/container" _hwts_output_filename_target = "output_format_data_hwts_" _opcompute_output_filename_target = "output_op_compute_time_" @@ -41,19 +64,6 @@ class Profiler: def __init__(self, subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data', optypes_to_deal='', optypes_not_deal='Variable', job_id=""): - """ - Init profiling service, called berfore network training. - - Args: - subgraph(str): which subgraph to monit and anlayse, can be 'all', 'Default', 'Gradients'. - is_detail(Bool): whether to show profiling data for op_instace level, only show optype level if False. - is_show_op_path(Bool): whether to save the full path for each op instace. - output_path(Bool): output data path. - optypes_to_deal(List): Op type names, the data of which optype should be collected and analysed, - will deal with all op if null. - optypes_not_deal(List): Op type names, the data of which optype will not be collected and analysed. - """ - dev_id = os.getenv('DEVICE_ID') if not dev_id: dev_id = "0" @@ -82,9 +92,18 @@ class Profiler: self._start_time = int(time.time() * 10000000) logger.info("Profiling: profiling start time: %d", self._start_time) - def analyse(self): - """Collect and analyze performance data, called after training or during training.""" + """ + Collect and analyse performance data, called after training or during training. 
+ + Examples: + >>> from mindinsight.profiler import Profiler + >>> profiler = Profiler(subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data') + >>> model = Model(train_network) + >>> dataset = get_dataset() + >>> model.train(2, dataset) + >>> profiler.analyse() + """ try: from mindspore.communication.management import release @@ -96,13 +115,13 @@ class Profiler: job_id = self._get_profiling_job_id() if not job_id: - msg = ("Fail to get profiling job, please check whether job dir was generated under path %s"\ - %profiling_log_base_path) + msg = ("Fail to get profiling job, please check whether job dir was generated under path %s" \ + % PROFILING_LOG_BASE_PATH) raise RuntimeError(msg) logger.info("Profiling: job id is %s ", job_id) - source_path = os.path.join(profiling_log_base_path, job_id) + source_path = os.path.join(PROFILING_LOG_BASE_PATH, job_id) # parse hwts.log.data.45.dev file, and get task profiling data hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt" hwts_output_filename = os.path.join(self._output_path, hwts_output_filename) @@ -154,12 +173,12 @@ class Profiler: return self._profiling_job_id job_id = "" - cmd = "ls -t " + profiling_log_base_path + "|grep JOB|awk '{print $1}'" + cmd = "ls -t " + PROFILING_LOG_BASE_PATH + "|grep JOB|awk '{print $1}'" r = os.popen(cmd) profiling_job_dirs = r.readlines() r.close() for item in profiling_job_dirs: - path = os.path.join(profiling_log_base_path, item.strip()) + path = os.path.join(PROFILING_LOG_BASE_PATH, item.strip()) log_file = get_file_names(path, "host_start.log") if not log_file: logger.error("Profiling: job path %s, host_start.log not exist.", path) @@ -191,10 +210,10 @@ class Profiler: Parse host start log file, get the device id and start time of the job. Args: - input_file(str): the file path of the host start log file. + input_file (str): The file path of the host start log file. Returns: - dict: job start time and device id. 
+ dict, job start time and device id. """ item_dict = {} @@ -207,7 +226,7 @@ class Profiler: return item_dict def _analyser_op_info(self): - """Analyser the operator information.""" + """Analyse the operator information.""" integrator = Integrator(self._output_path, self._dev_id) integrator.integrate()