Commit 7877f33b authored by wangshuide2020

Use multiple processes to calculate events.

1. To accelerate summary file parsing, multiple processes are used. As a first step in MindInsight parsing performance optimization, only the _load_single_file function is changed.

2. This PR improves summary parsing throughput dramatically (by roughly a factor of cpu_count).

3. The changes are mainly to the _load_single_file function.

In the future, a more global concurrent computing framework will be needed for MindInsight; see the Gitee wiki doc for details. A minimal sketch of the approach used here is shown below.
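The core pattern is roughly as follows: events are still read from the summary file sequentially, but each serialized event is submitted to a ProcessPoolExecutor for parsing, and a semaphore keeps the number of unfinished tasks bounded so memory stays flat. The sketch below is a simplified illustration only; load_file and parse_event are placeholder names, not MindInsight APIs.

```python
import concurrent.futures as futures
import os
import threading


def parse_event(event_str):
    """Placeholder for the real per-event work (protobuf deserialization etc.)."""
    return len(event_str)


def load_file(event_strings, workers_count=1):
    """Parse serialized events with a pool of processes, keeping memory bounded."""
    # Share the CPUs among the workers that load files concurrently,
    # falling back to a single process when the division leaves nothing.
    cpu_count = os.cpu_count() or 1
    concurrency = max(cpu_count // workers_count, 1)

    results = []
    semaphore = threading.Semaphore(value=concurrency)

    def _collect(future):
        try:
            results.append(future.result())
        finally:
            # Let the producer loop submit the next event.
            semaphore.release()

    with futures.ProcessPoolExecutor(max_workers=concurrency) as executor:
        for event_str in event_strings:
            # Block here until fewer than `concurrency` tasks are unfinished.
            semaphore.acquire()
            future = executor.submit(parse_event, event_str)
            future.add_done_callback(_collect)
    return results


if __name__ == '__main__':
    # Toy usage: the "events" are just byte strings here.
    print(load_file([b'event-1', b'event-22', b'event-333'], workers_count=1))
```

Because arguments and results cross process boundaries they have to be picklable; this is why _event_parse becomes a static method and TensorContainer now stores its dims as a tuple.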
Parent 84647841
......@@ -236,9 +236,10 @@ def start():
process = subprocess.Popen(
shlex.split(cmd),
shell=False,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
# Change stdout to DEVNULL to prevent broken pipe errors when creating new processes.
stdin=subprocess.DEVNULL,
stdout=subprocess.DEVNULL,
stderr=subprocess.STDOUT
)
# sleep 1 second for the gunicorn application to load modules
......@@ -246,9 +247,7 @@ def start():
# check if gunicorn application is running
if process.poll() is not None:
_, stderr = process.communicate()
for line in stderr.decode().split('\n'):
console.error(line)
console.error("Start MindInsight failed. See log for details.")
else:
state_result = _check_server_start_stat(errorlog_abspath, log_size)
# print gunicorn start state to stdout
......
......@@ -14,6 +14,7 @@
# ============================================================================
"""Constants module for mindinsight settings."""
import logging
import os
####################################
# Global default settings.
......@@ -48,6 +49,7 @@ API_PREFIX = '/v1/mindinsight'
# Datavisual default settings.
####################################
MAX_THREADS_COUNT = 15
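# Upper bound on the number of processes used to parse summary events: the CPU count, but never fewer than 15.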
MAX_PROCESSES_COUNT = max(os.cpu_count() or 0, 15)
MAX_TAG_SIZE_PER_EVENTS_DATA = 300
DEFAULT_STEP_SIZES_PER_TAG = 500
......
......@@ -34,8 +34,13 @@ class DataLoader:
self._summary_dir = summary_dir
self._loader = None
def load(self):
"""Load the data when loader is exist."""
def load(self, workers_count=1):
"""Load the data when loader is exist.
Args:
workers_count (int): The count of workers. Default value is 1.
"""
if self._loader is None:
ms_dataloader = MSDataLoader(self._summary_dir)
loaders = [ms_dataloader]
......@@ -48,7 +53,7 @@ class DataLoader:
logger.warning("No valid files can be loaded, summary_dir: %s.", self._summary_dir)
raise exceptions.SummaryLogPathInvalid()
self._loader.load()
self._loader.load(workers_count)
def get_events_data(self):
"""
......
......@@ -510,7 +510,7 @@ class _DetailCacheManager(_BaseCacheManager):
logger.debug("delete loader %s", loader_id)
self._loader_pool.pop(loader_id)
def _execute_loader(self, loader_id):
def _execute_loader(self, loader_id, workers_count):
"""
Load data from data_loader.
......@@ -518,7 +518,7 @@ class _DetailCacheManager(_BaseCacheManager):
Args:
loader_id (str): An ID for `Loader`.
workers_count (int): The count of workers.
"""
try:
with self._loader_pool_mutex:
......@@ -527,7 +527,7 @@ class _DetailCacheManager(_BaseCacheManager):
logger.debug("Loader %r has been deleted, will not load data.", loader_id)
return
loader.data_loader.load()
loader.data_loader.load(workers_count)
# Update loader cache status to CACHED.
# A loader whose cache status is already CACHED keeps that status.
......@@ -584,7 +584,7 @@ class _DetailCacheManager(_BaseCacheManager):
futures = []
loader_pool = self._get_snapshot_loader_pool()
for loader_id in loader_pool:
future = executor.submit(self._execute_loader, loader_id)
future = executor.submit(self._execute_loader, loader_id, threads_count)
futures.append(future)
wait(futures, return_when=ALL_COMPLETED)
......
......@@ -85,6 +85,7 @@ class EventsData:
deleted_tag = self._check_tag_out_of_spec(plugin_name)
if deleted_tag is not None:
if tag in self._deleted_tags:
logger.debug("Tag is in deleted tags: %s.", tag)
return
self.delete_tensor_event(deleted_tag)
......
......@@ -19,12 +19,17 @@ This module is used to load the MindSpore training log file.
Each instance will read an entire run, a run can contain one or
more log file.
"""
import concurrent.futures as futures
import math
import os
import re
import struct
import threading
from google.protobuf.message import DecodeError
from google.protobuf.text_format import ParseError
from mindinsight.conf import settings
from mindinsight.datavisual.common import exceptions
from mindinsight.datavisual.common.enums import PluginNameEnum
from mindinsight.datavisual.common.log import logger
......@@ -32,13 +37,13 @@ from mindinsight.datavisual.data_access.file_handler import FileHandler
from mindinsight.datavisual.data_transform.events_data import EventsData
from mindinsight.datavisual.data_transform.events_data import TensorEvent
from mindinsight.datavisual.data_transform.graph import MSGraph
from mindinsight.datavisual.proto_files import mindinsight_summary_pb2 as summary_pb2
from mindinsight.datavisual.proto_files import mindinsight_anf_ir_pb2 as anf_ir_pb2
from mindinsight.datavisual.utils import crc32
from mindinsight.utils.exceptions import UnknownError
from mindinsight.datavisual.data_transform.histogram import Histogram
from mindinsight.datavisual.data_transform.histogram_container import HistogramContainer
from mindinsight.datavisual.data_transform.tensor_container import TensorContainer
from mindinsight.datavisual.proto_files import mindinsight_anf_ir_pb2 as anf_ir_pb2
from mindinsight.datavisual.proto_files import mindinsight_summary_pb2 as summary_pb2
from mindinsight.datavisual.utils import crc32
from mindinsight.utils.exceptions import UnknownError
HEADER_SIZE = 8
CRC_STR_SIZE = 4
......@@ -79,11 +84,14 @@ class MSDataLoader:
"we will reload all files in path %s.", self._summary_dir)
self.__init__(self._summary_dir)
def load(self):
def load(self, workers_count=1):
"""
Load all valid log files.
When the file is reloaded, it will continue to load from where it left off.
Args:
workers_count (int): The count of workers. Default value is 1.
"""
logger.debug("Start to load data in ms data loader.")
filenames = self.filter_valid_files()
......@@ -95,7 +103,7 @@ class MSDataLoader:
self._check_files_deleted(filenames, old_filenames)
for parser in self._parser_list:
parser.parse_files(filenames, events_data=self._events_data)
parser.parse_files(workers_count, filenames, events_data=self._events_data)
def filter_valid_files(self):
"""
......@@ -125,11 +133,12 @@ class _Parser:
self._latest_mtime = 0
self._summary_dir = summary_dir
def parse_files(self, filenames, events_data):
def parse_files(self, workers_count, filenames, events_data):
"""
Load files and parse their content.
Args:
workers_count (int): The count of workers.
filenames (list[str]): File name list.
events_data (EventsData): The container of event data.
"""
......@@ -177,7 +186,7 @@ class _Parser:
class _PbParser(_Parser):
"""This class is used to parse pb file."""
def parse_files(self, filenames, events_data):
def parse_files(self, workers_count, filenames, events_data):
pb_filenames = self.filter_files(filenames)
pb_filenames = self.sort_files(pb_filenames)
for filename in pb_filenames:
......@@ -255,11 +264,12 @@ class _SummaryParser(_Parser):
self._summary_file_handler = None
self._events_data = None
def parse_files(self, filenames, events_data):
def parse_files(self, workers_count, filenames, events_data):
"""
Load summary file and parse file content.
Args:
workers_count (int): The count of workers.
filenames (list[str]): File name list.
events_data (EventsData): The container of event data.
"""
......@@ -285,7 +295,7 @@ class _SummaryParser(_Parser):
self._latest_file_size = new_size
try:
self._load_single_file(self._summary_file_handler)
self._load_single_file(self._summary_file_handler, workers_count)
except UnknownError as ex:
logger.warning("Parse summary file failed, detail: %r,"
"file path: %s.", str(ex), file_path)
......@@ -304,36 +314,75 @@ class _SummaryParser(_Parser):
lambda filename: (re.search(r'summary\.\d+', filename)
and not filename.endswith("_lineage")), filenames))
def _load_single_file(self, file_handler):
def _load_single_file(self, file_handler, workers_count):
"""
Load a log file data.
Args:
file_handler (FileHandler): A file handler.
workers_count (int): The count of workers.
"""
logger.debug("Load single summary file, file path: %s.", file_handler.file_path)
while True:
start_offset = file_handler.offset
try:
event_str = self._event_load(file_handler)
if event_str is None:
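# Derive per-file concurrency: share the CPUs (capped by MAX_PROCESSES_COUNT) among the
# workers loading files concurrently, falling back to a single process.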
default_concurrency = 1
cpu_count = os.cpu_count()
if cpu_count is None:
concurrency = default_concurrency
else:
concurrency = min(math.floor(cpu_count / workers_count),
math.floor(settings.MAX_PROCESSES_COUNT / workers_count))
if concurrency <= 0:
concurrency = default_concurrency
logger.debug("Load single summary file, file path: %s, concurrency: %s.", file_handler.file_path, concurrency)
semaphore = threading.Semaphore(value=concurrency)
with futures.ProcessPoolExecutor(max_workers=concurrency) as executor:
while True:
start_offset = file_handler.offset
try:
event_str = self._event_load(file_handler)
if event_str is None:
file_handler.reset_offset(start_offset)
break
# Make sure at most `concurrency` tasks are unfinished at any time, to limit memory usage.
semaphore.acquire()
future = executor.submit(self._event_parse, event_str, self._latest_filename)
def _add_tensor_event_callback(future_value):
try:
tensor_values = future_value.result()
for tensor_value in tensor_values:
if tensor_value.plugin_name == PluginNameEnum.GRAPH.value:
try:
graph_tags = self._events_data.list_tags_by_plugin(PluginNameEnum.GRAPH.value)
except KeyError:
graph_tags = []
summary_tags = self.filter_files(graph_tags)
for tag in summary_tags:
self._events_data.delete_tensor_event(tag)
self._events_data.add_tensor_event(tensor_value)
except Exception as exc:
# Log exception for debugging.
logger.exception(exc)
raise
finally:
semaphore.release()
future.add_done_callback(_add_tensor_event_callback)
except exceptions.CRCFailedError:
file_handler.reset_offset(start_offset)
logger.warning("Check crc faild and ignore this file, file_path=%s, "
"offset=%s.", file_handler.file_path, file_handler.offset)
break
event = summary_pb2.Event.FromString(event_str)
self._event_parse(event)
except exceptions.CRCFailedError:
file_handler.reset_offset(start_offset)
logger.warning("Check crc faild and ignore this file, file_path=%s, "
"offset=%s.", file_handler.file_path, file_handler.offset)
break
except (OSError, DecodeError, exceptions.MindInsightException) as ex:
logger.warning("Parse log file fail, and ignore this file, detail: %r,"
"file path: %s.", str(ex), file_handler.file_path)
break
except Exception as ex:
logger.exception(ex)
raise UnknownError(str(ex))
except (OSError, DecodeError, exceptions.MindInsightException) as ex:
logger.warning("Parse log file fail, and ignore this file, detail: %r,"
"file path: %s.", str(ex), file_handler.file_path)
break
except Exception as ex:
logger.exception(ex)
raise UnknownError(str(ex))
def _event_load(self, file_handler):
"""
......@@ -381,20 +430,29 @@ class _SummaryParser(_Parser):
return event_str
def _event_parse(self, event):
@staticmethod
def _event_parse(event_str, latest_file_name):
"""
Transform an `Event` string into tensor events.
This method is static to avoid sending unnecessary objects to other processes.
Args:
event (Event): Message event in summary proto, data read from file handler.
event_str (str): Message event string in summary proto, data read from file handler.
latest_file_name (str): Latest file name.

Returns:
list[TensorEvent], the tensor events parsed from the event string.
"""
plugins = {
'scalar_value': PluginNameEnum.SCALAR,
'image': PluginNameEnum.IMAGE,
'histogram': PluginNameEnum.HISTOGRAM,
'tensor': PluginNameEnum.TENSOR
}
logger.debug("Start to parse event string. Event string len: %s.", len(event_str))
event = summary_pb2.Event.FromString(event_str)
logger.debug("Deserialize event string completed.")
ret_tensor_events = []
if event.HasField('summary'):
for value in event.summary.value:
for plugin in plugins:
......@@ -402,6 +460,7 @@ class _SummaryParser(_Parser):
continue
plugin_name_enum = plugins[plugin]
tensor_event_value = getattr(value, plugin)
logger.debug("Processing plugin value: %s.", plugin_name_enum)
if plugin == 'histogram':
tensor_event_value = HistogramContainer(tensor_event_value)
......@@ -419,29 +478,23 @@ class _SummaryParser(_Parser):
tag='{}/{}'.format(value.tag, plugin_name_enum.value),
plugin_name=plugin_name_enum.value,
value=tensor_event_value,
filename=self._latest_filename)
self._events_data.add_tensor_event(tensor_event)
filename=latest_file_name)
logger.debug("Tensor event generated, plugin is %s, tag is %s, step is %s.",
plugin_name_enum, value.tag, event.step)
ret_tensor_events.append(tensor_event)
elif event.HasField('graph_def'):
graph = MSGraph()
graph.build_graph(event.graph_def)
tensor_event = TensorEvent(wall_time=event.wall_time,
step=event.step,
tag=self._latest_filename,
tag=latest_file_name,
plugin_name=PluginNameEnum.GRAPH.value,
value=graph,
filename=self._latest_filename)
try:
graph_tags = self._events_data.list_tags_by_plugin(PluginNameEnum.GRAPH.value)
except KeyError:
graph_tags = []
summary_tags = self.filter_files(graph_tags)
for tag in summary_tags:
self._events_data.delete_tensor_event(tag)
filename=latest_file_name)
ret_tensor_events.append(tensor_event)
self._events_data.add_tensor_event(tensor_event)
return ret_tensor_events
@staticmethod
def _compare_summary_file(current_file, dst_file):
......
......@@ -199,8 +199,8 @@ class TensorContainer:
def __init__(self, tensor_message):
self._lock = threading.Lock
self._msg = tensor_message
self._dims = tensor_message.dims
# The original dims cannot be pickled for transfer to other processes, so a tuple is used.
self._dims = tuple(tensor_message.dims)
self._data_type = tensor_message.data_type
self._np_array = None
self._data = _get_data_from_tensor(tensor_message)
......@@ -265,5 +265,4 @@ class TensorContainer:
logger.error("Reshape array fail, detail: %r", str(ex))
return
self._msg = None
self._np_array = ndarray
......@@ -245,7 +245,7 @@ class TensorProcessor(BaseProcessor):
# This value is an instance of TensorContainer
value = tensor.value
value_dict = {
"dims": tuple(value.dims),
"dims": value.dims,
"data_type": anf_ir_pb2.DataType.Name(value.data_type)
}
if detail and detail == 'stats':
......@@ -313,7 +313,7 @@ class TensorProcessor(BaseProcessor):
"wall_time": tensor.wall_time,
"step": tensor.step,
"value": {
"dims": tuple(value.dims),
"dims": value.dims,
"data_type": anf_ir_pb2.DataType.Name(value.data_type),
"data": res_data.tolist(),
"statistics": get_statistics_dict(value, flatten_data)
......@@ -362,7 +362,7 @@ class TensorProcessor(BaseProcessor):
"wall_time": tensor.wall_time,
"step": tensor.step,
"value": {
"dims": tuple(value.dims),
"dims": value.dims,
"data_type": anf_ir_pb2.DataType.Name(value.data_type),
"histogram_buckets": buckets,
"statistics": get_statistics_dict(value, None)
......
......@@ -103,21 +103,17 @@ class Command(BaseCommand):
self.logfile.info('Stop mindinsight with port %s and pid %s.', port, pid)
process = psutil.Process(pid)
child_pids = [child.pid for child in process.children()]
processes_to_kill = [process]
# Set recursive to True so that grandchild processes are killed as well.
for child in process.children(recursive=True):
processes_to_kill.append(child)
# kill gunicorn master process
try:
os.kill(pid, signal.SIGKILL)
except PermissionError:
self.console.info('kill pid %s failed due to permission error', pid)
sys.exit(1)
# cleanup gunicorn worker processes
for child_pid in child_pids:
for proc in processes_to_kill:
self.logfile.info('Stopping mindinsight process %s.', proc.pid)
try:
os.kill(child_pid, signal.SIGKILL)
except ProcessLookupError:
pass
proc.send_signal(signal.SIGKILL)
except psutil.Error as ex:
self.logfile.warning("Stop process %s failed. Detail: %s.", proc.pid, str(ex))
for hook in HookUtils.instance().hooks():
hook.on_shutdown(self.logfile)
......@@ -154,7 +150,19 @@ class Command(BaseCommand):
if user != process.username():
continue
pid = process.pid if process.ppid() == 1 else process.ppid()
gunicorn_master_process = process
# The gunicorn master process might have grandchildren (e.g. forked by a process pool).
while True:
parent_process = gunicorn_master_process.parent()
if parent_process is None or parent_process.pid == 1:
break
parent_cmd = parent_process.cmdline()
if ' '.join(parent_cmd).find(self.cmd_regex) == -1:
break
gunicorn_master_process = parent_process
pid = gunicorn_master_process.pid
for open_file in process.open_files():
if open_file.path.endswith(self.access_log_path):
......