From 9daf2ae128f35c88d96b702e1446886c48b275af Mon Sep 17 00:00:00 2001
From: wangshuide2020 <7511764+wangshuide2020@user.noreply.gitee.com>
Date: Wed, 26 Aug 2020 20:13:53 +0800
Subject: [PATCH] kill children processes of worker before worker has been
 killed by gunicorn master.

---
 README_CN.md                                  |  2 +-
 mindinsight/backend/config/gunicorn_conf.py   | 56 +++++++++++++++++++
 .../data_transform/tensor_container.py        |  6 ++-
 3 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/README_CN.md b/README_CN.md
index a96ddb0..2d749bc 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -23,7 +23,7 @@ MindInsight为MindSpore提供了简单易用的调优调试能力。在训练过
 请从[MindSpore下载页面](https://www.mindspore.cn/versions)下载并安装whl包。
 
 ```
-pip install mindinsight-{version}-cp37-cp37m-linux_{arch}.whl
+pip install -U mindinsight-{version}-cp37-cp37m-linux_{arch}.whl
 ```
 
 更多MindInsight的安装方法,请点击[安装教程](https://www.mindspore.cn/install/)中的MindInsight章节进行查看。
diff --git a/mindinsight/backend/config/gunicorn_conf.py b/mindinsight/backend/config/gunicorn_conf.py
index 37221a4..0993a0b 100644
--- a/mindinsight/backend/config/gunicorn_conf.py
+++ b/mindinsight/backend/config/gunicorn_conf.py
@@ -15,9 +15,13 @@
 """Config file for gunicorn."""
 
 import os
+import multiprocessing
+import signal
 import threading
+import time
 from importlib import import_module
 
+import psutil
 import gunicorn
 
 
@@ -43,3 +47,55 @@ def on_starting(server):
     hook_module = import_module('mindinsight.utils.hook')
     for hook in hook_module.HookUtils.instance().hooks():
         threading.Thread(target=hook.on_startup, args=(server.log,)).start()
+
+
+def post_fork(server, worker):
+    """
+    Launch a watchdog process that cleans up after a killed gunicorn worker.
+
+    When the gunicorn master murders a worker (for example on worker timeout), the
+    processes spawned by that worker would otherwise be left running. The watchdog
+    snapshots the worker's descendants shortly after the fork and sends them SIGKILL
+    once it detects that the worker is gone.
+
+    Args:
+        server (Arbiter): gunicorn server instance.
+        worker (ThreadWorker): worker instance.
+    """
+    def murder_worker_children_processes():
+        # sleep 3 seconds so that all worker children processes have been launched.
+        time.sleep(3)
+        try:
+            children = psutil.Process(worker.pid).children(recursive=True)
+        except psutil.Error as ex:
+            # The worker may already be gone; without a snapshot there is nothing to clean up.
+            server.log.error("Get children of worker %d failed. Detail: %s.", worker.pid, str(ex))
+            return
+        # This watchdog is itself a child of the worker and must not kill itself.
+        processes_to_kill = [child for child in children if child.pid != os.getpid()]
+
+        # The worker is this process's parent, so a different parent pid means the worker died.
+        while os.getppid() == worker.pid:
+            time.sleep(1)
+
+        current_worker_pid = os.getppid()
+        killed_count = 0
+        for proc in processes_to_kill:
+            server.log.info("Original worker pid: %d, current worker pid: %d, stop process %d",
+                            worker.pid, current_worker_pid, proc.pid)
+            try:
+                proc.send_signal(signal.SIGKILL)
+            except psutil.NoSuchProcess:
+                # Already exited on its own.
+                continue
+            except psutil.Error as ex:
+                server.log.error("Stop process %d failed. Detail: %s.", proc.pid, str(ex))
+            else:
+                killed_count += 1
+        # Report only the signals actually delivered, not the size of the snapshot.
+        server.log.info("%d processes have been killed.", killed_count)
+
+    listen_process = multiprocessing.Process(target=murder_worker_children_processes,
+                                             name="murder_worker_children_processes")
+    listen_process.start()
+    server.log.info("Server pid: %d, start to listening.", server.pid)
diff --git a/mindinsight/datavisual/data_transform/tensor_container.py b/mindinsight/datavisual/data_transform/tensor_container.py
index 0f86580..0bdc94d 100644
--- a/mindinsight/datavisual/data_transform/tensor_container.py
+++ b/mindinsight/datavisual/data_transform/tensor_container.py
@@ -193,8 +193,10 @@ class TensorContainer:
         self._stats = get_statistics_from_tensor(self._np_array)
         original_buckets = calc_original_buckets(self._np_array, self._stats)
         self._count = sum(bucket.count for bucket in original_buckets)
-        self._max = self._stats.max
-        self._min = self._stats.min
+        # convert the type of max and min value to np.float64 so that it cannot overflow
+        # when calculating width of histogram.
+        self._max = np.float64(self._stats.max)
+        self._min = np.float64(self._stats.min)
         self._histogram = Histogram(tuple(original_buckets), self._max, self._min, self._count)
 
     @property
-- 
GitLab