提交 9daf2ae1 编写于 作者: W wangshuide2020

Kill the child processes of a worker after the worker has been killed by the gunicorn master.

上级 a868ea81
......@@ -23,7 +23,7 @@ MindInsight为MindSpore提供了简单易用的调优调试能力。在训练过
请从[MindSpore下载页面](https://www.mindspore.cn/versions)下载并安装whl包。
```
pip install mindinsight-{version}-cp37-cp37m-linux_{arch}.whl
pip install -U mindinsight-{version}-cp37-cp37m-linux_{arch}.whl
```
更多MindInsight的安装方法,请点击[安装教程](https://www.mindspore.cn/install/)中的MindInsight章节进行查看。
......
......@@ -15,9 +15,13 @@
"""Config file for gunicorn."""
import os
import multiprocessing
import signal
import threading
import time
from importlib import import_module
import psutil
import gunicorn
......@@ -43,3 +47,44 @@ def on_starting(server):
hook_module = import_module('mindinsight.utils.hook')
for hook in hook_module.HookUtils.instance().hooks():
threading.Thread(target=hook.on_startup, args=(server.log,)).start()
def post_fork(server, worker):
    """
    Launch a watcher process for the worker right after gunicorn forks it.

    The children processes of a gunicorn worker should be killed once the worker
    itself has been killed, e.g. when the gunicorn master murders the worker
    because of a worker timeout. The watcher snapshots the worker's children and
    sends SIGKILL to them as soon as it notices that its own parent process is no
    longer the worker.

    Args:
        server (Arbiter): gunicorn server instance.
        worker (ThreadWorker): worker instance.
    """
    def murder_worker_children_processes():
        """Snapshot the worker's children, wait for the worker to die, then kill them."""
        # Sleep 3 seconds so that all worker children processes have been launched.
        time.sleep(3)

        # If the parent already changed, the worker died during the startup delay.
        # Its pid may be gone or even reused by an unrelated process, so the
        # children must not be enumerated through that pid any more.
        if os.getppid() != worker.pid:
            server.log.info("Worker %d exited before its children processes were collected.",
                            worker.pid)
            return

        watcher_pid = os.getpid()
        try:
            children = psutil.Process(worker.pid).children(recursive=True)
        except psutil.Error as ex:
            server.log.error("Collect children processes of worker %d failed. Detail: %s.",
                             worker.pid, str(ex))
            return
        # The watcher is itself a child of the worker and must not kill itself.
        processes_to_kill = [child for child in children if child.pid != watcher_pid]

        # The watcher is re-parented when the worker dies, so a changed parent pid
        # is the signal that the worker is gone.
        while os.getppid() == worker.pid:
            time.sleep(1)

        current_worker_pid = os.getppid()
        killed_count = 0
        for proc in processes_to_kill:
            server.log.info("Original worker pid: %d, current worker pid: %d, stop process %d",
                            worker.pid, current_worker_pid, proc.pid)
            try:
                proc.send_signal(signal.SIGKILL)
            except psutil.NoSuchProcess:
                # The process has already exited, nothing left to stop.
                continue
            except psutil.Error as ex:
                server.log.error("Stop process %d failed. Detail: %s.", proc.pid, str(ex))
            else:
                killed_count += 1
        # Report only the processes that actually received the signal.
        server.log.info("%d processes have been killed.", killed_count)

    listen_process = multiprocessing.Process(target=murder_worker_children_processes,
                                             name="murder_worker_children_processes")
    listen_process.start()
    server.log.info("Server pid: %d, start to listening.", server.pid)
......@@ -193,8 +193,10 @@ class TensorContainer:
self._stats = get_statistics_from_tensor(self._np_array)
original_buckets = calc_original_buckets(self._np_array, self._stats)
self._count = sum(bucket.count for bucket in original_buckets)
self._max = self._stats.max
self._min = self._stats.min
# convert the type of max and min value to np.float64 so that it cannot overflow
# when calculating width of histogram.
self._max = np.float64(self._stats.max)
self._min = np.float64(self._stats.min)
self._histogram = Histogram(tuple(original_buckets), self._max, self._min, self._count)
@property
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册