提交 46f59906 编写于 作者: F fuyw 提交者: Bo Zhou

fix unittest bugs (#136)

* add favicon

* Sleep longer for unittest.

* fix unittest bugs

* increase sleep seconds after job_memory error
上级 eca90f14
include parl/remote/static/logo.png include parl/remote/static/logo.png
include parl/remote/static/favicon.ico
recursive-include parl/remote/templates *.html recursive-include parl/remote/templates *.html
recursive-include parl/remote/static/css *.css recursive-include parl/remote/static/css *.css
recursive-include parl/remote/static/js *.js recursive-include parl/remote/static/js *.js
...@@ -50,7 +50,7 @@ class Job(object): ...@@ -50,7 +50,7 @@ class Job(object):
Attributes: Attributes:
pid (int): Job process ID. pid (int): Job process ID.
max_memory (float): Maximum memory (MB) can be used by each remote instance. max_memory (float): Maximum memory (MB) can be used by each remote instance.
""" """
self.job_is_alive = True self.job_is_alive = True
self.worker_address = worker_address self.worker_address = worker_address
...@@ -59,10 +59,13 @@ class Job(object): ...@@ -59,10 +59,13 @@ class Job(object):
self.lock = threading.Lock() self.lock = threading.Lock()
self._create_sockets() self._create_sockets()
process = psutil.Process(self.pid)
self.init_memory = float(process.memory_info()[0]) / (1024**2)
def _create_sockets(self): def _create_sockets(self):
"""Create three sockets for each job. """Create three sockets for each job.
(1) reply_socket(main socket): receives the command(i.e, the function name and args) (1) reply_socket(main socket): receives the command(i.e, the function name and args)
from the actual class instance, completes the computation, and returns the result of from the actual class instance, completes the computation, and returns the result of
the function. the function.
(2) job_socket(functional socket): sends job_address and heartbeat_address to worker. (2) job_socket(functional socket): sends job_address and heartbeat_address to worker.
...@@ -134,7 +137,7 @@ class Job(object): ...@@ -134,7 +137,7 @@ class Job(object):
if self.max_memory is not None: if self.max_memory is not None:
process = psutil.Process(self.pid) process = psutil.Process(self.pid)
used_memory = float(process.memory_info()[0]) / (1024**2) used_memory = float(process.memory_info()[0]) / (1024**2)
if used_memory > self.max_memory: if used_memory > self.max_memory + self.init_memory:
stop_job = True stop_job = True
return stop_job return stop_job
...@@ -174,6 +177,10 @@ class Job(object): ...@@ -174,6 +177,10 @@ class Job(object):
to_byte(self.job_address) to_byte(self.job_address)
]) ])
if stop_job == True: if stop_job == True:
logger.error(
"Memory used by this job exceeds {}. This job will exist."
.format(self.max_memory))
time.sleep(3)
socket.close(0) socket.close(0)
os._exit(1) os._exit(1)
except zmq.error.Again as e: except zmq.error.Again as e:
......
...@@ -133,7 +133,7 @@ def start_master(port, cpu_num, monitor_port): ...@@ -133,7 +133,7 @@ def start_master(port, cpu_num, monitor_port):
## If you want to check cluster status, please view: ## If you want to check cluster status, please view:
http://{}:{}. http://{}:{}
or call: or call:
...@@ -196,8 +196,7 @@ def status(): ...@@ -196,8 +196,7 @@ def status():
status = [] status = []
for monitor in monitors: for monitor in monitors:
monitor_port, _, master_address = monitor.split(' ') monitor_port, _, master_address = monitor.split(' ')
master_ip = master_address.split(':')[0] monitor_address = "{}:{}".format(get_ip_address(), monitor_port)
monitor_address = "{}:{}".format(master_ip, monitor_port)
socket = ctx.socket(zmq.REQ) socket = ctx.socket(zmq.REQ)
socket.connect('tcp://{}'.format(master_address)) socket.connect('tcp://{}'.format(master_address))
socket.send_multipart([STATUS_TAG]) socket.send_multipart([STATUS_TAG])
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
<head> <head>
<meta charset="utf-8" /> <meta charset="utf-8" />
<title>Parl Cluster</title> <title>Parl Cluster</title>
<link rel="shortcut icon" href="../static/favicon.ico">
<script type="text/javascript" src="../static/js/jquery.min.js"></script> <script type="text/javascript" src="../static/js/jquery.min.js"></script>
<script src="../static/js/echarts.min.js"></script> <script src="../static/js/echarts.min.js"></script>
<script src="../static/js/parl.js"></script> <script src="../static/js/parl.js"></script>
...@@ -39,7 +40,7 @@ ...@@ -39,7 +40,7 @@
<tr> <tr>
<th scope="col">#</th> <th scope="col">#</th>
<th scope="col">Path</th> <th scope="col">Path</th>
<th scope="col">Client ID</th> <th scope="col">Hostname</th>
<th scope="col">Actor Num</th> <th scope="col">Actor Num</th>
<th scope="col">Time (min)</th> <th scope="col">Time (min)</th>
</tr> </tr>
......
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
<head> <head>
<meta charset="utf-8" /> <meta charset="utf-8" />
<title>Parl Cluster</title> <title>Parl Cluster</title>
<link rel="shortcut icon" href="../static/favicon.ico">
<script type="text/javascript" src="../static/js/jquery.min.js"></script> <script type="text/javascript" src="../static/js/jquery.min.js"></script>
<script src="../static/js/echarts.min.js"></script> <script src="../static/js/echarts.min.js"></script>
<script src="../static/js/parl.js"></script> <script src="../static/js/parl.js"></script>
......
...@@ -23,41 +23,54 @@ from parl.remote.worker import Worker ...@@ -23,41 +23,54 @@ from parl.remote.worker import Worker
from parl.remote.client import disconnect from parl.remote.client import disconnect
from parl.remote.monitor import ClusterMonitor from parl.remote.monitor import ClusterMonitor
from multiprocessing import Process
@parl.remote_class(max_memory=200)
@parl.remote_class(max_memory=300)
class Actor(object): class Actor(object):
def __init__(self, x=10): def __init__(self, x=10):
self.x = x self.x = x
self.data = [] self.data = []
def add_100mb(self): def add_500mb(self):
self.data.append(os.urandom(100 * 1024**2)) self.data.append(os.urandom(500 * 1024**2))
self.x += 1 self.x += 1
return self.x return self.x
from parl.utils import logger
class TestMaxMemory(unittest.TestCase): class TestMaxMemory(unittest.TestCase):
def tearDown(self): def tearDown(self):
disconnect() disconnect()
def actor(self):
actor1 = Actor()
time.sleep(10)
actor1.add_500mb()
def test_max_memory(self): def test_max_memory(self):
port = 3001 port = 3001
master = Master(port=port) master = Master(port=port)
th = threading.Thread(target=master.run) th = threading.Thread(target=master.run)
th.start() th.start()
time.sleep(1) time.sleep(5)
worker = Worker('localhost:{}'.format(port), 1) worker = Worker('localhost:{}'.format(port), 1)
cluster_monitor = ClusterMonitor('localhost:{}'.format(port)) cluster_monitor = ClusterMonitor('localhost:{}'.format(port))
time.sleep(1) time.sleep(5)
parl.connect('localhost:{}'.format(port)) parl.connect('localhost:{}'.format(port))
actor = Actor() actor = Actor()
time.sleep(30) time.sleep(20)
self.assertEqual(1, cluster_monitor.data['clients'][0]['actor_num']) self.assertEqual(1, cluster_monitor.data['clients'][0]['actor_num'])
actor.add_100mb()
time.sleep(50)
self.assertEqual(0, cluster_monitor.data['clients'][0]['actor_num'])
actor.job_socket.close(0)
del actor del actor
time.sleep(5)
p = Process(target=self.actor)
p.start()
time.sleep(30)
self.assertEqual(0, cluster_monitor.data['clients'][0]['actor_num'])
p.terminate()
worker.exit() worker.exit()
master.exit() master.exit()
......
...@@ -24,7 +24,7 @@ from parl.remote.client import disconnect ...@@ -24,7 +24,7 @@ from parl.remote.client import disconnect
from parl.remote.monitor import ClusterMonitor from parl.remote.monitor import ClusterMonitor
@parl.remote_class(max_memory=200) @parl.remote_class(max_memory=300)
class Actor(object): class Actor(object):
def __init__(self, x=10): def __init__(self, x=10):
self.x = x self.x = x
...@@ -45,14 +45,14 @@ class TestClusterStatus(unittest.TestCase): ...@@ -45,14 +45,14 @@ class TestClusterStatus(unittest.TestCase):
master = Master(port=port) master = Master(port=port)
th = threading.Thread(target=master.run) th = threading.Thread(target=master.run)
th.start() th.start()
time.sleep(1) time.sleep(5)
worker = Worker('localhost:{}'.format(port), 1) worker = Worker('localhost:{}'.format(port), 1)
time.sleep(5) time.sleep(5)
status_info = master.cluster_monitor.get_status_info() status_info = master.cluster_monitor.get_status_info()
self.assertEqual(status_info, 'has 0 used cpus, 1 vacant cpus.') self.assertEqual(status_info, 'has 0 used cpus, 1 vacant cpus.')
parl.connect('localhost:{}'.format(port)) parl.connect('localhost:{}'.format(port))
actor = Actor() actor = Actor()
time.sleep(30) time.sleep(50)
status_info = master.cluster_monitor.get_status_info() status_info = master.cluster_monitor.get_status_info()
self.assertEqual(status_info, 'has 1 used cpus, 0 vacant cpus.') self.assertEqual(status_info, 'has 1 used cpus, 0 vacant cpus.')
worker.exit() worker.exit()
......
...@@ -307,7 +307,7 @@ class Worker(object): ...@@ -307,7 +307,7 @@ class Worker(object):
total_memory = round(virtual_memory[0] / (1024**3), 2) total_memory = round(virtual_memory[0] / (1024**3), 2)
used_memory = round(virtual_memory[3] / (1024**3), 2) used_memory = round(virtual_memory[3] / (1024**3), 2)
vacant_memory = round(total_memory - used_memory, 2) vacant_memory = round(total_memory - used_memory, 2)
load_average = round(psutil.getloadavg()[0], 2) load_average = round(os.getloadavg()[0], 2)
return (vacant_memory, used_memory, now, load_average) return (vacant_memory, used_memory, now, load_average)
def _reply_heartbeat(self, target): def _reply_heartbeat(self, target):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册