提交 91dc4653 编写于 作者: P Pablo Hoffman

added LogStats extension for periodically logging basic stats (like crawled...

added LogStats extension for periodically logging basic stats (like crawled pages and scraped items)
上级 d2a9c0fd
......@@ -178,10 +178,20 @@ Built-in extensions reference
General purpose extensions
--------------------------
Log Stats extension
~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.logstats
   :synopsis: Basic stats logging

.. class:: LogStats

   Log basic stats like crawled pages and scraped items.
Core Stats extension
~~~~~~~~~~~~~~~~~~~~
.. module:: scrapy.contrib.corestats.corestats
.. module:: scrapy.contrib.corestats
:synopsis: Core stats collection
.. class:: CoreStats
......
......@@ -548,6 +548,8 @@ Default::
'scrapy.contrib.closespider.CloseSpider': 0,
'scrapy.contrib.feedexport.FeedExporter': 0,
'scrapy.contrib.spidercontext.SpiderContext': 0,
'scrapy.contrib.throttle.AutoThrottle': 0,
'scrapy.contrib.logstats.LogStats': 0,
}
The list of available extensions. Keep in mind that some of them need to
......
from collections import defaultdict
from twisted.internet import task
from scrapy.xlib.pydispatch import dispatcher
from scrapy.exceptions import NotConfigured
from scrapy.conf import settings
from scrapy import log, signals
class Slot(object):
    """Per-spider counter bucket: running totals for crawled pages and
    scraped items, plus the values snapshotted at the previous log tick
    (used to compute per-interval rates)."""

    def __init__(self):
        # current totals and their last-logged snapshots all start at zero
        self.pages = self.pagesprev = 0
        self.items = self.itemsprev = 0
class LogStats(object):
    """Log basic scraping stats (crawled pages and scraped items) periodically.

    Reads the LOGSTATS_INTERVAL setting (seconds between log lines) and
    raises NotConfigured when it is unset or zero, which disables the
    extension. One log line is emitted per open spider on every tick.
    """

    def __init__(self):
        self.interval = settings.getfloat('LOGSTATS_INTERVAL')
        if not self.interval:
            raise NotConfigured
        # one Slot of counters per running spider, created lazily on first event
        self.slots = defaultdict(Slot)
        # scales per-interval deltas into per-minute rates
        self.multiplier = 60.0 / self.interval
        # set by engine_started; kept None until then so engine_stopped is safe
        self.tsk = None
        dispatcher.connect(self.item_scraped, signal=signals.item_scraped)
        dispatcher.connect(self.response_received, signal=signals.response_received)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        dispatcher.connect(self.engine_started, signal=signals.engine_started)
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)

    def item_scraped(self, spider):
        self.slots[spider].items += 1

    def response_received(self, spider):
        self.slots[spider].pages += 1

    def spider_closed(self, spider):
        # pop() instead of del: a spider that never received a response nor
        # scraped an item has no slot, and del would raise KeyError here
        self.slots.pop(spider, None)

    def engine_started(self):
        self.tsk = task.LoopingCall(self.log)
        self.tsk.start(self.interval)

    def log(self):
        for spider, slot in self.slots.items():
            # rates are deltas since the previous tick, scaled to per-minute
            irate = (slot.items - slot.itemsprev) * self.multiplier
            prate = (slot.pages - slot.pagesprev) * self.multiplier
            slot.pagesprev, slot.itemsprev = slot.pages, slot.items
            msg = "Crawled %d pages (at %d pages/min), scraped %d items (at %d items/min)" \
                % (slot.pages, prate, slot.items, irate)
            log.msg(msg, spider=spider)

    def engine_stopped(self):
        # engine_stopped can fire without engine_started having run (e.g. a
        # startup failure), in which case no LoopingCall was ever created
        if self.tsk and self.tsk.running:
            self.tsk.stop()
......@@ -137,6 +137,7 @@ EXTENSIONS_BASE = {
'scrapy.contrib.feedexport.FeedExporter': 0,
'scrapy.contrib.spidercontext.SpiderContext': 0,
'scrapy.contrib.throttle.AutoThrottle': 0,
'scrapy.contrib.logstats.LogStats': 0,
}
FEED_URI = None
......@@ -182,6 +183,8 @@ LOG_STDOUT = False
LOG_LEVEL = 'DEBUG'
LOG_FILE = None
LOGSTATS_INTERVAL = 60.0
MAIL_DEBUG = False
MAIL_HOST = 'localhost'
MAIL_PORT = 25
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册