提交 fa84730e 编写于 作者: T tpeng

avoid download large response

introduce DOWNLOAD_MAXSIZE and DOWNLOAD_WARNSIZE in settings and
download_maxsize/download_warnsize in spider/request meta, so
downloader stops downloading as soon as the received data exceeds the
limit. Also check the twisted response's length in advance to stop
downloading as early as possible.
上级 ed84231b
......@@ -422,6 +422,40 @@ The amount of time (in secs) that the downloader will wait before timing out.
spider attribute and per-request using :reqmeta:`download_timeout`
Request.meta key.
.. setting:: DOWNLOAD_MAXSIZE
DOWNLOAD_MAXSIZE
----------------
Default: `1073741824` (1024MB)
The maximum response size (in bytes) that the downloader will download.
Set it to 0 to disable the limit.
.. note::
This size can be set per spider using :attr:`download_maxsize`
spider attribute and per-request using :reqmeta:`download_maxsize`
Request.meta key.
.. setting:: DOWNLOAD_WARNSIZE
DOWNLOAD_WARNSIZE
-----------------
Default: `33554432` (32MB)
The response size (in bytes) at which the downloader will start to warn.
Set it to 0 to disable the warning.
.. note::
This size can be set per spider using :attr:`download_warnsize`
spider attribute and per-request using :reqmeta:`download_warnsize`
Request.meta key.
.. setting:: DUPEFILTER_CLASS
DUPEFILTER_CLASS
......
......@@ -9,7 +9,7 @@ from six.moves.urllib.parse import urldefrag
from zope.interface import implements
from twisted.internet import defer, reactor, protocol
from twisted.web.http_headers import Headers as TxHeaders
from twisted.web.iweb import IBodyProducer
from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
from twisted.internet.error import TimeoutError
from twisted.web.http import PotentialDataLoss
from scrapy.xlib.tx import Agent, ProxyAgent, ResponseDone, \
......@@ -19,6 +19,7 @@ from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
from scrapy.core.downloader.webclient import _parse
from scrapy.utils.misc import load_object
from scrapy import log
class HTTP11DownloadHandler(object):
......@@ -29,10 +30,14 @@ class HTTP11DownloadHandler(object):
self._pool._factory.noisy = False
self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
self._contextFactory = self._contextFactoryClass()
self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
def download_request(self, request, spider):
    """Return a deferred that fires with the downloaded HTTP response.

    Size limits are resolved per spider: a spider may override the
    handler-wide defaults through its ``download_maxsize`` /
    ``download_warnsize`` attributes.
    """
    # NOTE(review): the original contained a second, stale ScrapyAgent
    # construction without the size arguments that was immediately
    # overwritten — removed as dead code.
    agent = ScrapyAgent(
        contextFactory=self._contextFactory,
        pool=self._pool,
        # Spider attributes take precedence over the settings defaults.
        maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
        warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
    )
    return agent.download_request(request)
def close(self):
......@@ -131,11 +136,14 @@ class ScrapyAgent(object):
_ProxyAgent = ProxyAgent
_TunnelingAgent = TunnelingAgent
def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None,
             pool=None, maxsize=0, warnsize=0):
    """Store the connection parameters and the per-download size limits.

    :param maxsize: hard cap on the response size in bytes; 0 disables it.
    :param warnsize: size in bytes above which a warning is logged;
        0 disables it.
    """
    # NOTE(review): the original contained a duplicated, bodyless old
    # signature line (a diff leftover) above this one — removed.
    self._contextFactory = contextFactory
    self._connectTimeout = connectTimeout
    self._bindAddress = bindAddress
    self._pool = pool
    self._maxsize = maxsize
    self._warnsize = warnsize
def _get_agent(self, request, timeout):
bindaddress = request.meta.get('bindaddress') or self._bindAddress
......@@ -197,11 +205,25 @@ class ScrapyAgent(object):
if txresponse.length == 0:
return txresponse, '', None
maxsize = request.meta.get('download_maxsize', self._maxsize)
warnsize = request.meta.get('download_warnsize', self._warnsize)
expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1
if maxsize and expected_size > maxsize:
log.msg("Expected response size (%s) larger than download max size (%s)." % (expected_size, maxsize),
logLevel=log.ERROR)
txresponse._transport._producer.loseConnection()
raise defer.CancelledError()
if warnsize and expected_size > warnsize:
log.msg("Expected response size (%s) larger than downlod warn size (%s)." % (expected_size, warnsize),
logLevel=log.WARNING)
def _cancel(_):
txresponse._transport._producer.loseConnection()
d = defer.Deferred(_cancel)
txresponse.deliverBody(_ResponseReader(d, txresponse, request))
txresponse.deliverBody(_ResponseReader(d, txresponse, request, maxsize, warnsize))
return d
def _cb_bodydone(self, result, request, url):
......@@ -232,14 +254,27 @@ class _RequestBodyProducer(object):
class _ResponseReader(protocol.Protocol):
def __init__(self, finished, txresponse, request, maxsize, warnsize):
    """Collect the response body, enforcing the given size limits.

    :param finished: deferred fired (or cancelled) when the body is done.
    :param txresponse: the twisted response whose body is being read.
    :param request: the originating scrapy request.
    :param maxsize: hard cap on received bytes; 0 disables it.
    :param warnsize: warning threshold in bytes; 0 disables it.
    """
    # NOTE(review): the original contained a duplicated, bodyless old
    # signature line (a diff leftover) above this one — removed.
    self._finished = finished
    self._txresponse = txresponse
    self._request = request
    self._bodybuf = BytesIO()
    self._maxsize = maxsize
    self._warnsize = warnsize
    # Running total of body bytes seen so far.
    self._bytes_received = 0
def dataReceived(self, bodyBytes):
    """Buffer a body chunk and enforce the max/warn size limits."""
    self._bodybuf.write(bodyBytes)
    self._bytes_received += len(bodyBytes)

    if self._maxsize and self._bytes_received > self._maxsize:
        log.msg("Received (%s) bytes larger than download max size (%s)." % (self._bytes_received, self._maxsize),
                logLevel=log.ERROR)
        # Cancelling the deferred triggers the cancel callback, which drops
        # the connection; do not fall through to the warnsize check.
        self._finished.cancel()
        return

    if self._warnsize and self._bytes_received > self._warnsize \
            and not getattr(self, '_warnsize_logged', False):
        # Log the warning only once, not on every subsequent chunk.
        self._warnsize_logged = True
        log.msg("Received (%s) bytes larger than download warn size (%s)." % (self._bytes_received, self._warnsize),
                logLevel=log.WARNING)
def connectionLost(self, reason):
if self._finished.called:
......
......@@ -66,6 +66,9 @@ DOWNLOAD_HANDLERS_BASE = {
DOWNLOAD_TIMEOUT = 180 # 3mins
DOWNLOAD_MAXSIZE = 1073741824 # 1024m
DOWNLOAD_WARNSIZE = 33554432 # 32m
DOWNLOADER = 'scrapy.core.downloader.Downloader'
DOWNLOADER_HTTPCLIENTFACTORY = 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory'
......
......@@ -30,6 +30,8 @@ from scrapy import optional_features
from scrapy.utils.test import get_crawler
from scrapy.exceptions import NotConfigured
from tests.mockserver import MockServer
from tests.spiders import SingleRequestSpider
class DummyDH(object):
......@@ -211,6 +213,64 @@ class Http11TestCase(HttpTestCase):
if 'http11' not in optional_features:
skip = 'HTTP1.1 not supported in twisted < 11.1.0'
def test_download_without_maxsize_limit(self):
    """With no limit configured, the full body is downloaded intact."""
    request = Request(self.getURL('file'))
    d = self.download_request(request, Spider('foo'))
    d.addCallback(lambda r: r.body)
    # assertEquals is a deprecated alias; use assertEqual.
    d.addCallback(self.assertEqual, "0123456789")
    return d
@defer.inlineCallbacks
def test_download_with_maxsize_per_req(self):
    """A tiny per-request 'download_maxsize' must abort the download."""
    request = Request(self.getURL('file'), meta={'download_maxsize': 2})
    download = self.download_request(request, Spider('foo'))
    yield self.assertFailure(download, defer.CancelledError, error.ConnectionAborted)
@defer.inlineCallbacks
def test_download_with_small_maxsize_per_spider(self):
    """A tiny spider-level 'download_maxsize' must abort the download."""
    limited_spider = Spider('foo', download_maxsize=2)
    download = self.download_request(Request(self.getURL('file')), limited_spider)
    yield self.assertFailure(download, defer.CancelledError, error.ConnectionAborted)
def test_download_with_large_maxsize_per_spider(self):
    """A spider-level limit larger than the body leaves the download intact."""
    request = Request(self.getURL('file'))
    d = self.download_request(request, Spider('foo', download_maxsize=100))
    d.addCallback(lambda r: r.body)
    # assertEquals is a deprecated alias; use assertEqual.
    d.addCallback(self.assertEqual, "0123456789")
    return d
class Http11MockServerTestCase(unittest.TestCase):
    """HTTP 1.1 test case with MockServer"""

    if 'http11' not in optional_features:
        skip = 'HTTP1.1 not supported in twisted < 11.1.0'

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_download_with_content_length(self):
        crawler = get_crawler(SingleRequestSpider)
        # http://localhost:8998/partial sets Content-Length to 1024; a
        # download_maxsize of 1000 must abort the download up front.
        yield crawler.crawl(seed=Request(url='http://localhost:8998/partial',
                                         meta={'download_maxsize': 1000}))
        failure = crawler.spider.meta['failure']
        self.assertIsInstance(failure.value, defer.CancelledError)

    @defer.inlineCallbacks
    def test_download(self):
        crawler = get_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=Request(url='http://localhost:8998'))
        # A normal small response must produce no failure.
        self.assertIsNone(crawler.spider.meta.get('failure'))
        # BUG FIX: the original `assertTrue(reason, 'finished')` passed
        # 'finished' as the *msg* argument and asserted nothing useful.
        reason = crawler.spider.meta['close_reason']
        self.assertEqual(reason, 'finished')
class UriResource(resource.Resource):
"""Return the full uri that was requested"""
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册