未验证 提交 0e7ee125 编写于 作者: M Mikhail Korobov 提交者: GitHub

Merge pull request #5036 from dmiwell/urllength-dont-skip-silently

UrlLengthMiddleware: don't skip silently
......@@ -27,9 +27,12 @@ class UrlLengthMiddleware:
def process_spider_output(self, response, result, spider):
def _filter(request):
    """Return True to keep *request*, False to drop it.

    A ``Request`` whose URL is longer than ``self.maxlength`` is dropped.
    The drop is reported at INFO level and counted in the crawler stats
    under ``urllength/request_ignored_count`` so it is never silent.
    Anything that is not a ``Request`` is always kept.
    """
    if isinstance(request, Request) and len(request.url) > self.maxlength:
        # Only the INFO record is emitted; the earlier DEBUG call was the
        # superseded side of the change and would duplicate the message.
        logger.info(
            "Ignoring link (url length > %(maxlength)d): %(url)s ",
            {'maxlength': self.maxlength, 'url': request.url},
            extra={'spider': spider}
        )
        spider.crawler.stats.inc_value('urllength/request_ignored_count', spider=spider)
        return False
    return True
......
from unittest import TestCase
from testfixtures import LogCapture
from scrapy.spidermiddlewares.urllength import UrlLengthMiddleware
from scrapy.http import Response, Request
from scrapy.spiders import Spider
from scrapy.utils.test import get_crawler
from scrapy.settings import Settings
class TestUrlLengthMiddleware(TestCase):
    """Tests for ``UrlLengthMiddleware`` filtering, logging and stats."""

    def setUp(self):
        """Build a middleware limited to 25-character URLs plus fixtures.

        The spider is created through a crawler so that
        ``spider.crawler.stats`` is available to the middleware.
        """
        self.maxlength = 25
        settings = Settings({'URLLENGTH_LIMIT': self.maxlength})

        crawler = get_crawler(Spider)
        self.spider = crawler._create_spider('foo')
        self.stats = crawler.stats
        self.mw = UrlLengthMiddleware.from_settings(settings)

        self.response = Response('http://scrapytest.org')
        # 22 characters: within the limit, must pass through.
        self.short_url_req = Request('http://scrapytest.org/')
        # 40 characters: over the limit, must be dropped.
        self.long_url_req = Request('http://scrapytest.org/this_is_a_long_url')
        self.reqs = [self.short_url_req, self.long_url_req]

    def process_spider_output(self):
        """Run the middleware over the fixture requests and return a list."""
        return list(self.mw.process_spider_output(self.response, self.reqs, self.spider))

    def test_middleware_works(self):
        """Only the request with the short URL survives filtering."""
        self.assertEqual(self.process_spider_output(), [self.short_url_req])

    def test_logging(self):
        """Dropping a request bumps the stats counter and writes a log line."""
        with LogCapture() as log:
            self.process_spider_output()

        ric = self.stats.get_value('urllength/request_ignored_count', spider=self.spider)
        self.assertEqual(ric, 1)

        self.assertIn(f'Ignoring link (url length > {self.maxlength})', str(log))
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册