diff --git a/scrapy/spidermiddlewares/urllength.py b/scrapy/spidermiddlewares/urllength.py
index 5be1f80cb05f181421a11e16291b4b3f97c80ec8..450d4ff40eab11322416a4500637d473df440be8 100644
--- a/scrapy/spidermiddlewares/urllength.py
+++ b/scrapy/spidermiddlewares/urllength.py
@@ -27,9 +27,12 @@ class UrlLengthMiddleware:
     def process_spider_output(self, response, result, spider):
         def _filter(request):
             if isinstance(request, Request) and len(request.url) > self.maxlength:
-                logger.debug("Ignoring link (url length > %(maxlength)d): %(url)s ",
-                             {'maxlength': self.maxlength, 'url': request.url},
-                             extra={'spider': spider})
+                logger.info(
+                    "Ignoring link (url length > %(maxlength)d): %(url)s ",
+                    {'maxlength': self.maxlength, 'url': request.url},
+                    extra={'spider': spider}
+                )
+                spider.crawler.stats.inc_value('urllength/request_ignored_count', spider=spider)
                 return False
             else:
                 return True
diff --git a/tests/test_spidermiddleware_urllength.py b/tests/test_spidermiddleware_urllength.py
index 5ef2b23fdf1de1b131f8e98705c8c01430c7b3f1..171f4ddfdd60faf26f187bb060d952f9328661c4 100644
--- a/tests/test_spidermiddleware_urllength.py
+++ b/tests/test_spidermiddleware_urllength.py
@@ -1,20 +1,41 @@
 from unittest import TestCase
 
+from testfixtures import LogCapture
+
 from scrapy.spidermiddlewares.urllength import UrlLengthMiddleware
 from scrapy.http import Response, Request
 from scrapy.spiders import Spider
+from scrapy.utils.test import get_crawler
+from scrapy.settings import Settings
 
 
 class TestUrlLengthMiddleware(TestCase):
 
-    def test_process_spider_output(self):
-        res = Response('http://scrapytest.org')
+    def setUp(self):
+        self.maxlength = 25
+        settings = Settings({'URLLENGTH_LIMIT': self.maxlength})
+
+        crawler = get_crawler(Spider)
+        self.spider = crawler._create_spider('foo')
+        self.stats = crawler.stats
+        self.mw = UrlLengthMiddleware.from_settings(settings)
+
+        self.response = Response('http://scrapytest.org')
+        self.short_url_req = Request('http://scrapytest.org/')
+        self.long_url_req = Request('http://scrapytest.org/this_is_a_long_url')
+        self.reqs = [self.short_url_req, self.long_url_req]
+
+    def process_spider_output(self):
+        return list(self.mw.process_spider_output(self.response, self.reqs, self.spider))
+
+    def test_middleware_works(self):
+        self.assertEqual(self.process_spider_output(), [self.short_url_req])
+
+    def test_logging(self):
+        with LogCapture() as log:
+            self.process_spider_output()
 
-        short_url_req = Request('http://scrapytest.org/')
-        long_url_req = Request('http://scrapytest.org/this_is_a_long_url')
-        reqs = [short_url_req, long_url_req]
+        ric = self.stats.get_value('urllength/request_ignored_count', spider=self.spider)
+        self.assertEqual(ric, 1)
 
-        mw = UrlLengthMiddleware(maxlength=25)
-        spider = Spider('foo')
-        out = list(mw.process_spider_output(res, reqs, spider))
-        self.assertEqual(out, [short_url_req])
+        self.assertIn(f'Ignoring link (url length > {self.maxlength})', str(log))
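
Note (not part of the patch): a minimal sketch of how the new urllength/request_ignored_count stat could be read after a crawl. The CrawlerProcess setup and DemoSpider below are illustrative assumptions, not code from this change; only the stat key and the URLLENGTH_LIMIT setting come from the diff above.

from scrapy.crawler import CrawlerProcess
from scrapy.spiders import Spider


class DemoSpider(Spider):
    # Hypothetical spider, used only to drive UrlLengthMiddleware.
    name = 'demo'
    start_urls = ['http://scrapytest.org/']

    def parse(self, response):
        # Yield one link longer than URLLENGTH_LIMIT so the middleware
        # drops it and increments the counter added in this patch.
        yield response.follow('/this_is_a_long_url')


process = CrawlerProcess(settings={'URLLENGTH_LIMIT': 25})
crawler = process.create_crawler(DemoSpider)
process.crawl(crawler)
process.start()

# The counter incremented by the middleware; None if nothing was ignored.
print(crawler.stats.get_value('urllength/request_ignored_count'))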