diff --git a/scrapy/contrib/spidermiddleware/httperror.py b/scrapy/contrib/spidermiddleware/httperror.py
index d5a26da63f1a0705f73fd2a7614e8c4bd8146d9e..7fb7aa97c8f6534718e97dd0f3c5ddae686ceb10 100644
--- a/scrapy/contrib/spidermiddleware/httperror.py
+++ b/scrapy/contrib/spidermiddleware/httperror.py
@@ -25,7 +25,7 @@ class HttpErrorMiddleware(object):
         self.handle_httpstatus_list = settings.getlist('HTTPERROR_ALLOWED_CODES')
 
     def process_spider_input(self, response, spider):
-        if 200 <= response.status < 300: # common case
+        if 200 <= response.status < 300:  # common case
             return
         meta = response.meta
         if 'handle_httpstatus_all' in meta:
@@ -38,11 +38,14 @@ class HttpErrorMiddleware(object):
         allowed_statuses = getattr(spider, 'handle_httpstatus_list', self.handle_httpstatus_list)
         if response.status in allowed_statuses:
             return
-        log.msg(format="Ignoring HTTP response code: not handled or not allowed: %(status_code)d",
-                level=log.DEBUG, spider=spider,
-                status_code=response.status)
         raise HttpError(response, 'Ignoring non-200 response')
 
     def process_spider_exception(self, response, exception, spider):
         if isinstance(exception, HttpError):
+            log.msg(
+                format="Ignoring response %(response)r: HTTP status code is not handled or not allowed",
+                level=log.DEBUG,
+                spider=spider,
+                response=response
+            )
             return []
diff --git a/scrapy/tests/test_closespider.py b/scrapy/tests/test_closespider.py
index 2759e4ff317e233653327d497aa9807a8b677584..c6622d4864fb75b692d978e1468f073305b03060 100644
--- a/scrapy/tests/test_closespider.py
+++ b/scrapy/tests/test_closespider.py
@@ -1,16 +1,10 @@
 from twisted.internet import defer
 from twisted.trial.unittest import TestCase
-from scrapy.utils.test import get_crawler
+from scrapy.utils.test import docrawl
 from scrapy.tests.spiders import FollowAllSpider, ItemSpider, ErrorSpider
 from scrapy.tests.mockserver import MockServer
 
 
-def docrawl(spider, settings=None):
-    crawler = get_crawler(settings)
-    crawler.configure()
-    crawler.crawl(spider)
-    return crawler.start()
-
 class TestCloseSpider(TestCase):
 
     def setUp(self):
diff --git a/scrapy/tests/test_crawl.py b/scrapy/tests/test_crawl.py
index e6edb4ddac5280ad93f1b9b1e342bf1e661eaf7d..a8847aa40e9bf972a4d13932c07fe9c66622f556 100644
--- a/scrapy/tests/test_crawl.py
+++ b/scrapy/tests/test_crawl.py
@@ -3,19 +3,13 @@ import socket
 import mock
 from twisted.internet import defer
 from twisted.trial.unittest import TestCase
-from scrapy.utils.test import get_crawler, get_testlog
+from scrapy.utils.test import docrawl, get_testlog
 from scrapy.tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \
     BrokenStartRequestsSpider, SingleRequestSpider
 from scrapy.tests.mockserver import MockServer
 from scrapy.http import Request
 
 
-def docrawl(spider, settings=None):
-    crawler = get_crawler(settings)
-    crawler.crawl(spider)
-    return crawler.start()
-
-
 class CrawlTestCase(TestCase):
 
     def setUp(self):
diff --git a/scrapy/tests/test_proxy_connect.py b/scrapy/tests/test_proxy_connect.py
index ee9576f71a99a8c49238a2d7a9e3e8aa094f6572..e0b47a3dcb32f92bba39c04d3b6dddfe67de9281 100644
--- a/scrapy/tests/test_proxy_connect.py
+++ b/scrapy/tests/test_proxy_connect.py
@@ -8,16 +8,11 @@ from netlib import http_auth
 
 from twisted.internet import defer
 from twisted.trial.unittest import TestCase
 
-from scrapy.utils.test import get_crawler, get_testlog
+from scrapy.utils.test import get_testlog, docrawl
 from scrapy.tests.spiders import SimpleSpider
 from scrapy.tests.mockserver import MockServer
 
 
-def docrawl(spider, settings=None):
-    crawler = get_crawler(settings)
-    crawler.configure()
-    crawler.crawl(spider)
-    return crawler.start()
 
 
diff --git a/scrapy/tests/test_spidermiddleware_httperror.py b/scrapy/tests/test_spidermiddleware_httperror.py
index f25db582a7c0a8ff99cff2fc26df0a2ddfead7f4..b078ba4ba29e51df6742c489334eaeea01e9bd56 100644
--- a/scrapy/tests/test_spidermiddleware_httperror.py
+++ b/scrapy/tests/test_spidermiddleware_httperror.py
@@ -1,22 +1,67 @@
 from unittest import TestCase
+from twisted.trial.unittest import TestCase as TrialTestCase
+from twisted.internet import defer
+
+from scrapy.utils.test import docrawl, get_testlog
+from scrapy.tests.mockserver import MockServer
 
 from scrapy.http import Response, Request
 from scrapy.spider import Spider
 from scrapy.contrib.spidermiddleware.httperror import HttpErrorMiddleware, HttpError
 from scrapy.settings import Settings
 
 
+class _HttpErrorSpider(Spider):
+    name = 'httperror'
+    start_urls = [
+        "http://localhost:8998/status?n=200",
+        "http://localhost:8998/status?n=404",
+        "http://localhost:8998/status?n=402",
+        "http://localhost:8998/status?n=500",
+    ]
+    bypass_status_codes = set()
+
+    def __init__(self, *args, **kwargs):
+        super(_HttpErrorSpider, self).__init__(*args, **kwargs)
+        self.failed = set()
+        self.skipped = set()
+        self.parsed = set()
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield Request(url, self.parse, errback=self.on_error)
+
+    def parse(self, response):
+        self.parsed.add(response.url[-3:])
+
+    def on_error(self, failure):
+        if isinstance(failure.value, HttpError):
+            response = failure.value.response
+            if response.status in self.bypass_status_codes:
+                self.skipped.add(response.url[-3:])
+                return self.parse(response)
+
+        # it assumes there is a response attached to failure
+        self.failed.add(failure.value.response.url[-3:])
+        return failure
+
+
+def _responses(request, status_codes):
+    responses = []
+    for code in status_codes:
+        response = Response(request.url, status=code)
+        response.request = request
+        responses.append(response)
+    return responses
+
+
 class TestHttpErrorMiddleware(TestCase):
 
     def setUp(self):
         self.spider = Spider('foo')
         self.mw = HttpErrorMiddleware(Settings({}))
         self.req = Request('http://scrapytest.org')
-
-        self.res200 = Response('http://scrapytest.org', status=200)
-        self.res200.request = self.req
-        self.res404 = Response('http://scrapytest.org', status=404)
-        self.res404.request = self.req
+        self.res200, self.res404 = _responses(self.req, [200, 404])
 
     def test_process_spider_input(self):
         self.assertEquals(None,
@@ -43,6 +88,7 @@ class TestHttpErrorMiddleware(TestCase):
         self.assertEquals(None,
                 self.mw.process_spider_input(self.res404, self.spider))
 
+
 class TestHttpErrorMiddlewareSettings(TestCase):
     """Similar test, but with settings"""
 
@@ -50,13 +96,7 @@ class TestHttpErrorMiddlewareSettings(TestCase):
         self.spider = Spider('foo')
         self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOWED_CODES': (402,)}))
         self.req = Request('http://scrapytest.org')
-
-        self.res200 = Response('http://scrapytest.org', status=200)
-        self.res200.request = self.req
-        self.res404 = Response('http://scrapytest.org', status=404)
-        self.res404.request = self.req
-        self.res402 = Response('http://scrapytest.org', status=402)
-        self.res402.request = self.req
+        self.res200, self.res404, self.res402 = _responses(self.req, [200, 404, 402])
 
     def test_process_spider_input(self):
         self.assertEquals(None,
@@ -86,19 +126,14 @@ class TestHttpErrorMiddlewareSettings(TestCase):
         self.assertRaises(HttpError,
                 self.mw.process_spider_input, self.res402, self.spider)
 
+
 class TestHttpErrorMiddlewareHandleAll(TestCase):
 
     def setUp(self):
         self.spider = Spider('foo')
         self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOW_ALL': True}))
         self.req = Request('http://scrapytest.org')
-
-        self.res200 = Response('http://scrapytest.org', status=200)
-        self.res200.request = self.req
-        self.res404 = Response('http://scrapytest.org', status=404)
-        self.res404.request = self.req
-        self.res402 = Response('http://scrapytest.org', status=402)
-        self.res402.request = self.req
+        self.res200, self.res404, self.res402 = _responses(self.req, [200, 404, 402])
 
     def test_process_spider_input(self):
         self.assertEquals(None,
@@ -119,3 +154,34 @@ class TestHttpErrorMiddlewareHandleAll(TestCase):
         self.assertRaises(HttpError,
                 self.mw.process_spider_input, res402, self.spider)
 
+
+class TestHttpErrorMiddlewareIntegrational(TrialTestCase):
+    def setUp(self):
+        self.mockserver = MockServer()
+        self.mockserver.__enter__()
+
+    def tearDown(self):
+        self.mockserver.__exit__(None, None, None)
+
+    @defer.inlineCallbacks
+    def test_middleware_works(self):
+        spider = _HttpErrorSpider()
+        yield docrawl(spider)
+        assert not spider.skipped, spider.skipped
+        self.assertEqual(spider.parsed, {'200'})
+        self.assertEqual(spider.failed, {'404', '402', '500'})
+
+    @defer.inlineCallbacks
+    def test_logging(self):
+        spider = _HttpErrorSpider(bypass_status_codes={402})
+        yield docrawl(spider)
+        # print(get_testlog())
+        self.assertEqual(spider.parsed, {'200', '402'})
+        self.assertEqual(spider.skipped, {'402'})
+        self.assertEqual(spider.failed, {'404', '500'})
+
+        log = get_testlog()
+        self.assertIn('Ignoring response <404', log)
+        self.assertIn('Ignoring response <500', log)
+        self.assertNotIn('Ignoring response <200', log)
+        self.assertNotIn('Ignoring response <402', log)
diff --git a/scrapy/utils/test.py b/scrapy/utils/test.py
index ec9ca1c51eb54fc9b547296b0dcabb714cd2bb54..8c1f7fe8a68e5e96cbb979c52298900d5ed049f0 100644
--- a/scrapy/utils/test.py
+++ b/scrapy/utils/test.py
@@ -53,11 +53,13 @@ def get_testenv():
 
 def get_testlog():
     """Get Scrapy log of current test, ignoring the rest"""
+    with open("test.log", "rb") as fp:
+        loglines = fp.readlines()
+
     thistest = []
-    loglines = open("test.log").readlines()
-    for l in loglines[::-1]:
-        thistest.append(l)
-        if "[-] -->" in l:
+    for line in loglines[::-1]:
+        thistest.append(line)
+        if "[-] -->" in line:
             break
     return "".join(thistest[::-1])
 
@@ -67,3 +69,10 @@ def assert_samelines(testcase, text1, text2, msg=None):
     line endings between platforms
     """
    testcase.assertEqual(text1.splitlines(), text2.splitlines(), msg)
+
+def docrawl(spider, settings=None):
+    """Configure and start Crawler; return the result of crawler.start()"""
+    crawler = get_crawler(settings)
+    crawler.configure()
+    crawler.crawl(spider)
+    return crawler.start()