Commit 3df69462 authored by Daniel Graña

Merge pull request #656 from kmike/httperror-logging

[MRG] Fix for #612 + small tests cleanup
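
The change moves HttpErrorMiddleware's debug message from process_spider_input
to process_spider_exception, so the log now names the ignored response itself
("Ignoring response <404 ...>") instead of only its status code. The patch also
consolidates the docrawl test helper, previously copy-pasted across several
test modules, into scrapy.utils.test.

For context, a minimal sketch of how a spider sees the filtered response
through a request errback (the spider name and URL below are illustrative,
not part of this patch):

from scrapy.spider import Spider
from scrapy.http import Request
from scrapy.contrib.spidermiddleware.httperror import HttpError

class NotFoundSpider(Spider):
    name = 'notfound_example'  # hypothetical spider, for illustration only

    def start_requests(self):
        # errback is called when HttpErrorMiddleware raises HttpError
        yield Request('http://example.com/missing', self.parse,
                      errback=self.on_error)

    def parse(self, response):
        self.log('got %d' % response.status)

    def on_error(self, failure):
        if isinstance(failure.value, HttpError):
            # the filtered response is attached to the HttpError exception
            response = failure.value.response
            self.log('ignored %s (status %d)' % (response.url, response.status))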
scrapy/contrib/spidermiddleware/httperror.py
@@ -25,7 +25,7 @@ class HttpErrorMiddleware(object):
         self.handle_httpstatus_list = settings.getlist('HTTPERROR_ALLOWED_CODES')
 
     def process_spider_input(self, response, spider):
-        if 200 <= response.status < 300: # common case
+        if 200 <= response.status < 300:  # common case
             return
         meta = response.meta
         if 'handle_httpstatus_all' in meta:
@@ -38,11 +38,14 @@ class HttpErrorMiddleware(object):
             allowed_statuses = getattr(spider, 'handle_httpstatus_list', self.handle_httpstatus_list)
         if response.status in allowed_statuses:
             return
-        log.msg(format="Ignoring HTTP response code: not handled or not allowed: %(status_code)d",
-                level=log.DEBUG, spider=spider,
-                status_code=response.status)
         raise HttpError(response, 'Ignoring non-200 response')
 
     def process_spider_exception(self, response, exception, spider):
         if isinstance(exception, HttpError):
+            log.msg(
+                format="Ignoring response %(response)r: HTTP status code is not handled or not allowed",
+                level=log.DEBUG,
+                spider=spider,
+                response=response
+            )
             return []
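
As the code above shows, a non-2xx response only reaches the spider when its
status is whitelisted via the handle_httpstatus_list request meta key or
spider attribute, the HTTPERROR_ALLOWED_CODES setting, or HTTPERROR_ALLOW_ALL;
otherwise HttpError is raised and, with this patch, the response is logged
when the exception is processed. A minimal sketch of opting in to 404
responses (spider name and URL are made up for illustration):

from scrapy.spider import Spider

class Allow404Spider(Spider):
    name = 'allow404_example'  # hypothetical, for illustration only
    start_urls = ['http://example.com/maybe-missing']
    # statuses listed here reach parse() instead of raising HttpError
    handle_httpstatus_list = [404]

    def parse(self, response):
        if response.status == 404:
            self.log('got a 404, body still available: %d bytes'
                     % len(response.body))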
scrapy/tests/test_closespider.py
 from twisted.internet import defer
 from twisted.trial.unittest import TestCase
-from scrapy.utils.test import get_crawler
+from scrapy.utils.test import docrawl
 from scrapy.tests.spiders import FollowAllSpider, ItemSpider, ErrorSpider
 from scrapy.tests.mockserver import MockServer
 
-def docrawl(spider, settings=None):
-    crawler = get_crawler(settings)
-    crawler.configure()
-    crawler.crawl(spider)
-    return crawler.start()
 
 class TestCloseSpider(TestCase):
     def setUp(self):
......
scrapy/tests/test_crawl.py
@@ -3,19 +3,13 @@ import socket
 import mock
 from twisted.internet import defer
 from twisted.trial.unittest import TestCase
-from scrapy.utils.test import get_crawler, get_testlog
+from scrapy.utils.test import docrawl, get_testlog
 from scrapy.tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \
     BrokenStartRequestsSpider, SingleRequestSpider
 from scrapy.tests.mockserver import MockServer
 from scrapy.http import Request
 
-def docrawl(spider, settings=None):
-    crawler = get_crawler(settings)
-    crawler.crawl(spider)
-    return crawler.start()
 
 class CrawlTestCase(TestCase):
     def setUp(self):
......
scrapy/tests/test_proxy_connect.py
@@ -8,16 +8,11 @@ from netlib import http_auth
 from twisted.internet import defer
 from twisted.trial.unittest import TestCase
-from scrapy.utils.test import get_crawler, get_testlog
+from scrapy.utils.test import get_testlog, docrawl
 from scrapy.tests.spiders import SimpleSpider
 from scrapy.tests.mockserver import MockServer
 
-def docrawl(spider, settings=None):
-    crawler = get_crawler(settings)
-    crawler.configure()
-    crawler.crawl(spider)
-    return crawler.start()
......
scrapy/tests/test_spidermiddleware_httperror.py
 from unittest import TestCase
+from twisted.trial.unittest import TestCase as TrialTestCase
+from twisted.internet import defer
 
+from scrapy.utils.test import docrawl, get_testlog
+from scrapy.tests.mockserver import MockServer
 from scrapy.http import Response, Request
 from scrapy.spider import Spider
 from scrapy.contrib.spidermiddleware.httperror import HttpErrorMiddleware, HttpError
 from scrapy.settings import Settings
 
 
+class _HttpErrorSpider(Spider):
+    name = 'httperror'
+    start_urls = [
+        "http://localhost:8998/status?n=200",
+        "http://localhost:8998/status?n=404",
+        "http://localhost:8998/status?n=402",
+        "http://localhost:8998/status?n=500",
+    ]
+    bypass_status_codes = set()
+
+    def __init__(self, *args, **kwargs):
+        super(_HttpErrorSpider, self).__init__(*args, **kwargs)
+        self.failed = set()
+        self.skipped = set()
+        self.parsed = set()
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield Request(url, self.parse, errback=self.on_error)
+
+    def parse(self, response):
+        self.parsed.add(response.url[-3:])
+
+    def on_error(self, failure):
+        if isinstance(failure.value, HttpError):
+            response = failure.value.response
+            if response.status in self.bypass_status_codes:
+                self.skipped.add(response.url[-3:])
+                return self.parse(response)
+        # it assumes there is a response attached to failure
+        self.failed.add(failure.value.response.url[-3:])
+        return failure
+
+
+def _responses(request, status_codes):
+    responses = []
+    for code in status_codes:
+        response = Response(request.url, status=code)
+        response.request = request
+        responses.append(response)
+    return responses
+
+
 class TestHttpErrorMiddleware(TestCase):
 
     def setUp(self):
         self.spider = Spider('foo')
         self.mw = HttpErrorMiddleware(Settings({}))
         self.req = Request('http://scrapytest.org')
-        self.res200 = Response('http://scrapytest.org', status=200)
-        self.res200.request = self.req
-        self.res404 = Response('http://scrapytest.org', status=404)
-        self.res404.request = self.req
+        self.res200, self.res404 = _responses(self.req, [200, 404])
     def test_process_spider_input(self):
         self.assertEquals(None,
@@ -43,6 +88,7 @@ class TestHttpErrorMiddleware(TestCase):
         self.assertEquals(None,
                           self.mw.process_spider_input(self.res404, self.spider))
 class TestHttpErrorMiddlewareSettings(TestCase):
     """Similar test, but with settings"""
@@ -50,13 +96,7 @@ class TestHttpErrorMiddlewareSettings(TestCase):
         self.spider = Spider('foo')
         self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOWED_CODES': (402,)}))
         self.req = Request('http://scrapytest.org')
-        self.res200 = Response('http://scrapytest.org', status=200)
-        self.res200.request = self.req
-        self.res404 = Response('http://scrapytest.org', status=404)
-        self.res404.request = self.req
-        self.res402 = Response('http://scrapytest.org', status=402)
-        self.res402.request = self.req
+        self.res200, self.res404, self.res402 = _responses(self.req, [200, 404, 402])
     def test_process_spider_input(self):
         self.assertEquals(None,
@@ -86,19 +126,14 @@ class TestHttpErrorMiddlewareSettings(TestCase):
         self.assertRaises(HttpError,
                           self.mw.process_spider_input, self.res402, self.spider)
 class TestHttpErrorMiddlewareHandleAll(TestCase):
     def setUp(self):
         self.spider = Spider('foo')
         self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOW_ALL': True}))
         self.req = Request('http://scrapytest.org')
-        self.res200 = Response('http://scrapytest.org', status=200)
-        self.res200.request = self.req
-        self.res404 = Response('http://scrapytest.org', status=404)
-        self.res404.request = self.req
-        self.res402 = Response('http://scrapytest.org', status=402)
-        self.res402.request = self.req
+        self.res200, self.res404, self.res402 = _responses(self.req, [200, 404, 402])
     def test_process_spider_input(self):
         self.assertEquals(None,
@@ -119,3 +154,34 @@ class TestHttpErrorMiddlewareHandleAll(TestCase):
         self.assertRaises(HttpError,
                           self.mw.process_spider_input, res402, self.spider)
+
+
+class TestHttpErrorMiddlewareIntegrational(TrialTestCase):
+    def setUp(self):
+        self.mockserver = MockServer()
+        self.mockserver.__enter__()
+
+    def tearDown(self):
+        self.mockserver.__exit__(None, None, None)
+
+    @defer.inlineCallbacks
+    def test_middleware_works(self):
+        spider = _HttpErrorSpider()
+        yield docrawl(spider)
+        assert not spider.skipped, spider.skipped
+        self.assertEqual(spider.parsed, {'200'})
+        self.assertEqual(spider.failed, {'404', '402', '500'})
+
+    @defer.inlineCallbacks
+    def test_logging(self):
+        spider = _HttpErrorSpider(bypass_status_codes={402})
+        yield docrawl(spider)
+        # print(get_testlog())
+        self.assertEqual(spider.parsed, {'200', '402'})
+        self.assertEqual(spider.skipped, {'402'})
+        self.assertEqual(spider.failed, {'404', '500'})
+
+        log = get_testlog()
+        self.assertIn('Ignoring response <404', log)
+        self.assertIn('Ignoring response <500', log)
+        self.assertNotIn('Ignoring response <200', log)
+        self.assertNotIn('Ignoring response <402', log)
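
For reference, the substrings asserted above come from the new log format:
with the patch applied, an ignored response should produce a debug line like
the following (the URL is the mock server's; surrounding text depends on the
log formatter):

Ignoring response <404 http://localhost:8998/status?n=404>: HTTP status code is not handled or not allowed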
scrapy/utils/test.py
@@ -53,11 +53,13 @@ def get_testenv():
 
 def get_testlog():
     """Get Scrapy log of current test, ignoring the rest"""
+    with open("test.log", "rb") as fp:
+        loglines = fp.readlines()
+
     thistest = []
-    loglines = open("test.log").readlines()
-    for l in loglines[::-1]:
-        thistest.append(l)
-        if "[-] -->" in l:
+    for line in loglines[::-1]:
+        thistest.append(line)
+        if "[-] -->" in line:
             break
     return "".join(thistest[::-1])
@@ -67,3 +69,10 @@ def assert_samelines(testcase, text1, text2, msg=None):
     line endings between platforms
     """
     testcase.assertEqual(text1.splitlines(), text2.splitlines(), msg)
+
+def docrawl(spider, settings=None):
+    """Configure and start Crawler; return the result of crawler.start()"""
+    crawler = get_crawler(settings)
+    crawler.configure()
+    crawler.crawl(spider)
+    return crawler.start()
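
Centralizing docrawl in scrapy.utils.test removes three copy-pasted
definitions (the test_crawl.py copy was even missing the crawler.configure()
call, which the shared helper always performs). A minimal usage sketch,
mirroring the integration tests above and assuming the mock server from
scrapy.tests.mockserver is available:

from twisted.internet import defer
from twisted.trial.unittest import TestCase
from scrapy.utils.test import docrawl
from scrapy.tests.mockserver import MockServer
from scrapy.tests.spiders import FollowAllSpider

class DocrawlExample(TestCase):  # hypothetical test case, for illustration
    @defer.inlineCallbacks
    def test_crawl_finishes(self):
        # MockServer serves http://localhost:8998/ for the duration of the crawl
        with MockServer():
            spider = FollowAllSpider()
            yield docrawl(spider)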