Commit 80f5003c authored by Adrián Chaves

Add tests for get_retry_request

Parent 6ab99018
@@ -10,6 +10,7 @@ Failed pages are collected on the scraping process and rescheduled at the end,
once the spider has finished crawling all regular (non failed) pages.
"""
import logging
from inspect import isclass
from twisted.internet import defer
from twisted.internet.error import (
@@ -81,8 +82,8 @@ def get_retry_request(
retry_times = request.meta.get('retry_times', 0) + 1
if max_retry_times is None:
max_retry_times = request.meta.get('max_retry_times')
if max_retry_times is None:
max_retry_times = settings.getint('RETRY_TIMES')
if max_retry_times is None:
max_retry_times = settings.getint('RETRY_TIMES')
if retry_times <= max_retry_times:
logger.debug(
"Retrying %(request)s (failed %(retry_times)d times): %(reason)s",
@@ -96,6 +97,8 @@
priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
new_request.priority = request.priority + priority_adjust
if isclass(reason):
reason = reason()
if isinstance(reason, Exception):
reason = global_object_name(reason.__class__)
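The normalization above is what feeds the retry/reason_count/<reason> stat and the log message. As a minimal sketch (not part of the commit; the helper name normalize_reason is illustrative), the behaviour amounts to:

from scrapy.utils.python import global_object_name

def normalize_reason(reason):
    if isinstance(reason, type):       # an exception class was passed in
        reason = reason()              # the patch does the same via isclass()
    if isinstance(reason, Exception):  # an exception instance
        reason = global_object_name(reason.__class__)
    return reason                      # plain strings pass through unchanged

assert normalize_reason('because') == 'because'
assert normalize_reason(NotImplementedError) == 'builtins.NotImplementedError'
assert normalize_reason(NotImplementedError()) == 'builtins.NotImplementedError'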
@@ -149,10 +152,12 @@ class RetryMiddleware:
return self._retry(request, exception, spider)
def _retry(self, request, reason, spider):
max_retry_times = request.meta.get('max_retry_times', self.max_retry_times)
priority_adjust = request.meta.get('priority_adjust', self.priority_adjust)
return get_retry_request(
request,
reason=reason,
spider=spider,
max_retry_times=self.max_retry_times,
priority_adjust=self.priority_adjust,
max_retry_times=max_retry_times,
priority_adjust=priority_adjust,
)
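With this change, per-request values in Request.meta take precedence over the middleware defaults. A hedged sketch of a request using those meta keys (the URL and values are illustrative, not from the commit):

from scrapy.http import Request

request = Request(
    'https://example.com/flaky',
    meta={
        'max_retry_times': 5,    # overrides the RETRY_TIMES setting for this request
        'priority_adjust': -2,   # overrides RETRY_PRIORITY_ADJUST for this request
        # 'dont_retry': True,    # would disable retries for this request entirely
    },
)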
@@ -4,18 +4,22 @@ from twisted.internet.error import (
ConnectError,
ConnectionDone,
ConnectionLost,
ConnectionRefusedError,
DNSLookupError,
TCPTimedOutError,
TimeoutError,
)
from twisted.web.client import ResponseFailed
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.spiders import Spider
from scrapy.downloadermiddlewares.retry import (
get_retry_request,
RetryMiddleware,
)
from scrapy.exceptions import IgnoreRequest
from scrapy.http import Request, Response
from scrapy.spiders import Spider
from scrapy.utils.test import get_crawler
from testfixtures import LogCapture
class RetryTest(unittest.TestCase):
def setUp(self):
@@ -119,82 +123,480 @@ class RetryTest(unittest.TestCase):
class MaxRetryTimesTest(unittest.TestCase):
def setUp(self):
self.crawler = get_crawler(Spider)
self.spider = self.crawler._create_spider('foo')
self.mw = RetryMiddleware.from_crawler(self.crawler)
self.mw.max_retry_times = 2
self.invalid_url = 'http://www.scrapytest.org/invalid_url'
def test_with_settings_zero(self):
invalid_url = 'http://www.scrapytest.org/invalid_url'
# SETTINGS: RETRY_TIMES = 0
self.mw.max_retry_times = 0
def get_spider_and_middleware(self, settings=None):
crawler = get_crawler(Spider, settings or {})
spider = crawler._create_spider('foo')
middleware = RetryMiddleware.from_crawler(crawler)
return spider, middleware
def test_with_settings_zero(self):
max_retry_times = 0
settings = {'RETRY_TIMES': max_retry_times}
spider, middleware = self.get_spider_and_middleware(settings)
req = Request(self.invalid_url)
self._test_retry(req, DNSLookupError('foo'), self.mw.max_retry_times)
self._test_retry(
req,
DNSLookupError('foo'),
max_retry_times,
spider=spider,
middleware=middleware,
)
def test_with_metakey_zero(self):
# SETTINGS: meta(max_retry_times) = 0
meta_max_retry_times = 0
req = Request(self.invalid_url, meta={'max_retry_times': meta_max_retry_times})
self._test_retry(req, DNSLookupError('foo'), meta_max_retry_times)
max_retry_times = 0
spider, middleware = self.get_spider_and_middleware()
meta = {'max_retry_times': max_retry_times}
req = Request(self.invalid_url, meta=meta)
self._test_retry(
req,
DNSLookupError('foo'),
max_retry_times,
spider=spider,
middleware=middleware,
)
def test_without_metakey(self):
# SETTINGS: RETRY_TIMES is NON-ZERO
self.mw.max_retry_times = 5
max_retry_times = 5
settings = {'RETRY_TIMES': max_retry_times}
spider, middleware = self.get_spider_and_middleware(settings)
req = Request(self.invalid_url)
self._test_retry(req, DNSLookupError('foo'), self.mw.max_retry_times)
self._test_retry(
req,
DNSLookupError('foo'),
max_retry_times,
spider=spider,
middleware=middleware,
)
def test_with_metakey_greater(self):
# SETTINGS: RETRY_TIMES < meta(max_retry_times)
self.mw.max_retry_times = 2
meta_max_retry_times = 3
middleware_max_retry_times = 2
req1 = Request(self.invalid_url, meta={'max_retry_times': meta_max_retry_times})
req2 = Request(self.invalid_url)
self._test_retry(req1, DNSLookupError('foo'), meta_max_retry_times)
self._test_retry(req2, DNSLookupError('foo'), self.mw.max_retry_times)
settings = {'RETRY_TIMES': middleware_max_retry_times}
spider, middleware = self.get_spider_and_middleware(settings)
self._test_retry(
req1,
DNSLookupError('foo'),
meta_max_retry_times,
spider=spider,
middleware=middleware,
)
self._test_retry(
req2,
DNSLookupError('foo'),
middleware_max_retry_times,
spider=spider,
middleware=middleware,
)
def test_with_metakey_lesser(self):
# SETTINGS: RETRY_TIMES > meta(max_retry_times)
self.mw.max_retry_times = 5
meta_max_retry_times = 4
middleware_max_retry_times = 5
req1 = Request(self.invalid_url, meta={'max_retry_times': meta_max_retry_times})
req2 = Request(self.invalid_url)
self._test_retry(req1, DNSLookupError('foo'), meta_max_retry_times)
self._test_retry(req2, DNSLookupError('foo'), self.mw.max_retry_times)
settings = {'RETRY_TIMES': middleware_max_retry_times}
spider, middleware = self.get_spider_and_middleware(settings)
self._test_retry(
req1,
DNSLookupError('foo'),
meta_max_retry_times,
spider=spider,
middleware=middleware,
)
self._test_retry(
req2,
DNSLookupError('foo'),
middleware_max_retry_times,
spider=spider,
middleware=middleware,
)
def test_with_dont_retry(self):
# SETTINGS: meta(max_retry_times) = 4
meta_max_retry_times = 4
req = Request(self.invalid_url, meta={
'max_retry_times': meta_max_retry_times, 'dont_retry': True
})
self._test_retry(req, DNSLookupError('foo'), 0)
def _test_retry(self, req, exception, max_retry_times):
max_retry_times = 4
spider, middleware = self.get_spider_and_middleware()
meta = {
'max_retry_times': max_retry_times,
'dont_retry': True,
}
req = Request(self.invalid_url, meta=meta)
self._test_retry(
req,
DNSLookupError('foo'),
0,
spider=spider,
middleware=middleware,
)
def _test_retry(
self,
req,
exception,
max_retry_times,
spider=None,
middleware=None,
):
spider = spider or self.spider
middleware = middleware or self.mw
for i in range(0, max_retry_times):
req = self.mw.process_exception(req, exception, self.spider)
req = middleware.process_exception(req, exception, spider)
assert isinstance(req, Request)
# discard it
req = self.mw.process_exception(req, exception, self.spider)
req = middleware.process_exception(req, exception, spider)
self.assertEqual(req, None)
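The new test class below exercises get_retry_request directly. For context, a hedged usage sketch from a spider callback (the spider name, URL, and retry condition are illustrative; only the get_retry_request signature comes from the commit):

from scrapy.downloadermiddlewares.retry import get_retry_request
from scrapy.spiders import Spider

class ExampleSpider(Spider):
    name = 'example'
    start_urls = ['https://example.com']

    def parse(self, response):
        if response.status != 200:  # hypothetical condition worth retrying
            new_request = get_retry_request(
                response.request,
                spider=self,                 # required; raises TypeError if omitted
                reason='unexpected status',  # counted under retry/reason_count/<reason>
                max_retry_times=2,           # falls back to meta, then RETRY_TIMES
            )
            if new_request is not None:      # None once the retry limit is reached
                yield new_request
            return
        # ... normal parsing would go here ...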
class GetRetryRequestTest(unittest.TestCase):
def get_spider(self, settings=None):
crawler = get_crawler(Spider, settings or {})
return crawler._create_spider('foo')
def test_basic_usage(self):
request = Request('https://example.com')
spider = self.get_spider()
with LogCapture() as log:
new_request = get_retry_request(
request,
spider=spider,
)
self.assertIsInstance(new_request, Request)
self.assertNotEqual(new_request, request)
self.assertEqual(new_request.dont_filter, True)
expected_retry_times = 1
self.assertEqual(new_request.meta['retry_times'], expected_retry_times)
self.assertEqual(new_request.priority, -1)
expected_reason = "unspecified"
for stat in ('retry/count', f'retry/reason_count/{expected_reason}'):
self.assertEqual(spider.crawler.stats.get_value(stat), 1)
log.check_present(
(
"scrapy.downloadermiddlewares.retry",
"DEBUG",
f"Retrying {request} (failed {expected_retry_times} times): "
f"{expected_reason}",
)
)
def test_max_retries_reached(self):
request = Request('https://example.com')
spider = self.get_spider()
max_retry_times = 0
with LogCapture() as log:
new_request = get_retry_request(
request,
spider=spider,
max_retry_times=max_retry_times,
)
self.assertEqual(new_request, None)
self.assertEqual(
spider.crawler.stats.get_value('retry/max_reached'),
1
)
failure_count = max_retry_times + 1
expected_reason = "unspecified"
log.check_present(
(
"scrapy.downloadermiddlewares.retry",
"ERROR",
f"Gave up retrying {request} (failed {failure_count} times): "
f"{expected_reason}",
)
)
def test_one_retry(self):
request = Request('https://example.com')
spider = self.get_spider()
with LogCapture() as log:
new_request = get_retry_request(
request,
spider=spider,
max_retry_times=1,
)
self.assertIsInstance(new_request, Request)
self.assertNotEqual(new_request, request)
self.assertEqual(new_request.dont_filter, True)
expected_retry_times = 1
self.assertEqual(new_request.meta['retry_times'], expected_retry_times)
self.assertEqual(new_request.priority, -1)
expected_reason = "unspecified"
for stat in ('retry/count', f'retry/reason_count/{expected_reason}'):
self.assertEqual(spider.crawler.stats.get_value(stat), 1)
log.check_present(
(
"scrapy.downloadermiddlewares.retry",
"DEBUG",
f"Retrying {request} (failed {expected_retry_times} times): "
f"{expected_reason}",
)
)
def test_two_retries(self):
spider = self.get_spider()
request = Request('https://example.com')
new_request = request
max_retry_times = 2
for index in range(max_retry_times):
with LogCapture() as log:
new_request = get_retry_request(
new_request,
spider=spider,
max_retry_times=max_retry_times,
)
self.assertIsInstance(new_request, Request)
self.assertNotEqual(new_request, request)
self.assertEqual(new_request.dont_filter, True)
expected_retry_times = index+1
self.assertEqual(new_request.meta['retry_times'], expected_retry_times)
self.assertEqual(new_request.priority, -expected_retry_times)
expected_reason = "unspecified"
for stat in ('retry/count', f'retry/reason_count/{expected_reason}'):
value = spider.crawler.stats.get_value(stat)
self.assertEqual(value, expected_retry_times)
log.check_present(
(
"scrapy.downloadermiddlewares.retry",
"DEBUG",
f"Retrying {request} (failed {expected_retry_times} times): "
f"{expected_reason}",
)
)
with LogCapture() as log:
new_request = get_retry_request(
new_request,
spider=spider,
max_retry_times=max_retry_times,
)
self.assertEqual(new_request, None)
self.assertEqual(
spider.crawler.stats.get_value('retry/max_reached'),
1
)
failure_count = max_retry_times + 1
expected_reason = "unspecified"
log.check_present(
(
"scrapy.downloadermiddlewares.retry",
"ERROR",
f"Gave up retrying {request} (failed {failure_count} times): "
f"{expected_reason}",
)
)
def test_no_spider(self):
request = Request('https://example.com')
with self.assertRaises(TypeError):
get_retry_request(request)
def test_max_retry_times_setting(self):
max_retry_times = 0
spider = self.get_spider({'RETRY_TIMES': max_retry_times})
request = Request('https://example.com')
new_request = get_retry_request(
request,
spider=spider,
)
self.assertEqual(new_request, None)
def test_max_retry_times_meta(self):
max_retry_times = 0
spider = self.get_spider({'RETRY_TIMES': max_retry_times + 1})
meta = {'max_retry_times': max_retry_times}
request = Request('https://example.com', meta=meta)
new_request = get_retry_request(
request,
spider=spider,
)
self.assertEqual(new_request, None)
def test_max_retry_times_argument(self):
max_retry_times = 0
spider = self.get_spider({'RETRY_TIMES': max_retry_times + 1})
meta = {'max_retry_times': max_retry_times + 1}
request = Request('https://example.com', meta=meta)
new_request = get_retry_request(
request,
spider=spider,
max_retry_times=max_retry_times,
)
self.assertEqual(new_request, None)
def test_priority_adjust_setting(self):
priority_adjust = 1
spider = self.get_spider({'RETRY_PRIORITY_ADJUST': priority_adjust})
request = Request('https://example.com')
new_request = get_retry_request(
request,
spider=spider,
)
self.assertEqual(new_request.priority, priority_adjust)
def test_priority_adjust_argument(self):
priority_adjust = 1
spider = self.get_spider({'RETRY_PRIORITY_ADJUST': priority_adjust+1})
request = Request('https://example.com')
new_request = get_retry_request(
request,
spider=spider,
priority_adjust=priority_adjust,
)
self.assertEqual(new_request.priority, priority_adjust)
def test_log_extra_retry_success(self):
request = Request('https://example.com')
spider = self.get_spider()
with LogCapture(attributes=('spider',)) as log:
new_request = get_retry_request(
request,
spider=spider,
)
log.check_present(spider)
def test_log_extra_retries_exceeded(self):
request = Request('https://example.com')
spider = self.get_spider()
with LogCapture(attributes=('spider',)) as log:
new_request = get_retry_request(
request,
spider=spider,
max_retry_times=0,
)
log.check_present(spider)
def test_reason_string(self):
request = Request('https://example.com')
spider = self.get_spider()
expected_reason = 'because'
with LogCapture() as log:
new_request = get_retry_request(
request,
spider=spider,
reason=expected_reason,
)
expected_retry_times = 1
for stat in ('retry/count', f'retry/reason_count/{expected_reason}'):
self.assertEqual(spider.crawler.stats.get_value(stat), 1)
log.check_present(
(
"scrapy.downloadermiddlewares.retry",
"DEBUG",
f"Retrying {request} (failed {expected_retry_times} times): "
f"{expected_reason}",
)
)
def test_reason_builtin_exception(self):
request = Request('https://example.com')
spider = self.get_spider()
expected_reason = NotImplementedError()
expected_reason_string = 'builtins.NotImplementedError'
with LogCapture() as log:
new_request = get_retry_request(
request,
spider=spider,
reason=expected_reason,
)
expected_retry_times = 1
stat = spider.crawler.stats.get_value(
f'retry/reason_count/{expected_reason_string}'
)
self.assertEqual(stat, 1)
log.check_present(
(
"scrapy.downloadermiddlewares.retry",
"DEBUG",
f"Retrying {request} (failed {expected_retry_times} times): "
f"{expected_reason}",
)
)
def test_reason_builtin_exception_class(self):
request = Request('https://example.com')
spider = self.get_spider()
expected_reason = NotImplementedError
expected_reason_string = 'builtins.NotImplementedError'
with LogCapture() as log:
new_request = get_retry_request(
request,
spider=spider,
reason=expected_reason,
)
expected_retry_times = 1
stat = spider.crawler.stats.get_value(
f'retry/reason_count/{expected_reason_string}'
)
self.assertEqual(stat, 1)
log.check_present(
(
"scrapy.downloadermiddlewares.retry",
"DEBUG",
f"Retrying {request} (failed {expected_retry_times} times): "
f"{expected_reason}",
)
)
def test_reason_custom_exception(self):
request = Request('https://example.com')
spider = self.get_spider()
expected_reason = IgnoreRequest()
expected_reason_string = 'scrapy.exceptions.IgnoreRequest'
with LogCapture() as log:
new_request = get_retry_request(
request,
spider=spider,
reason=expected_reason,
)
expected_retry_times = 1
stat = spider.crawler.stats.get_value(
f'retry/reason_count/{expected_reason_string}'
)
self.assertEqual(stat, 1)
log.check_present(
(
"scrapy.downloadermiddlewares.retry",
"DEBUG",
f"Retrying {request} (failed {expected_retry_times} times): "
f"{expected_reason}",
)
)
def test_reason_custom_exception_class(self):
request = Request('https://example.com')
spider = self.get_spider()
expected_reason = IgnoreRequest
expected_reason_string = 'scrapy.exceptions.IgnoreRequest'
with LogCapture() as log:
new_request = get_retry_request(
request,
spider=spider,
reason=expected_reason,
)
expected_retry_times = 1
stat = spider.crawler.stats.get_value(
f'retry/reason_count/{expected_reason_string}'
)
self.assertEqual(stat, 1)
log.check_present(
(
"scrapy.downloadermiddlewares.retry",
"DEBUG",
f"Retrying {request} (failed {expected_retry_times} times): "
f"{expected_reason}",
)
)
if __name__ == "__main__":
unittest.main()