looyolo / scrapy — a fork, in sync with its (now inaccessible) upstream project

Commit 80f5003c
Authored on Feb 22, 2021 by Adrián Chaves
Parent: 6ab99018

    Add tests for get_retry_request

Showing 2 changed files with 459 additions and 52 deletions:

  scrapy/downloadermiddlewares/retry.py       +9    -4
  tests/test_downloadermiddleware_retry.py    +450  -48
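A note on what is under test: get_retry_request is a then-new public helper that builds a retry copy of a request from spider code, updating the retry_times meta key, the request priority, and the retry stats, and returning None once retries are exhausted. As orientation, a minimal usage sketch follows; the spider, status check, and reason string are illustrative and not part of this commit:

    from scrapy.downloadermiddlewares.retry import get_retry_request
    from scrapy.spiders import Spider

    class ExampleSpider(Spider):  # hypothetical spider, for illustration only
        name = 'example'

        def parse(self, response):
            if response.status == 429:  # assumed "retry later" condition
                new_request = get_retry_request(
                    response.request,
                    spider=self,
                    reason='throttled',  # a string or an exception (class) works
                )
                if new_request is not None:  # None means retries are exhausted
                    yield new_request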
scrapy/downloadermiddlewares/retry.py

@@ -10,6 +10,7 @@ Failed pages are collected on the scraping process and rescheduled at the end,
 once the spider has finished crawling all regular (non failed) pages.
 """
 import logging
+from inspect import isclass
 
 from twisted.internet import defer
 from twisted.internet.error import (
@@ -81,8 +82,8 @@ def get_retry_request(
     retry_times = request.meta.get('retry_times', 0) + 1
     if max_retry_times is None:
         max_retry_times = request.meta.get('max_retry_times')
-        if max_retry_times is None:
-            max_retry_times = settings.getint('RETRY_TIMES')
+    if max_retry_times is None:
+        max_retry_times = settings.getint('RETRY_TIMES')
     if retry_times <= max_retry_times:
         logger.debug(
             "Retrying %(request)s (failed %(retry_times)d times): %(reason)s",
@@ -96,6 +97,8 @@ def get_retry_request(
             priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
         new_request.priority = request.priority + priority_adjust
 
+        if isclass(reason):
+            reason = reason()
         if isinstance(reason, Exception):
             reason = global_object_name(reason.__class__)
@@ -149,10 +152,12 @@ class RetryMiddleware:
             return self._retry(request, exception, spider)
 
     def _retry(self, request, reason, spider):
+        max_retry_times = request.meta.get('max_retry_times', self.max_retry_times)
+        priority_adjust = request.meta.get('priority_adjust', self.priority_adjust)
         return get_retry_request(
             request,
             reason=reason,
             spider=spider,
-            max_retry_times=self.max_retry_times,
-            priority_adjust=self.priority_adjust,
+            max_retry_times=max_retry_times,
+            priority_adjust=priority_adjust,
         )
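Two behavioral notes on the retry.py changes: reason may now be an exception class (it is instantiated before being reduced to a dotted name), and RetryMiddleware._retry now lets per-request meta keys override the middleware-wide defaults before delegating to get_retry_request. A runnable sketch of that precedence rule, with illustrative numbers:

    from scrapy.http import Request

    middleware_max_retry_times = 2  # what the RETRY_TIMES setting would supply
    request = Request('https://example.com', meta={'max_retry_times': 5})

    # Mirrors the new lookup in RetryMiddleware._retry:
    max_retry_times = request.meta.get('max_retry_times', middleware_max_retry_times)
    assert max_retry_times == 5  # the per-request meta value wins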
tests/test_downloadermiddleware_retry.py

@@ -4,18 +4,22 @@ from twisted.internet.error import (
     ConnectError,
     ConnectionDone,
     ConnectionLost,
     ConnectionRefusedError,
     DNSLookupError,
     TCPTimedOutError,
     TimeoutError,
 )
 from twisted.web.client import ResponseFailed
 
-from scrapy.downloadermiddlewares.retry import RetryMiddleware
-from scrapy.spiders import Spider
+from scrapy.downloadermiddlewares.retry import (
+    get_retry_request,
+    RetryMiddleware,
+)
 from scrapy.exceptions import IgnoreRequest
 from scrapy.http import Request, Response
+from scrapy.spiders import Spider
 from scrapy.utils.test import get_crawler
+from testfixtures import LogCapture
 
 
 class RetryTest(unittest.TestCase):
 
     def setUp(self):
@@ -119,82 +123,480 @@ class RetryTest(unittest.TestCase):
 class MaxRetryTimesTest(unittest.TestCase):
     def setUp(self):
         self.crawler = get_crawler(Spider)
         self.spider = self.crawler._create_spider('foo')
         self.mw = RetryMiddleware.from_crawler(self.crawler)
         self.mw.max_retry_times = 2
         self.invalid_url = 'http://www.scrapytest.org/invalid_url'
 
-    def test_with_settings_zero(self):
-        invalid_url = 'http://www.scrapytest.org/invalid_url'
-        # SETTINGS: RETRY_TIMES = 0
-        self.mw.max_retry_times = 0
+    def get_spider_and_middleware(self, settings=None):
+        crawler = get_crawler(Spider, settings or {})
+        spider = crawler._create_spider('foo')
+        middleware = RetryMiddleware.from_crawler(crawler)
+        return spider, middleware
+
+    def test_with_settings_zero(self):
+        max_retry_times = 0
+        settings = {'RETRY_TIMES': max_retry_times}
+        spider, middleware = self.get_spider_and_middleware(settings)
         req = Request(self.invalid_url)
-        self._test_retry(req, DNSLookupError('foo'), self.mw.max_retry_times)
+        self._test_retry(
+            req,
+            DNSLookupError('foo'),
+            max_retry_times,
+            spider=spider,
+            middleware=middleware,
+        )
 
     def test_with_metakey_zero(self):
-        # SETTINGS: meta(max_retry_times) = 0
-        meta_max_retry_times = 0
-        req = Request(self.invalid_url, meta={'max_retry_times': meta_max_retry_times})
-        self._test_retry(req, DNSLookupError('foo'), meta_max_retry_times)
+        max_retry_times = 0
+        spider, middleware = self.get_spider_and_middleware()
+        meta = {'max_retry_times': max_retry_times}
+        req = Request(self.invalid_url, meta=meta)
+        self._test_retry(
+            req,
+            DNSLookupError('foo'),
+            max_retry_times,
+            spider=spider,
+            middleware=middleware,
+        )
 
     def test_without_metakey(self):
-        # SETTINGS: RETRY_TIMES is NON-ZERO
-        self.mw.max_retry_times = 5
+        max_retry_times = 5
+        settings = {'RETRY_TIMES': max_retry_times}
+        spider, middleware = self.get_spider_and_middleware(settings)
         req = Request(self.invalid_url)
-        self._test_retry(req, DNSLookupError('foo'), self.mw.max_retry_times)
+        self._test_retry(
+            req,
+            DNSLookupError('foo'),
+            max_retry_times,
+            spider=spider,
+            middleware=middleware,
+        )
 
     def test_with_metakey_greater(self):
-        # SETINGS: RETRY_TIMES < meta(max_retry_times)
-        self.mw.max_retry_times = 2
         meta_max_retry_times = 3
+        middleware_max_retry_times = 2
         req1 = Request(self.invalid_url, meta={'max_retry_times': meta_max_retry_times})
         req2 = Request(self.invalid_url)
-        self._test_retry(req1, DNSLookupError('foo'), meta_max_retry_times)
-        self._test_retry(req2, DNSLookupError('foo'), self.mw.max_retry_times)
+        settings = {'RETRY_TIMES': middleware_max_retry_times}
+        spider, middleware = self.get_spider_and_middleware(settings)
+        self._test_retry(
+            req1,
+            DNSLookupError('foo'),
+            meta_max_retry_times,
+            spider=spider,
+            middleware=middleware,
+        )
+        self._test_retry(
+            req2,
+            DNSLookupError('foo'),
+            middleware_max_retry_times,
+            spider=spider,
+            middleware=middleware,
+        )
 
     def test_with_metakey_lesser(self):
-        # SETINGS: RETRY_TIMES > meta(max_retry_times)
-        self.mw.max_retry_times = 5
         meta_max_retry_times = 4
+        middleware_max_retry_times = 5
         req1 = Request(self.invalid_url, meta={'max_retry_times': meta_max_retry_times})
         req2 = Request(self.invalid_url)
-        self._test_retry(req1, DNSLookupError('foo'), meta_max_retry_times)
-        self._test_retry(req2, DNSLookupError('foo'), self.mw.max_retry_times)
+        settings = {'RETRY_TIMES': middleware_max_retry_times}
+        spider, middleware = self.get_spider_and_middleware(settings)
+        self._test_retry(
+            req1,
+            DNSLookupError('foo'),
+            meta_max_retry_times,
+            spider=spider,
+            middleware=middleware,
+        )
+        self._test_retry(
+            req2,
+            DNSLookupError('foo'),
+            middleware_max_retry_times,
+            spider=spider,
+            middleware=middleware,
+        )
 
     def test_with_dont_retry(self):
-        # SETTINGS: meta(max_retry_times) = 4
-        meta_max_retry_times = 4
-        req = Request(self.invalid_url, meta={'max_retry_times': meta_max_retry_times, 'dont_retry': True})
-        self._test_retry(req, DNSLookupError('foo'), 0)
-
-    def _test_retry(self, req, exception, max_retry_times):
+        max_retry_times = 4
+        spider, middleware = self.get_spider_and_middleware()
+        meta = {
+            'max_retry_times': max_retry_times,
+            'dont_retry': True,
+        }
+        req = Request(self.invalid_url, meta=meta)
+        self._test_retry(
+            req,
+            DNSLookupError('foo'),
+            0,
+            spider=spider,
+            middleware=middleware,
+        )
+
+    def _test_retry(
+        self,
+        req,
+        exception,
+        max_retry_times,
+        spider=None,
+        middleware=None,
+    ):
+        spider = spider or self.spider
+        middleware = middleware or self.mw
         for i in range(0, max_retry_times):
-            req = self.mw.process_exception(req, exception, self.spider)
+            req = middleware.process_exception(req, exception, spider)
             assert isinstance(req, Request)
 
         # discard it
-        req = self.mw.process_exception(req, exception, self.spider)
+        req = middleware.process_exception(req, exception, spider)
         self.assertEqual(req, None)
 
 
+class GetRetryRequestTest(unittest.TestCase):
+    def get_spider(self, settings=None):
+        crawler = get_crawler(Spider, settings or {})
+        return crawler._create_spider('foo')
+
+    def test_basic_usage(self):
+        request = Request('https://example.com')
+        spider = self.get_spider()
+        with LogCapture() as log:
+            new_request = get_retry_request(
+                request,
+                spider=spider,
+            )
+        self.assertIsInstance(new_request, Request)
+        self.assertNotEqual(new_request, request)
+        self.assertEqual(new_request.dont_filter, True)
+        expected_retry_times = 1
+        self.assertEqual(new_request.meta['retry_times'], expected_retry_times)
+        self.assertEqual(new_request.priority, -1)
+        expected_reason = "unspecified"
+        for stat in (
+            'retry/count',
+            f'retry/reason_count/{expected_reason}'
+        ):
+            self.assertEqual(spider.crawler.stats.get_value(stat), 1)
+        log.check_present(
+            (
+                "scrapy.downloadermiddlewares.retry",
+                "DEBUG",
+                f"Retrying {request} (failed {expected_retry_times} times): "
+                f"{expected_reason}",
+            )
+        )
+
+    def test_max_retries_reached(self):
+        request = Request('https://example.com')
+        spider = self.get_spider()
+        max_retry_times = 0
+        with LogCapture() as log:
+            new_request = get_retry_request(
+                request,
+                spider=spider,
+                max_retry_times=max_retry_times,
+            )
+        self.assertEqual(new_request, None)
+        self.assertEqual(spider.crawler.stats.get_value('retry/max_reached'), 1)
+        failure_count = max_retry_times + 1
+        expected_reason = "unspecified"
+        log.check_present(
+            (
+                "scrapy.downloadermiddlewares.retry",
+                "ERROR",
+                f"Gave up retrying {request} (failed {failure_count} times): "
+                f"{expected_reason}",
+            )
+        )
+
+    def test_one_retry(self):
+        request = Request('https://example.com')
+        spider = self.get_spider()
+        with LogCapture() as log:
+            new_request = get_retry_request(
+                request,
+                spider=spider,
+                max_retry_times=1,
+            )
+        self.assertIsInstance(new_request, Request)
+        self.assertNotEqual(new_request, request)
+        self.assertEqual(new_request.dont_filter, True)
+        expected_retry_times = 1
+        self.assertEqual(new_request.meta['retry_times'], expected_retry_times)
+        self.assertEqual(new_request.priority, -1)
+        expected_reason = "unspecified"
+        for stat in (
+            'retry/count',
+            f'retry/reason_count/{expected_reason}'
+        ):
+            self.assertEqual(spider.crawler.stats.get_value(stat), 1)
+        log.check_present(
+            (
+                "scrapy.downloadermiddlewares.retry",
+                "DEBUG",
+                f"Retrying {request} (failed {expected_retry_times} times): "
+                f"{expected_reason}",
+            )
+        )
+
+    def test_two_retries(self):
+        spider = self.get_spider()
+        request = Request('https://example.com')
+        new_request = request
+        max_retry_times = 2
+        for index in range(max_retry_times):
+            with LogCapture() as log:
+                new_request = get_retry_request(
+                    new_request,
+                    spider=spider,
+                    max_retry_times=max_retry_times,
+                )
+            self.assertIsInstance(new_request, Request)
+            self.assertNotEqual(new_request, request)
+            self.assertEqual(new_request.dont_filter, True)
+            expected_retry_times = index + 1
+            self.assertEqual(new_request.meta['retry_times'], expected_retry_times)
+            self.assertEqual(new_request.priority, -expected_retry_times)
+            expected_reason = "unspecified"
+            for stat in (
+                'retry/count',
+                f'retry/reason_count/{expected_reason}'
+            ):
+                value = spider.crawler.stats.get_value(stat)
+                self.assertEqual(value, expected_retry_times)
+            log.check_present(
+                (
+                    "scrapy.downloadermiddlewares.retry",
+                    "DEBUG",
+                    f"Retrying {request} (failed {expected_retry_times} times): "
+                    f"{expected_reason}",
+                )
+            )
+        with LogCapture() as log:
+            new_request = get_retry_request(
+                new_request,
+                spider=spider,
+                max_retry_times=max_retry_times,
+            )
+        self.assertEqual(new_request, None)
+        self.assertEqual(spider.crawler.stats.get_value('retry/max_reached'), 1)
+        failure_count = max_retry_times + 1
+        expected_reason = "unspecified"
+        log.check_present(
+            (
+                "scrapy.downloadermiddlewares.retry",
+                "ERROR",
+                f"Gave up retrying {request} (failed {failure_count} times): "
+                f"{expected_reason}",
+            )
+        )
+
+    def test_no_spider(self):
+        request = Request('https://example.com')
+        with self.assertRaises(TypeError):
+            get_retry_request(request)
+
+    def test_max_retry_times_setting(self):
+        max_retry_times = 0
+        spider = self.get_spider({'RETRY_TIMES': max_retry_times})
+        request = Request('https://example.com')
+        new_request = get_retry_request(
+            request,
+            spider=spider,
+        )
+        self.assertEqual(new_request, None)
+
+    def test_max_retry_times_meta(self):
+        max_retry_times = 0
+        spider = self.get_spider({'RETRY_TIMES': max_retry_times + 1})
+        meta = {'max_retry_times': max_retry_times}
+        request = Request('https://example.com', meta=meta)
+        new_request = get_retry_request(
+            request,
+            spider=spider,
+        )
+        self.assertEqual(new_request, None)
+
+    def test_max_retry_times_argument(self):
+        max_retry_times = 0
+        spider = self.get_spider({'RETRY_TIMES': max_retry_times + 1})
+        meta = {'max_retry_times': max_retry_times + 1}
+        request = Request('https://example.com', meta=meta)
+        new_request = get_retry_request(
+            request,
+            spider=spider,
+            max_retry_times=max_retry_times,
+        )
+        self.assertEqual(new_request, None)
+
+    def test_priority_adjust_setting(self):
+        priority_adjust = 1
+        spider = self.get_spider({'RETRY_PRIORITY_ADJUST': priority_adjust})
+        request = Request('https://example.com')
+        new_request = get_retry_request(
+            request,
+            spider=spider,
+        )
+        self.assertEqual(new_request.priority, priority_adjust)
+
+    def test_priority_adjust_argument(self):
+        priority_adjust = 1
+        spider = self.get_spider({'RETRY_PRIORITY_ADJUST': priority_adjust + 1})
+        request = Request('https://example.com')
+        new_request = get_retry_request(
+            request,
+            spider=spider,
+            priority_adjust=priority_adjust,
+        )
+        self.assertEqual(new_request.priority, priority_adjust)
+
+    def test_log_extra_retry_success(self):
+        request = Request('https://example.com')
+        spider = self.get_spider()
+        with LogCapture(attributes=('spider',)) as log:
+            new_request = get_retry_request(
+                request,
+                spider=spider,
+            )
+        log.check_present(spider)
+
+    def test_log_extra_retries_exceeded(self):
+        request = Request('https://example.com')
+        spider = self.get_spider()
+        with LogCapture(attributes=('spider',)) as log:
+            new_request = get_retry_request(
+                request,
+                spider=spider,
+                max_retry_times=0,
+            )
+        log.check_present(spider)
+
+    def test_reason_string(self):
+        request = Request('https://example.com')
+        spider = self.get_spider()
+        expected_reason = 'because'
+        with LogCapture() as log:
+            new_request = get_retry_request(
+                request,
+                spider=spider,
+                reason=expected_reason,
+            )
+        expected_retry_times = 1
+        for stat in (
+            'retry/count',
+            f'retry/reason_count/{expected_reason}'
+        ):
+            self.assertEqual(spider.crawler.stats.get_value(stat), 1)
+        log.check_present(
+            (
+                "scrapy.downloadermiddlewares.retry",
+                "DEBUG",
+                f"Retrying {request} (failed {expected_retry_times} times): "
+                f"{expected_reason}",
+            )
+        )
+
+    def test_reason_builtin_exception(self):
+        request = Request('https://example.com')
+        spider = self.get_spider()
+        expected_reason = NotImplementedError()
+        expected_reason_string = 'builtins.NotImplementedError'
+        with LogCapture() as log:
+            new_request = get_retry_request(
+                request,
+                spider=spider,
+                reason=expected_reason,
+            )
+        expected_retry_times = 1
+        stat = spider.crawler.stats.get_value(
+            f'retry/reason_count/{expected_reason_string}'
+        )
+        self.assertEqual(stat, 1)
+        log.check_present(
+            (
+                "scrapy.downloadermiddlewares.retry",
+                "DEBUG",
+                f"Retrying {request} (failed {expected_retry_times} times): "
+                f"{expected_reason}",
+            )
+        )
+
+    def test_reason_builtin_exception_class(self):
+        request = Request('https://example.com')
+        spider = self.get_spider()
+        expected_reason = NotImplementedError
+        expected_reason_string = 'builtins.NotImplementedError'
+        with LogCapture() as log:
+            new_request = get_retry_request(
+                request,
+                spider=spider,
+                reason=expected_reason,
+            )
+        expected_retry_times = 1
+        stat = spider.crawler.stats.get_value(
+            f'retry/reason_count/{expected_reason_string}'
+        )
+        self.assertEqual(stat, 1)
+        log.check_present(
+            (
+                "scrapy.downloadermiddlewares.retry",
+                "DEBUG",
+                f"Retrying {request} (failed {expected_retry_times} times): "
+                f"{expected_reason}",
+            )
+        )
+
+    def test_reason_custom_exception(self):
+        request = Request('https://example.com')
+        spider = self.get_spider()
+        expected_reason = IgnoreRequest()
+        expected_reason_string = 'scrapy.exceptions.IgnoreRequest'
+        with LogCapture() as log:
+            new_request = get_retry_request(
+                request,
+                spider=spider,
+                reason=expected_reason,
+            )
+        expected_retry_times = 1
+        stat = spider.crawler.stats.get_value(
+            f'retry/reason_count/{expected_reason_string}'
+        )
+        self.assertEqual(stat, 1)
+        log.check_present(
+            (
+                "scrapy.downloadermiddlewares.retry",
+                "DEBUG",
+                f"Retrying {request} (failed {expected_retry_times} times): "
+                f"{expected_reason}",
+            )
+        )
+
+    def test_reason_custom_exception_class(self):
+        request = Request('https://example.com')
+        spider = self.get_spider()
+        expected_reason = IgnoreRequest
+        expected_reason_string = 'scrapy.exceptions.IgnoreRequest'
+        with LogCapture() as log:
+            new_request = get_retry_request(
+                request,
+                spider=spider,
+                reason=expected_reason,
+            )
+        expected_retry_times = 1
+        stat = spider.crawler.stats.get_value(
+            f'retry/reason_count/{expected_reason_string}'
+        )
+        self.assertEqual(stat, 1)
+        log.check_present(
+            (
+                "scrapy.downloadermiddlewares.retry",
+                "DEBUG",
+                f"Retrying {request} (failed {expected_retry_times} times): "
+                f"{expected_reason}",
+            )
+        )
 
 
 if __name__ == "__main__":
     unittest.main()
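The four test_reason_* cases pin down the reason normalization added to retry.py above: a class is instantiated first, and an exception instance is then reduced to the dotted name that becomes the stats-key suffix. A standalone sketch of that normalization, assuming only that Scrapy is installed:

    from inspect import isclass

    from scrapy.utils.python import global_object_name

    reason = NotImplementedError  # an exception class, as in the tests above
    if isclass(reason):
        reason = reason()  # instantiate classes first
    if isinstance(reason, Exception):
        reason = global_object_name(reason.__class__)  # reduce to a dotted name
    assert reason == 'builtins.NotImplementedError'  # the expected stat suffix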