Commit 2aa4f3cb (unverified)

Conditional request attribute binding for responses (#4632)

Authored by Eugenio Lacuesta on Aug 17, 2020; committed via GitHub on Aug 17, 2020.
Parent: acb3b443
Showing 5 changed files with 250 additions and 35 deletions (+250 -35):

    docs/topics/signals.rst                    +5    -0
    scrapy/core/engine.py                      +9    -4
    scrapy/core/scraper.py                     +31   -31
    setup.cfg                                  +3    -0
    tests/test_request_attribute_binding.py    +202  -0
docs/topics/signals.rst

@@ -423,6 +423,11 @@ response_received
     :param spider: the spider for which the response is intended
     :type spider: :class:`~scrapy.spiders.Spider` object
 
+.. note:: The ``request`` argument might not contain the original request that
+    reached the downloader, if a :ref:`topics-downloader-middleware` modifies
+    the :class:`~scrapy.http.Response` object and sets a specific ``request``
+    attribute.
+
 response_downloaded
 ~~~~~~~~~~~~~~~~~~~
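To make the documented behavior concrete, here is a minimal downloader-middleware sketch of the case the note describes. The class name and URL are illustrative only; the same pattern is exercised by ProcessResponseMiddleware in the new test module below.

    from scrapy import Request

    class RebindRequestMiddleware:
        # Hypothetical middleware: returning a response whose ``request``
        # attribute is set explicitly means the spider callback and the
        # response_received handlers receive this request, not the one that
        # actually reached the downloader.
        def process_response(self, request, response, spider):
            return response.replace(request=Request("https://example.org"))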
scrapy/core/engine.py

@@ -243,12 +243,17 @@ class ExecutionEngine:
                     % (type(response), response)
                 )
             if isinstance(response, Response):
-                response.request = request  # tie request to response received
-                logkws = self.logformatter.crawled(request, response, spider)
+                if response.request is None:
+                    response.request = request  # tie request to response received
+                logkws = self.logformatter.crawled(response.request, response, spider)
                 if logkws is not None:
                     logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
-                self.signals.send_catch_log(signals.response_received,
-                                            response=response, request=request, spider=spider)
+                self.signals.send_catch_log(
+                    signal=signals.response_received,
+                    response=response,
+                    request=response.request,
+                    spider=spider,
+                )
             return response
 
         def _on_complete(_):
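For signal consumers, the practical effect of this hunk is that response_received is now sent with request=response.request, so a middleware-substituted request is what handlers observe. A small sketch of such a handler, assuming a crawler wired up as in the tests below:

    from scrapy import signals

    def on_response_received(response, request, spider):
        # After this commit, ``request`` here is ``response.request``; if a
        # downloader middleware rebound the response to another request, that
        # replacement arrives in this handler and in the "Crawled" log line.
        spider.logger.debug("got %s bound to %s", response.url, request.url)

    # Wiring, e.g. from Spider.from_crawler():
    # crawler.signals.connect(on_response_received, signal=signals.response_received)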
scrapy/core/scraper.py

@@ -12,7 +12,7 @@ from scrapy import signals
 from scrapy.core.spidermw import SpiderMiddlewareManager
 from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
 from scrapy.http import Request, Response
-from scrapy.utils.defer import defer_result, defer_succeed, iter_errback, parallel
+from scrapy.utils.defer import defer_fail, defer_succeed, iter_errback, parallel
 from scrapy.utils.log import failure_to_exc_info, logformatter_adapter
 from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
 from scrapy.utils.spider import iterate_spider_output

@@ -120,40 +120,40 @@ class Scraper:
         response, request, deferred = slot.next_response_request_deferred()
         self._scrape(response, request, spider).chainDeferred(deferred)
 
-    def _scrape(self, response, request, spider):
-        """Handle the downloaded response or failure through the spider
-        callback/errback"""
-        if not isinstance(response, (Response, Failure)):
-            raise TypeError(
-                "Incorrect type: expected Response or Failure, got %s: %r"
-                % (type(response), response)
-            )
-        dfd = self._scrape2(response, request, spider)  # returns spider's processed output
-        dfd.addErrback(self.handle_spider_error, request, response, spider)
-        dfd.addCallback(self.handle_spider_output, request, response, spider)
+    def _scrape(self, result, request, spider):
+        """
+        Handle the downloaded response or failure through the spider callback/errback
+        """
+        if not isinstance(result, (Response, Failure)):
+            raise TypeError("Incorrect type: expected Response or Failure, got %s: %r" % (type(result), result))
+        dfd = self._scrape2(result, request, spider)  # returns spider's processed output
+        dfd.addErrback(self.handle_spider_error, request, result, spider)
+        dfd.addCallback(self.handle_spider_output, request, result, spider)
         return dfd
 
-    def _scrape2(self, request_result, request, spider):
-        """Handle the different cases of request's result been a Response or a
-        Failure"""
-        if not isinstance(request_result, Failure):
-            return self.spidermw.scrape_response(
-                self.call_spider, request_result, request, spider)
-        else:
-            dfd = self.call_spider(request_result, request, spider)
-            return dfd.addErrback(self._log_download_errors, request_result, request, spider)
+    def _scrape2(self, result, request, spider):
+        """
+        Handle the different cases of request's result been a Response or a Failure
+        """
+        if isinstance(result, Response):
+            return self.spidermw.scrape_response(self.call_spider, result, request, spider)
+        else:  # result is a Failure
+            dfd = self.call_spider(result, request, spider)
+            return dfd.addErrback(self._log_download_errors, result, request, spider)
 
     def call_spider(self, result, request, spider):
-        result.request = request
-        dfd = defer_result(result)
-        callback = request.callback or spider._parse
-        warn_on_generator_with_return_value(spider, callback)
-        warn_on_generator_with_return_value(spider, request.errback)
-        dfd.addCallbacks(callback=callback,
-                         errback=request.errback,
-                         callbackKeywords=request.cb_kwargs)
+        if isinstance(result, Response):
+            if getattr(result, "request", None) is None:
+                result.request = request
+            callback = result.request.callback or spider._parse
+            warn_on_generator_with_return_value(spider, callback)
+            dfd = defer_succeed(result)
+            dfd.addCallback(callback, **result.request.cb_kwargs)
+        else:  # result is a Failure
+            result.request = request
+            warn_on_generator_with_return_value(spider, request.errback)
+            dfd = defer_fail(result)
+            dfd.addErrback(request.errback)
         return dfd.addCallback(iterate_spider_output)
 
     def handle_spider_error(self, _failure, request, response, spider):
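Under the new call_spider flow, a Response is routed through result.request.callback (or spider._parse) with result.request.cb_kwargs, while a Failure is bound to the original request and routed through request.errback. A hedged sketch of a spider observing both branches; the spider itself is hypothetical, the Request API is standard Scrapy:

    from scrapy import Request, Spider

    class ErrbackDemoSpider(Spider):
        name = "errback_demo"  # hypothetical example spider

        def start_requests(self):
            yield Request("https://example.org",
                          callback=self.parse_ok, errback=self.on_error)

        def parse_ok(self, response):
            # Invoked via response.request.callback; cb_kwargs, if any, are
            # taken from response.request as well.
            self.logger.info("callback got %s", response.url)

        def on_error(self, failure):
            # call_spider sets failure.request before the errback fires.
            self.logger.info("errback got %s", failure.request.url)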
setup.cfg

@@ -130,6 +130,9 @@ ignore_errors = True
 [mypy-tests.test_pipelines]
 ignore_errors = True
 
+[mypy-tests.test_request_attribute_binding]
+ignore_errors = True
+
 [mypy-tests.test_request_cb_kwargs]
 ignore_errors = True
tests/test_request_attribute_binding.py (new file, mode 100644)

from twisted.internet import defer
from twisted.trial.unittest import TestCase

from scrapy import Request, signals
from scrapy.crawler import CrawlerRunner
from scrapy.http.response import Response

from testfixtures import LogCapture
from tests.mockserver import MockServer
from tests.spiders import SingleRequestSpider


OVERRIDEN_URL = "https://example.org"


class ProcessResponseMiddleware:
    def process_response(self, request, response, spider):
        return response.replace(request=Request(OVERRIDEN_URL))


class RaiseExceptionRequestMiddleware:
    def process_request(self, request, spider):
        1 / 0
        return request


class CatchExceptionOverrideRequestMiddleware:
    def process_exception(self, request, exception, spider):
        return Response(
            url="http://localhost/",
            body=b"Caught " + exception.__class__.__name__.encode("utf-8"),
            request=Request(OVERRIDEN_URL),
        )


class CatchExceptionDoNotOverrideRequestMiddleware:
    def process_exception(self, request, exception, spider):
        return Response(
            url="http://localhost/",
            body=b"Caught " + exception.__class__.__name__.encode("utf-8"),
        )


class AlternativeCallbacksSpider(SingleRequestSpider):
    name = "alternative_callbacks_spider"

    def alt_callback(self, response, foo=None):
        self.logger.info("alt_callback was invoked with foo=%s", foo)


class AlternativeCallbacksMiddleware:
    def process_response(self, request, response, spider):
        new_request = request.replace(
            url=OVERRIDEN_URL,
            callback=spider.alt_callback,
            cb_kwargs={"foo": "bar"},
        )
        return response.replace(request=new_request)


class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_response_200(self):
        url = self.mockserver.url("/status?n=200")
        crawler = CrawlerRunner().create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        response = crawler.spider.meta["responses"][0]
        self.assertEqual(response.request.url, url)

    @defer.inlineCallbacks
    def test_response_error(self):
        for status in ("404", "500"):
            url = self.mockserver.url("/status?n={}".format(status))
            crawler = CrawlerRunner().create_crawler(SingleRequestSpider)
            yield crawler.crawl(seed=url, mockserver=self.mockserver)
            failure = crawler.spider.meta["failure"]
            response = failure.value.response
            self.assertEqual(failure.request.url, url)
            self.assertEqual(response.request.url, url)

    @defer.inlineCallbacks
    def test_downloader_middleware_raise_exception(self):
        url = self.mockserver.url("/status?n=200")
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                __name__ + ".RaiseExceptionRequestMiddleware": 590,
            },
        })
        crawler = runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        failure = crawler.spider.meta["failure"]
        self.assertEqual(failure.request.url, url)
        self.assertIsInstance(failure.value, ZeroDivisionError)

    @defer.inlineCallbacks
    def test_downloader_middleware_override_request_in_process_response(self):
        """
        Downloader middleware which returns a response with an specific 'request' attribute.

        * The spider callback should receive the overriden response.request
        * Handlers listening to the response_received signal should receive the overriden response.request
        * The "crawled" log message should show the overriden response.request
        """
        signal_params = {}

        def signal_handler(response, request, spider):
            signal_params["response"] = response
            signal_params["request"] = request

        url = self.mockserver.url("/status?n=200")
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                __name__ + ".ProcessResponseMiddleware": 595,
            }
        })
        crawler = runner.create_crawler(SingleRequestSpider)
        crawler.signals.connect(signal_handler, signal=signals.response_received)

        with LogCapture() as log:
            yield crawler.crawl(seed=url, mockserver=self.mockserver)

        response = crawler.spider.meta["responses"][0]
        self.assertEqual(response.request.url, OVERRIDEN_URL)

        self.assertEqual(signal_params["response"].url, url)
        self.assertEqual(signal_params["request"].url, OVERRIDEN_URL)

        log.check_present(
            ("scrapy.core.engine", "DEBUG",
             "Crawled (200) <GET {}> (referer: None)".format(OVERRIDEN_URL)),
        )

    @defer.inlineCallbacks
    def test_downloader_middleware_override_in_process_exception(self):
        """
        An exception is raised but caught by the next middleware, which
        returns a Response with a specific 'request' attribute.

        The spider callback should receive the overriden response.request
        """
        url = self.mockserver.url("/status?n=200")
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                __name__ + ".RaiseExceptionRequestMiddleware": 590,
                __name__ + ".CatchExceptionOverrideRequestMiddleware": 595,
            },
        })
        crawler = runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)

        response = crawler.spider.meta["responses"][0]
        self.assertEqual(response.body, b"Caught ZeroDivisionError")
        self.assertEqual(response.request.url, OVERRIDEN_URL)

    @defer.inlineCallbacks
    def test_downloader_middleware_do_not_override_in_process_exception(self):
        """
        An exception is raised but caught by the next middleware, which
        returns a Response without a specific 'request' attribute.

        The spider callback should receive the original response.request
        """
        url = self.mockserver.url("/status?n=200")
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                __name__ + ".RaiseExceptionRequestMiddleware": 590,
                __name__ + ".CatchExceptionDoNotOverrideRequestMiddleware": 595,
            },
        })
        crawler = runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)

        response = crawler.spider.meta["responses"][0]
        self.assertEqual(response.body, b"Caught ZeroDivisionError")
        self.assertEqual(response.request.url, url)

    @defer.inlineCallbacks
    def test_downloader_middleware_alternative_callback(self):
        """
        Downloader middleware which returns a response with a
        specific 'request' attribute, with an alternative callback
        """
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                __name__ + ".AlternativeCallbacksMiddleware": 595,
            }
        })
        crawler = runner.create_crawler(AlternativeCallbacksSpider)

        with LogCapture() as log:
            url = self.mockserver.url("/status?n=200")
            yield crawler.crawl(seed=url, mockserver=self.mockserver)

        log.check_present(
            ("alternative_callbacks_spider", "INFO", "alt_callback was invoked with foo=bar"),
        )