Unverified commit 2aa4f3cb authored by Eugenio Lacuesta, committed by GitHub

Conditional request attribute binding for responses (#4632)

Parent acb3b443
@@ -423,6 +423,11 @@ response_received
     :param spider: the spider for which the response is intended
     :type spider: :class:`~scrapy.spiders.Spider` object

+    .. note:: The ``request`` argument might not contain the original request that
+        reached the downloader, if a :ref:`topics-downloader-middleware` modifies
+        the :class:`~scrapy.http.Response` object and sets a specific ``request``
+        attribute.
+
 response_downloaded
 ~~~~~~~~~~~~~~~~~~~
......
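The scenario the note above describes can be produced by a downloader middleware like the following minimal sketch (the middleware name and URL are illustrative, not part of this commit; it mirrors the ProcessResponseMiddleware used in the new tests below):

from scrapy import Request


class OverrideRequestMiddleware:
    # A downloader middleware that binds a different request to the response.
    def process_response(self, request, response, spider):
        # With this commit, the engine keeps the request set here, so
        # response_received handlers and the spider callback both see it.
        return response.replace(request=Request("https://example.org"))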
@@ -243,12 +243,17 @@ class ExecutionEngine:
                     % (type(response), response)
                 )
             if isinstance(response, Response):
-                response.request = request  # tie request to response received
-                logkws = self.logformatter.crawled(request, response, spider)
+                if response.request is None:
+                    response.request = request
+                logkws = self.logformatter.crawled(response.request, response, spider)
                 if logkws is not None:
                     logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
-                self.signals.send_catch_log(signals.response_received,
-                    response=response, request=request, spider=spider)
+                self.signals.send_catch_log(
+                    signal=signals.response_received,
+                    response=response,
+                    request=response.request,
+                    spider=spider,
+                )
             return response

         def _on_complete(_):
......
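Because the engine now emits response_received with response.request, a handler connected to that signal observes whichever request ended up bound to the response. A small sketch of such a handler (the wiring line is illustrative; connect it wherever a crawler reference is available):

from scrapy import signals


def log_binding(response, request, spider):
    # After this change, the "request" argument is response.request, which may
    # be the request attached by a downloader middleware rather than the
    # original one that reached the downloader.
    spider.logger.info("response %s is bound to request %s", response.url, request.url)


# e.g. inside an extension's from_crawler():
#     crawler.signals.connect(log_binding, signal=signals.response_received)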
@@ -12,7 +12,7 @@ from scrapy import signals
 from scrapy.core.spidermw import SpiderMiddlewareManager
 from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
 from scrapy.http import Request, Response
-from scrapy.utils.defer import defer_result, defer_succeed, iter_errback, parallel
+from scrapy.utils.defer import defer_fail, defer_succeed, iter_errback, parallel
 from scrapy.utils.log import failure_to_exc_info, logformatter_adapter
 from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
 from scrapy.utils.spider import iterate_spider_output
@@ -120,40 +120,40 @@ class Scraper:
             response, request, deferred = slot.next_response_request_deferred()
             self._scrape(response, request, spider).chainDeferred(deferred)

-    def _scrape(self, response, request, spider):
-        """Handle the downloaded response or failure through the spider
-        callback/errback"""
-        if not isinstance(response, (Response, Failure)):
-            raise TypeError(
-                "Incorrect type: expected Response or Failure, got %s: %r"
-                % (type(response), response)
-            )
-        dfd = self._scrape2(response, request, spider)  # returns spider's processed output
-        dfd.addErrback(self.handle_spider_error, request, response, spider)
-        dfd.addCallback(self.handle_spider_output, request, response, spider)
+    def _scrape(self, result, request, spider):
+        """
+        Handle the downloaded response or failure through the spider callback/errback
+        """
+        if not isinstance(result, (Response, Failure)):
+            raise TypeError("Incorrect type: expected Response or Failure, got %s: %r" % (type(result), result))
+        dfd = self._scrape2(result, request, spider)  # returns spider's processed output
+        dfd.addErrback(self.handle_spider_error, request, result, spider)
+        dfd.addCallback(self.handle_spider_output, request, result, spider)
         return dfd

-    def _scrape2(self, request_result, request, spider):
-        """Handle the different cases of request's result been a Response or a
-        Failure"""
-        if not isinstance(request_result, Failure):
-            return self.spidermw.scrape_response(
-                self.call_spider, request_result, request, spider)
-        else:
-            dfd = self.call_spider(request_result, request, spider)
-            return dfd.addErrback(
-                self._log_download_errors, request_result, request, spider)
+    def _scrape2(self, result, request, spider):
+        """
+        Handle the different cases of the request's result being a Response or a Failure
+        """
+        if isinstance(result, Response):
+            return self.spidermw.scrape_response(self.call_spider, result, request, spider)
+        else:  # result is a Failure
+            dfd = self.call_spider(result, request, spider)
+            return dfd.addErrback(self._log_download_errors, result, request, spider)

     def call_spider(self, result, request, spider):
-        result.request = request
-        dfd = defer_result(result)
-        callback = request.callback or spider._parse
-        warn_on_generator_with_return_value(spider, callback)
-        warn_on_generator_with_return_value(spider, request.errback)
-        dfd.addCallbacks(callback=callback,
-                         errback=request.errback,
-                         callbackKeywords=request.cb_kwargs)
+        if isinstance(result, Response):
+            if getattr(result, "request", None) is None:
+                result.request = request
+            callback = result.request.callback or spider._parse
+            warn_on_generator_with_return_value(spider, callback)
+            dfd = defer_succeed(result)
+            dfd.addCallback(callback, **result.request.cb_kwargs)
+        else:  # result is a Failure
+            result.request = request
+            warn_on_generator_with_return_value(spider, request.errback)
+            dfd = defer_fail(result)
+            dfd.addErrback(request.errback)
         return dfd.addCallback(iterate_spider_output)

     def handle_spider_error(self, _failure, request, response, spider):
......
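In the rewritten call_spider, a Response is dispatched to the callback taken from result.request (falling back to the spider's _parse) together with its cb_kwargs, while a Failure goes straight to the request's errback. A minimal spider illustrating the callback/cb_kwargs path (spider and callback names are illustrative):

import scrapy


class CbKwargsExampleSpider(scrapy.Spider):
    name = "cb_kwargs_example"

    def start_requests(self):
        # cb_kwargs travel with the request; call_spider() forwards them to
        # whichever callback is bound to the response when it gets scraped.
        yield scrapy.Request(
            "https://example.org",
            callback=self.parse_page,
            cb_kwargs={"foo": "bar"},
        )

    def parse_page(self, response, foo=None):
        self.logger.info("parse_page called with foo=%s", foo)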
@@ -130,6 +130,9 @@ ignore_errors = True
 [mypy-tests.test_pipelines]
 ignore_errors = True

+[mypy-tests.test_request_attribute_binding]
+ignore_errors = True
+
 [mypy-tests.test_request_cb_kwargs]
 ignore_errors = True
......
from twisted.internet import defer
from twisted.trial.unittest import TestCase

from scrapy import Request, signals
from scrapy.crawler import CrawlerRunner
from scrapy.http.response import Response

from testfixtures import LogCapture
from tests.mockserver import MockServer
from tests.spiders import SingleRequestSpider


OVERRIDEN_URL = "https://example.org"


class ProcessResponseMiddleware:
    def process_response(self, request, response, spider):
        return response.replace(request=Request(OVERRIDEN_URL))


class RaiseExceptionRequestMiddleware:
    def process_request(self, request, spider):
        1 / 0  # deliberately raise ZeroDivisionError
        return request


class CatchExceptionOverrideRequestMiddleware:
    def process_exception(self, request, exception, spider):
        return Response(
            url="http://localhost/",
            body=b"Caught " + exception.__class__.__name__.encode("utf-8"),
            request=Request(OVERRIDEN_URL),
        )


class CatchExceptionDoNotOverrideRequestMiddleware:
    def process_exception(self, request, exception, spider):
        return Response(
            url="http://localhost/",
            body=b"Caught " + exception.__class__.__name__.encode("utf-8"),
        )


class AlternativeCallbacksSpider(SingleRequestSpider):
    name = "alternative_callbacks_spider"

    def alt_callback(self, response, foo=None):
        self.logger.info("alt_callback was invoked with foo=%s", foo)


class AlternativeCallbacksMiddleware:
    def process_response(self, request, response, spider):
        new_request = request.replace(
            url=OVERRIDEN_URL,
            callback=spider.alt_callback,
            cb_kwargs={"foo": "bar"},
        )
        return response.replace(request=new_request)


class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_response_200(self):
        url = self.mockserver.url("/status?n=200")
        crawler = CrawlerRunner().create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        response = crawler.spider.meta["responses"][0]
        self.assertEqual(response.request.url, url)

    @defer.inlineCallbacks
    def test_response_error(self):
        for status in ("404", "500"):
            url = self.mockserver.url("/status?n={}".format(status))
            crawler = CrawlerRunner().create_crawler(SingleRequestSpider)
            yield crawler.crawl(seed=url, mockserver=self.mockserver)
            failure = crawler.spider.meta["failure"]
            response = failure.value.response
            self.assertEqual(failure.request.url, url)
            self.assertEqual(response.request.url, url)

    @defer.inlineCallbacks
    def test_downloader_middleware_raise_exception(self):
        url = self.mockserver.url("/status?n=200")
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                __name__ + ".RaiseExceptionRequestMiddleware": 590,
            },
        })
        crawler = runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        failure = crawler.spider.meta["failure"]
        self.assertEqual(failure.request.url, url)
        self.assertIsInstance(failure.value, ZeroDivisionError)

    @defer.inlineCallbacks
    def test_downloader_middleware_override_request_in_process_response(self):
        """
        Downloader middleware which returns a response with a specific 'request' attribute.

        * The spider callback should receive the overridden response.request
        * Handlers listening to the response_received signal should receive the overridden response.request
        * The "crawled" log message should show the overridden response.request
        """
        signal_params = {}

        def signal_handler(response, request, spider):
            signal_params["response"] = response
            signal_params["request"] = request

        url = self.mockserver.url("/status?n=200")
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                __name__ + ".ProcessResponseMiddleware": 595,
            }
        })
        crawler = runner.create_crawler(SingleRequestSpider)
        crawler.signals.connect(signal_handler, signal=signals.response_received)

        with LogCapture() as log:
            yield crawler.crawl(seed=url, mockserver=self.mockserver)

        response = crawler.spider.meta["responses"][0]
        self.assertEqual(response.request.url, OVERRIDEN_URL)

        self.assertEqual(signal_params["response"].url, url)
        self.assertEqual(signal_params["request"].url, OVERRIDEN_URL)

        log.check_present(
            ("scrapy.core.engine", "DEBUG", "Crawled (200) <GET {}> (referer: None)".format(OVERRIDEN_URL)),
        )

    @defer.inlineCallbacks
    def test_downloader_middleware_override_in_process_exception(self):
        """
        An exception is raised but caught by the next middleware, which
        returns a Response with a specific 'request' attribute.

        The spider callback should receive the overridden response.request
        """
        url = self.mockserver.url("/status?n=200")
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                __name__ + ".RaiseExceptionRequestMiddleware": 590,
                __name__ + ".CatchExceptionOverrideRequestMiddleware": 595,
            },
        })
        crawler = runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        response = crawler.spider.meta["responses"][0]
        self.assertEqual(response.body, b"Caught ZeroDivisionError")
        self.assertEqual(response.request.url, OVERRIDEN_URL)

    @defer.inlineCallbacks
    def test_downloader_middleware_do_not_override_in_process_exception(self):
        """
        An exception is raised but caught by the next middleware, which
        returns a Response without a specific 'request' attribute.

        The spider callback should receive the original response.request
        """
        url = self.mockserver.url("/status?n=200")
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                __name__ + ".RaiseExceptionRequestMiddleware": 590,
                __name__ + ".CatchExceptionDoNotOverrideRequestMiddleware": 595,
            },
        })
        crawler = runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        response = crawler.spider.meta["responses"][0]
        self.assertEqual(response.body, b"Caught ZeroDivisionError")
        self.assertEqual(response.request.url, url)

    @defer.inlineCallbacks
    def test_downloader_middleware_alternative_callback(self):
        """
        Downloader middleware which returns a response with a
        specific 'request' attribute, with an alternative callback
        """
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                __name__ + ".AlternativeCallbacksMiddleware": 595,
            }
        })
        crawler = runner.create_crawler(AlternativeCallbacksSpider)

        with LogCapture() as log:
            url = self.mockserver.url("/status?n=200")
            yield crawler.crawl(seed=url, mockserver=self.mockserver)

        log.check_present(
            ("alternative_callbacks_spider", "INFO", "alt_callback was invoked with foo=bar"),
        )