提交 1e9b52c3 编写于 作者: A Andrey Rakhmatullin

Refactor SpiderMiddlewareManager.scrape_response.

上级 904a5013
......@@ -41,86 +41,92 @@ class SpiderMiddlewareManager(MiddlewareManager):
process_spider_exception = getattr(mw, 'process_spider_exception', None)
self.methods['process_spider_exception'].appendleft(process_spider_exception)
def scrape_response(self, scrape_func, response, request, spider):
def process_spider_input(response):
for method in self.methods['process_spider_input']:
try:
result = method(response=response, spider=spider)
if result is not None:
msg = (f"Middleware {_fname(method)} must return None "
f"or raise an exception, got {type(result)}")
raise _InvalidOutput(msg)
except _InvalidOutput:
raise
except Exception:
return scrape_func(Failure(), request, spider)
return scrape_func(response, request, spider)
def _evaluate_iterable(iterable, exception_processor_index, recover_to):
def _process_spider_input(self, scrape_func, response, request, spider):
for method in self.methods['process_spider_input']:
try:
for r in iterable:
yield r
except Exception as ex:
exception_result = process_spider_exception(Failure(ex), exception_processor_index)
if isinstance(exception_result, Failure):
raise
recover_to.extend(exception_result)
def process_spider_exception(_failure, start_index=0):
exception = _failure.value
# don't handle _InvalidOutput exception
if isinstance(exception, _InvalidOutput):
return _failure
method_list = islice(self.methods['process_spider_exception'], start_index, None)
for method_index, method in enumerate(method_list, start=start_index):
if method is None:
continue
result = method(response=response, exception=exception, spider=spider)
if _isiterable(result):
# stop exception handling by handing control over to the
# process_spider_output chain if an iterable has been returned
return process_spider_output(result, method_index + 1)
elif result is None:
continue
else:
result = method(response=response, spider=spider)
if result is not None:
msg = (f"Middleware {_fname(method)} must return None "
f"or an iterable, got {type(result)}")
f"or raise an exception, got {type(result)}")
raise _InvalidOutput(msg)
except _InvalidOutput:
raise
except Exception:
return scrape_func(Failure(), request, spider)
return scrape_func(response, request, spider)
def _evaluate_iterable(self, response, spider, iterable, exception_processor_index, recover_to):
    """Yield every item of ``iterable``, diverting a mid-iteration error to the exception chain.

    If iterating raises, the exception is wrapped in a ``Failure`` and handed to
    ``self._process_spider_exception`` together with ``response``, ``spider`` and
    ``exception_processor_index``. When that call returns a ``Failure`` the
    original exception is re-raised; any other return value is appended to
    ``recover_to`` through its ``extend`` method.
    """
    try:
        for item in iterable:
            yield item
    except Exception as ex:
        outcome = self._process_spider_exception(
            response, spider, Failure(ex), exception_processor_index)
        if isinstance(outcome, Failure):
            # the exception chain did not recover: propagate the original error
            raise
        recover_to.extend(outcome)
def _process_spider_exception(self, response, spider, _failure, start_index=0):
    """Run the registered ``process_spider_exception`` methods from ``start_index`` on.

    Returns ``_failure`` unchanged when it wraps an ``_InvalidOutput`` or when
    no method produced a result (entries that are ``None`` are skipped, and a
    ``None`` return value moves on to the next method). When a method returns
    an iterable, that iterable is passed to ``self._process_spider_output``
    starting right after that method's index, and the result of that call is
    returned. Any other return value raises ``_InvalidOutput``.
    """
    exception = _failure.value
    # don't handle _InvalidOutput exception
    if isinstance(exception, _InvalidOutput):
        return _failure

    remaining = islice(self.methods['process_spider_exception'], start_index, None)
    for position, method in enumerate(remaining, start=start_index):
        if method is None:
            continue
        result = method(response=response, exception=exception, spider=spider)
        if _isiterable(result):
            # stop exception handling by handing control over to the
            # process_spider_output chain if an iterable has been returned
            return self._process_spider_output(response, spider, result, position + 1)
        if result is not None:
            msg = (f"Middleware {_fname(method)} must return None "
                   f"or an iterable, got {type(result)}")
            raise _InvalidOutput(msg)
    return _failure
def _process_spider_output(self, response, spider, result, start_index=0):
# items in this iterable do not need to go through the process_spider_output
# chain, they went through it already from the process_spider_exception method
recovered = MutableChain()
method_list = islice(self.methods['process_spider_output'], start_index, None)
for method_index, method in enumerate(method_list, start=start_index):
if method is None:
continue
try:
# might fail directly if the output value is not a generator
result = method(response=response, result=result, spider=spider)
except Exception as ex:
exception_result = self._process_spider_exception(response, spider, Failure(ex), method_index + 1)
if isinstance(exception_result, Failure):
raise
return exception_result
if _isiterable(result):
result = self._evaluate_iterable(response, spider, result, method_index + 1, recovered)
else:
msg = (f"Middleware {_fname(method)} must return an "
f"iterable, got {type(result)}")
raise _InvalidOutput(msg)
def process_spider_output(result, start_index=0):
# items in this iterable do not need to go through the process_spider_output
# chain, they went through it already from the process_spider_exception method
recovered = MutableChain()
method_list = islice(self.methods['process_spider_output'], start_index, None)
for method_index, method in enumerate(method_list, start=start_index):
if method is None:
continue
try:
# might fail directly if the output value is not a generator
result = method(response=response, result=result, spider=spider)
except Exception as ex:
exception_result = process_spider_exception(Failure(ex), method_index + 1)
if isinstance(exception_result, Failure):
raise
return exception_result
if _isiterable(result):
result = _evaluate_iterable(result, method_index + 1, recovered)
else:
msg = (f"Middleware {_fname(method)} must return an "
f"iterable, got {type(result)}")
raise _InvalidOutput(msg)
return MutableChain(result, recovered)
return MutableChain(result, recovered)
def _process_callback_output(self, response, spider, result):
    """Wrap ``result`` and feed it through the ``process_spider_output`` chain.

    ``result`` is first wrapped by ``self._evaluate_iterable`` with exception
    processing starting at index 0 and a fresh ``MutableChain`` as the
    recovery container. The wrapped iterable is then passed to
    ``self._process_spider_output``. Returns a ``MutableChain`` built from
    that output followed by the recovery container.
    """
    recovered = MutableChain()
    guarded = self._evaluate_iterable(response, spider, result, 0, recovered)
    processed = self._process_spider_output(response, spider, guarded)
    return MutableChain(processed, recovered)
def scrape_response(self, scrape_func, response, request, spider):
def process_callback_output(result):
recovered = MutableChain()
result = _evaluate_iterable(result, 0, recovered)
return MutableChain(process_spider_output(result), recovered)
return self._process_callback_output(response, spider, result)
def process_spider_exception(_failure):
return self._process_spider_exception(response, spider, _failure)
dfd = mustbe_deferred(process_spider_input, response)
dfd = mustbe_deferred(self._process_spider_input, scrape_func, response, request, spider)
dfd.addCallbacks(callback=process_callback_output, errback=process_spider_exception)
return dfd
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册