Refactor SpiderMiddlewareManager.scrape_response: move its nested helper
closures into private methods of the class.

The helpers used to capture response and spider (and, for the spider-input
step, scrape_func and request) from the enclosing scrape_response scope. As
methods they receive those values as explicit arguments. scrape_response keeps
two thin closures that bind response and spider so they can be registered as
the callback and errback of the deferred it returns.

diff --git a/scrapy/core/spidermw.py b/scrapy/core/spidermw.py
index 763e0cdf626100fbc6089511ca0f5e9a1554fec5..289292da7a3bc304eb2ff73829bab5d96774050c 100644
--- a/scrapy/core/spidermw.py
+++ b/scrapy/core/spidermw.py
@@ -41,86 +41,92 @@ class SpiderMiddlewareManager(MiddlewareManager):
         process_spider_exception = getattr(mw, 'process_spider_exception', None)
         self.methods['process_spider_exception'].appendleft(process_spider_exception)
 
-    def scrape_response(self, scrape_func, response, request, spider):
-
-        def process_spider_input(response):
-            for method in self.methods['process_spider_input']:
-                try:
-                    result = method(response=response, spider=spider)
-                    if result is not None:
-                        msg = (f"Middleware {_fname(method)} must return None "
-                               f"or raise an exception, got {type(result)}")
-                        raise _InvalidOutput(msg)
-                except _InvalidOutput:
-                    raise
-                except Exception:
-                    return scrape_func(Failure(), request, spider)
-            return scrape_func(response, request, spider)
-
-        def _evaluate_iterable(iterable, exception_processor_index, recover_to):
+    def _process_spider_input(self, scrape_func, response, request, spider):
+        for method in self.methods['process_spider_input']:
             try:
-                for r in iterable:
-                    yield r
-            except Exception as ex:
-                exception_result = process_spider_exception(Failure(ex), exception_processor_index)
-                if isinstance(exception_result, Failure):
-                    raise
-                recover_to.extend(exception_result)
-
-        def process_spider_exception(_failure, start_index=0):
-            exception = _failure.value
-            # don't handle _InvalidOutput exception
-            if isinstance(exception, _InvalidOutput):
-                return _failure
-            method_list = islice(self.methods['process_spider_exception'], start_index, None)
-            for method_index, method in enumerate(method_list, start=start_index):
-                if method is None:
-                    continue
-                result = method(response=response, exception=exception, spider=spider)
-                if _isiterable(result):
-                    # stop exception handling by handing control over to the
-                    # process_spider_output chain if an iterable has been returned
-                    return process_spider_output(result, method_index + 1)
-                elif result is None:
-                    continue
-                else:
+                result = method(response=response, spider=spider)
+                if result is not None:
                     msg = (f"Middleware {_fname(method)} must return None "
-                           f"or an iterable, got {type(result)}")
+                           f"or raise an exception, got {type(result)}")
                     raise _InvalidOutput(msg)
+            except _InvalidOutput:
+                raise
+            except Exception:
+                return scrape_func(Failure(), request, spider)
+        return scrape_func(response, request, spider)
+
+    def _evaluate_iterable(self, response, spider, iterable, exception_processor_index, recover_to):
+        try:
+            for r in iterable:
+                yield r
+        except Exception as ex:
+            exception_result = self._process_spider_exception(response, spider, Failure(ex),
+                                                              exception_processor_index)
+            if isinstance(exception_result, Failure):
+                raise
+            recover_to.extend(exception_result)
+
+    def _process_spider_exception(self, response, spider, _failure, start_index=0):
+        exception = _failure.value
+        # don't handle _InvalidOutput exception
+        if isinstance(exception, _InvalidOutput):
             return _failure
+        method_list = islice(self.methods['process_spider_exception'], start_index, None)
+        for method_index, method in enumerate(method_list, start=start_index):
+            if method is None:
+                continue
+            result = method(response=response, exception=exception, spider=spider)
+            if _isiterable(result):
+                # stop exception handling by handing control over to the
+                # process_spider_output chain if an iterable has been returned
+                return self._process_spider_output(response, spider, result, method_index + 1)
+            elif result is None:
+                continue
+            else:
+                msg = (f"Middleware {_fname(method)} must return None "
+                       f"or an iterable, got {type(result)}")
+                raise _InvalidOutput(msg)
+        return _failure
+
+    def _process_spider_output(self, response, spider, result, start_index=0):
+        # items in this iterable do not need to go through the process_spider_output
+        # chain, they went through it already from the process_spider_exception method
+        recovered = MutableChain()
+
+        method_list = islice(self.methods['process_spider_output'], start_index, None)
+        for method_index, method in enumerate(method_list, start=start_index):
+            if method is None:
+                continue
+            try:
+                # might fail directly if the output value is not a generator
+                result = method(response=response, result=result, spider=spider)
+            except Exception as ex:
+                exception_result = self._process_spider_exception(response, spider, Failure(ex), method_index + 1)
+                if isinstance(exception_result, Failure):
+                    raise
+                return exception_result
+            if _isiterable(result):
+                result = self._evaluate_iterable(response, spider, result, method_index + 1, recovered)
+            else:
+                msg = (f"Middleware {_fname(method)} must return an "
+                       f"iterable, got {type(result)}")
+                raise _InvalidOutput(msg)
 
-        def process_spider_output(result, start_index=0):
-            # items in this iterable do not need to go through the process_spider_output
-            # chain, they went through it already from the process_spider_exception method
-            recovered = MutableChain()
-
-            method_list = islice(self.methods['process_spider_output'], start_index, None)
-            for method_index, method in enumerate(method_list, start=start_index):
-                if method is None:
-                    continue
-                try:
-                    # might fail directly if the output value is not a generator
-                    result = method(response=response, result=result, spider=spider)
-                except Exception as ex:
-                    exception_result = process_spider_exception(Failure(ex), method_index + 1)
-                    if isinstance(exception_result, Failure):
-                        raise
-                    return exception_result
-                if _isiterable(result):
-                    result = _evaluate_iterable(result, method_index + 1, recovered)
-                else:
-                    msg = (f"Middleware {_fname(method)} must return an "
-                           f"iterable, got {type(result)}")
-                    raise _InvalidOutput(msg)
+        return MutableChain(result, recovered)
 
-            return MutableChain(result, recovered)
+    def _process_callback_output(self, response, spider, result):
+        recovered = MutableChain()
+        result = self._evaluate_iterable(response, spider, result, 0, recovered)
+        return MutableChain(self._process_spider_output(response, spider, result), recovered)
 
+    def scrape_response(self, scrape_func, response, request, spider):
         def process_callback_output(result):
-            recovered = MutableChain()
-            result = _evaluate_iterable(result, 0, recovered)
-            return MutableChain(process_spider_output(result), recovered)
+            return self._process_callback_output(response, spider, result)
+
+        def process_spider_exception(_failure):
+            return self._process_spider_exception(response, spider, _failure)
 
-        dfd = mustbe_deferred(process_spider_input, response)
+        dfd = mustbe_deferred(self._process_spider_input, scrape_func, response, request, spider)
         dfd.addCallbacks(callback=process_callback_output, errback=process_spider_exception)
         return dfd
 