Commit e96b7782 authored by E Eugenio Lacuesta

Merge branch 'master' into process_spider_exception_generator_experiment

......@@ -5,15 +5,4 @@ omit =
tests/*
scrapy/xlib/*
scrapy/conf.py
scrapy/stats.py
scrapy/project.py
scrapy/utils/decorator.py
scrapy/statscol.py
scrapy/squeue.py
scrapy/log.py
scrapy/dupefilter.py
scrapy/command.py
scrapy/linkextractor.py
scrapy/spider.py
scrapy/contrib/*
scrapy/contrib_exp/*
......@@ -21,12 +21,16 @@ matrix:
env: TOXENV=py35
- python: 3.6
env: TOXENV=py36
- python: 3.7
env: TOXENV=py37
dist: xenial
sudo: true
- python: 3.6
env: TOXENV=docs
install:
- |
if [ "$TOXENV" = "pypy" ]; then
export PYPY_VERSION="pypy-5.9-linux_x86_64-portable"
export PYPY_VERSION="pypy-6.0.0-linux_x86_64-portable"
wget "https://bitbucket.org/squeaky/portable-pypy/downloads/${PYPY_VERSION}.tar.bz2"
tar -jxf ${PYPY_VERSION}.tar.bz2
virtualenv --python="$PYPY_VERSION/bin/pypy" "$HOME/virtualenvs/$PYPY_VERSION"
......
......@@ -11,21 +11,12 @@ def _py_files(folder):
collect_ignore = [
# deprecated or moved modules
"scrapy/conf.py",
"scrapy/stats.py",
"scrapy/project.py",
"scrapy/utils/decorator.py",
"scrapy/statscol.py",
"scrapy/squeue.py",
"scrapy/log.py",
"scrapy/dupefilter.py",
"scrapy/command.py",
"scrapy/linkextractor.py",
"scrapy/spider.py",
# not a test, but looks like a test
"scrapy/utils/testsite.py",
] + _py_files("scrapy/contrib") + _py_files("scrapy/contrib_exp")
]
if (twisted_version.major, twisted_version.minor, twisted_version.micro) >= (15, 5, 0):
collect_ignore += _py_files("scrapy/xlib/tx")
......
......@@ -3,6 +3,31 @@
Release notes
=============
Scrapy 1.6.0 (unreleased)
-------------------------
Cleanups
~~~~~~~~
* Remove deprecated ``CrawlerSettings`` class.
* Remove deprecated ``Settings.overrides`` and ``Settings.defaults`` attributes.
Scrapy 1.5.1 (2018-07-12)
-------------------------
This is a maintenance release with important bug fixes, but no new features:
* ``O(N^2)`` gzip decompression issue which affected Python 3 and PyPy
is fixed (:issue:`3281`);
* skipping of TLS validation errors is improved (:issue:`3166`);
* Ctrl-C handling is fixed in Python 3.5+ (:issue:`3096`);
* testing fixes (:issue:`3092`, :issue:`3263`);
* documentation improvements (:issue:`3058`, :issue:`3059`, :issue:`3089`,
:issue:`3123`, :issue:`3127`, :issue:`3189`, :issue:`3224`, :issue:`3280`,
:issue:`3279`, :issue:`3201`, :issue:`3260`, :issue:`3284`, :issue:`3298`,
:issue:`3294`).
Scrapy 1.5.0 (2017-12-29)
-------------------------
......
......@@ -177,7 +177,7 @@ The feeds are stored on `Amazon S3`_.
* ``s3://mybucket/path/to/export.csv``
* ``s3://aws_key:aws_secret@mybucket/path/to/export.csv``
* Required external libraries: `botocore`_ or `boto`_
* Required external libraries: `botocore`_ (Python 2 and Python 3) or `boto`_ (Python 2 only)
The AWS credentials can be passed as user/password in the URI, or they can be
passed through the following settings:
......
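For illustration (not part of this diff), a minimal sketch of project settings
that use the URI form shown above; the bucket name and credentials are
placeholders:

    # settings.py (hypothetical project)
    FEED_FORMAT = 'csv'
    FEED_URI = 's3://aws_key:aws_secret@mybucket/path/to/export.csv'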
......@@ -335,17 +335,6 @@ See also: :ref:`faq-bfo-dfo` about tuning Scrapy for BFO or DFO.
other priority settings :setting:`REDIRECT_PRIORITY_ADJUST`
and :setting:`RETRY_PRIORITY_ADJUST`.
.. setting:: DEPTH_STATS
DEPTH_STATS
-----------
Default: ``True``
Scope: ``scrapy.spidermiddlewares.depth.DepthMiddleware``
Whether to collect maximum depth stats.
.. setting:: DEPTH_STATS_VERBOSE
DEPTH_STATS_VERBOSE
......
......@@ -135,6 +135,29 @@ item_dropped
to be dropped
:type exception: :exc:`~scrapy.exceptions.DropItem` exception
item_error
------------
.. signal:: item_error
.. function:: item_error(item, response, spider, failure)
Sent when a :ref:`topics-item-pipeline` generates an error (i.e. raises
an exception), except for the :exc:`~scrapy.exceptions.DropItem` exception.
This signal supports returning deferreds from its handlers.
:param item: the item that caused the error in the :ref:`topics-item-pipeline`
:type item: dict or :class:`~scrapy.item.Item` object
:param response: the response being processed when the exception was raised
:type response: :class:`~scrapy.http.Response` object
:param spider: the spider which raised the exception
:type spider: :class:`~scrapy.spiders.Spider` object
:param failure: the exception raised as a Twisted `Failure`_ object
:type failure: `Failure`_ object
spider_closed
-------------
......
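For illustration (not part of this patch), a handler for the new ``item_error``
signal can be connected like any other Scrapy signal; the extension below is
hypothetical:

    from scrapy import signals

    class ItemErrorLogger(object):
        """Hypothetical extension logging items that fail in a pipeline."""

        @classmethod
        def from_crawler(cls, crawler):
            ext = cls()
            crawler.signals.connect(ext.on_item_error, signal=signals.item_error)
            return ext

        def on_item_error(self, item, response, spider, failure):
            # failure is a twisted.python.failure.Failure wrapping the exception
            spider.logger.error("Pipeline error for %r: %s", item, failure.value)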
......@@ -213,7 +213,8 @@ DepthMiddleware
* :setting:`DEPTH_LIMIT` - The maximum depth that will be allowed to
crawl for any site. If zero, no limit will be imposed.
* :setting:`DEPTH_STATS` - Whether to collect depth stats.
* :setting:`DEPTH_STATS_VERBOSE` - Whether to collect the number of
requests for each depth.
* :setting:`DEPTH_PRIORITY` - Whether to prioritize the requests based on
their depth.
......
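A hedged sketch of how these DepthMiddleware settings might be combined in a
project's settings.py; the values are arbitrary examples, not recommendations:

    # settings.py
    DEPTH_LIMIT = 3             # do not follow links deeper than 3 levels
    DEPTH_STATS_VERBOSE = True  # collect request counts per depth level
    DEPTH_PRIORITY = 1          # positive values deprioritize deeper requests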
......@@ -2,9 +2,9 @@ Twisted>=13.1.0
lxml
pyOpenSSL
cssselect>=0.9
w3lib>=1.17.0
queuelib
w3lib>=1.17.0
six>=1.5.2
PyDispatcher>=2.0.5
service_identity
parsel>=1.4
service_identity
Twisted >= 17.9.0
Twisted>=17.9.0
lxml>=3.2.4
pyOpenSSL>=0.13.1
cssselect>=0.9
queuelib>=1.1.1
w3lib>=1.17.0
six>=1.5.2
PyDispatcher>=2.0.5
parsel>=1.4
service_identity
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.command` is deprecated, "
"use `scrapy.commands` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.commands import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.closespider` is deprecated, "
"use `scrapy.extensions.closespider` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.extensions.closespider import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.corestats` is deprecated, "
"use `scrapy.extensions.corestats` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.extensions.corestats import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.debug` is deprecated, "
"use `scrapy.extensions.debug` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.extensions.debug import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.downloadermiddleware.ajaxcrawl` is deprecated, "
"use `scrapy.downloadermiddlewares.ajaxcrawl` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.downloadermiddlewares.ajaxcrawl import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.downloadermiddleware.chunked` is deprecated, "
"use `scrapy.downloadermiddlewares.chunked` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.downloadermiddlewares.chunked import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.downloadermiddleware.cookies` is deprecated, "
"use `scrapy.downloadermiddlewares.cookies` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.downloadermiddlewares.cookies import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.downloadermiddleware.decompression` is deprecated, "
"use `scrapy.downloadermiddlewares.decompression` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.downloadermiddlewares.decompression import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.downloadermiddleware.defaultheaders` is deprecated, "
"use `scrapy.downloadermiddlewares.defaultheaders` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.downloadermiddlewares.defaultheaders import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.downloadermiddleware.downloadtimeout` is deprecated, "
"use `scrapy.downloadermiddlewares.downloadtimeout` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.downloadermiddlewares.downloadtimeout import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.downloadermiddleware.httpauth` is deprecated, "
"use `scrapy.downloadermiddlewares.httpauth` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.downloadermiddlewares.httpauth import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.downloadermiddleware.httpcache` is deprecated, "
"use `scrapy.downloadermiddlewares.httpcache` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.downloadermiddlewares.httpcache import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.downloadermiddleware.httpcompression` is deprecated, "
"use `scrapy.downloadermiddlewares.httpcompression` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.downloadermiddlewares.httpcompression import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.downloadermiddleware.httpproxy` is deprecated, "
"use `scrapy.downloadermiddlewares.httpproxy` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.downloadermiddlewares.httpproxy import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.downloadermiddleware.redirect` is deprecated, "
"use `scrapy.downloadermiddlewares.redirect` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.downloadermiddlewares.redirect import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.downloadermiddleware.retry` is deprecated, "
"use `scrapy.downloadermiddlewares.retry` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.downloadermiddlewares.retry import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.downloadermiddleware.robotstxt` is deprecated, "
"use `scrapy.downloadermiddlewares.robotstxt` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.downloadermiddlewares.robotstxt import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.downloadermiddleware.stats` is deprecated, "
"use `scrapy.downloadermiddlewares.stats` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.downloadermiddlewares.stats import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.downloadermiddleware.useragent` is deprecated, "
"use `scrapy.downloadermiddlewares.useragent` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.downloadermiddlewares.useragent import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.exporter` is deprecated, "
"use `scrapy.exporters` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.exporters import *
from scrapy.exporters import PythonItemExporter
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.feedexport` is deprecated, "
"use `scrapy.extensions.feedexport` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.extensions.feedexport import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.httpcache` is deprecated, "
"use `scrapy.extensions.httpcache` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.extensions.httpcache import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.linkextractors` is deprecated, "
"use `scrapy.linkextractors` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.linkextractors import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.linkextractors.htmlparser` is deprecated, "
"use `scrapy.linkextractors.htmlparser` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.linkextractors.htmlparser import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.linkextractors.lxmlhtml` is deprecated, "
"use `scrapy.linkextractors.lxmlhtml` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.linkextractors.lxmlhtml import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.linkextractors.regex` is deprecated, "
"use `scrapy.linkextractors.regex` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.linkextractors.regex import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.linkextractors.sgml` is deprecated, "
"use `scrapy.linkextractors.sgml` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.linkextractors.sgml import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.loader` is deprecated, "
"use `scrapy.loader` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.loader import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.loader.common` is deprecated, "
"use `scrapy.loader.common` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.loader.common import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.loader.processor` is deprecated, "
"use `scrapy.loader.processors` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.loader.processors import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.logstats` is deprecated, "
"use `scrapy.extensions.logstats` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.extensions.logstats import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.memdebug` is deprecated, "
"use `scrapy.extensions.memdebug` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.extensions.memdebug import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.memusage` is deprecated, "
"use `scrapy.extensions.memusage` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.extensions.memusage import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.pipeline` is deprecated, "
"use `scrapy.pipelines` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.pipelines import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.pipeline.files` is deprecated, "
"use `scrapy.pipelines.files` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.pipelines.files import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.pipeline.images` is deprecated, "
"use `scrapy.pipelines.images` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.pipelines.images import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.pipeline.media` is deprecated, "
"use `scrapy.pipelines.media` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.pipelines.media import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.spidermiddleware.depth` is deprecated, "
"use `scrapy.spidermiddlewares.depth` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.spidermiddlewares.depth import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.spidermiddleware.httperror` is deprecated, "
"use `scrapy.spidermiddlewares.httperror` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.spidermiddlewares.httperror import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.spidermiddleware.offsite` is deprecated, "
"use `scrapy.spidermiddlewares.offsite` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.spidermiddlewares.offsite import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.spidermiddleware.referer` is deprecated, "
"use `scrapy.spidermiddlewares.referer` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.spidermiddlewares.referer import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.spidermiddleware.urllength` is deprecated, "
"use `scrapy.spidermiddlewares.urllength` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.spidermiddlewares.urllength import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.spiders` is deprecated, "
"use `scrapy.spiders` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.spiders import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.spiders.crawl` is deprecated, "
"use `scrapy.spiders.crawl` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.spiders.crawl import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.spiders.feed` is deprecated, "
"use `scrapy.spiders.feed` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.spiders.feed import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.spiders.init` is deprecated, "
"use `scrapy.spiders.init` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.spiders.init import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.spiders.sitemap` is deprecated, "
"use `scrapy.spiders.sitemap` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.spiders.sitemap import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.spiderstate` is deprecated, "
"use `scrapy.extensions.spiderstate` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.extensions.spiderstate import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.statsmailer` is deprecated, "
"use `scrapy.extensions.statsmailer` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.extensions.statsmailer import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.throttle` is deprecated, "
"use `scrapy.extensions.throttle` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.extensions.throttle import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib_exp.downloadermiddleware.decompression` is deprecated, "
"use `scrapy.downloadermiddlewares.decompression` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.downloadermiddlewares.decompression import DecompressionMiddleware
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib_exp.iterators` is deprecated, use `scrapy.utils.iterators` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.utils.iterators import xmliter_lxml
......@@ -232,6 +232,9 @@ class Scraper(object):
logger.error('Error processing %(item)s', {'item': item},
exc_info=failure_to_exc_info(output),
extra={'spider': spider})
return self.signals.send_catch_log_deferred(
signal=signals.item_error, item=item, response=response,
spider=spider, failure=output)
else:
logkws = self.logformatter.scraped(output, response, spider)
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
......
import base64
from six.moves.urllib.parse import unquote, urlunparse
from six.moves.urllib.request import getproxies, proxy_bypass
from six.moves.urllib.parse import unquote
try:
from urllib2 import _parse_proxy
except ImportError:
from urllib.request import _parse_proxy
from six.moves.urllib.parse import urlunparse
from scrapy.utils.httpobj import urlparse_cached
from scrapy.exceptions import NotConfigured
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes
......@@ -17,8 +16,8 @@ class HttpProxyMiddleware(object):
def __init__(self, auth_encoding='latin-1'):
self.auth_encoding = auth_encoding
self.proxies = {}
for type, url in getproxies().items():
self.proxies[type] = self._get_proxy(url, type)
for type_, url in getproxies().items():
self.proxies[type_] = self._get_proxy(url, type_)
@classmethod
def from_crawler(cls, crawler):
......
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.dupefilter` is deprecated, "
"use `scrapy.dupefilters` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.dupefilters import *
......@@ -214,7 +214,8 @@ class CsvItemExporter(BaseItemExporter):
file,
line_buffering=False,
write_through=True,
encoding=self.encoding
encoding=self.encoding,
newline='' # Windows needs this https://github.com/scrapy/scrapy/issues/3034
) if six.PY3 else file
self.csv_writer = csv.writer(self.stream, **kwargs)
self._headers_not_written = True
......
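A small usage sketch of the exporter this hunk touches; the file name and item
contents are made up for illustration:

    from scrapy.exporters import CsvItemExporter

    with open('items.csv', 'wb') as f:  # the exporter expects a binary file
        exporter = CsvItemExporter(f)
        exporter.start_exporting()
        exporter.export_item({'name': 'Item 1 name', 'price': '100'})
        exporter.finish_exporting()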
......@@ -6,13 +6,15 @@ See documentation in docs/topics/telnetconsole.rst
import pprint
import logging
import traceback
from twisted.internet import protocol
try:
from twisted.conch import manhole, telnet
from twisted.conch.insults import insults
TWISTED_CONCH_AVAILABLE = True
except ImportError:
except (ImportError, SyntaxError):
_TWISTED_CONCH_TRACEBACK = traceback.format_exc()
TWISTED_CONCH_AVAILABLE = False
from scrapy.exceptions import NotConfigured
......@@ -40,7 +42,9 @@ class TelnetConsole(protocol.ServerFactory):
if not crawler.settings.getbool('TELNETCONSOLE_ENABLED'):
raise NotConfigured
if not TWISTED_CONCH_AVAILABLE:
raise NotConfigured
raise NotConfigured(
'TELNETCONSOLE_ENABLED setting is True but required twisted '
'modules failed to import:\n' + _TWISTED_CONCH_TRACEBACK)
self.crawler = crawler
self.noisy = False
self.portrange = [int(x) for x in crawler.settings.getlist('TELNETCONSOLE_PORT')]
......
......@@ -16,7 +16,3 @@ class ISpiderLoader(Interface):
def find_by_request(request):
"""Return the list of spiders names that can handle the given request"""
# ISpiderManager is deprecated, don't use it!
# An alias is kept for backwards compatibility.
ISpiderManager = ISpiderLoader
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.linkextractor` is deprecated, "
"use `scrapy.linkextractors` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.linkextractors import *
......@@ -41,7 +41,8 @@ IGNORED_EXTENSIONS = [
_re_type = type(re.compile("", 0))
_matches = lambda url, regexs: any(r.search(url) for r in regexs)
_is_valid_url = lambda url: url.split('://', 1)[0] in {'http', 'https', 'file'}
_is_valid_url = lambda url: url.split('://', 1)[0] in {'http', 'https', \
'file', 'ftp'}
class FilteringLinkExtractor(object):
......
"""
Obsolete module, kept for giving a meaningful error message when trying to
import.
"""
raise ImportError("""scrapy.project usage has become obsolete.
If you want to get the Scrapy crawler from your extension, middleware or
pipeline implement the `from_crawler` class method (or look up for extending
components that have already done it, such as spiders).
For example:
@classmethod
def from_crawler(cls, crawler):
return cls(crawler)""")
......@@ -6,7 +6,6 @@ from collections import MutableMapping
from importlib import import_module
from pprint import pformat
from scrapy.utils.deprecate import create_deprecated_class
from scrapy.exceptions import ScrapyDeprecationWarning
from . import default_settings
......@@ -405,30 +404,6 @@ class BaseSettings(MutableMapping):
else:
p.text(pformat(self.copy_to_dict()))
@property
def overrides(self):
warnings.warn("`Settings.overrides` attribute is deprecated and won't "
"be supported in Scrapy 0.26, use "
"`Settings.set(name, value, priority='cmdline')` instead",
category=ScrapyDeprecationWarning, stacklevel=2)
try:
o = self._overrides
except AttributeError:
self._overrides = o = _DictProxy(self, 'cmdline')
return o
@property
def defaults(self):
warnings.warn("`Settings.defaults` attribute is deprecated and won't "
"be supported in Scrapy 0.26, use "
"`Settings.set(name, value, priority='default')` instead",
category=ScrapyDeprecationWarning, stacklevel=2)
try:
o = self._defaults
except AttributeError:
self._defaults = o = _DictProxy(self, 'default')
return o
class _DictProxy(MutableMapping):
......@@ -479,29 +454,6 @@ class Settings(BaseSettings):
self.update(values, priority)
class CrawlerSettings(Settings):
def __init__(self, settings_module=None, **kw):
self.settings_module = settings_module
Settings.__init__(self, **kw)
def __getitem__(self, opt_name):
if opt_name in self.overrides:
return self.overrides[opt_name]
if self.settings_module and hasattr(self.settings_module, opt_name):
return getattr(self.settings_module, opt_name)
if opt_name in self.defaults:
return self.defaults[opt_name]
return Settings.__getitem__(self, opt_name)
def __str__(self):
return "<CrawlerSettings module=%r>" % self.settings_module
CrawlerSettings = create_deprecated_class(
'CrawlerSettings', CrawlerSettings,
new_class_path='scrapy.settings.Settings')
def iter_default_settings():
"""Return the default settings as an iterator of (name, value) tuples"""
for name in dir(default_settings):
......
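The removed ``overrides`` and ``defaults`` properties pointed users to
``Settings.set`` with an explicit priority; a hedged sketch of the replacement
(setting names and values are placeholders):

    from scrapy.settings import Settings

    settings = Settings()
    # was: settings.overrides['DOWNLOAD_TIMEOUT'] = '99'
    settings.set('DOWNLOAD_TIMEOUT', '99', priority='cmdline')
    # was: settings.defaults['RETRY_TIMES'] = 5
    settings.set('RETRY_TIMES', 5, priority='default')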
......@@ -55,7 +55,7 @@ DEFAULT_REQUEST_HEADERS = {
}
DEPTH_LIMIT = 0
DEPTH_STATS = True
DEPTH_STATS_VERBOSE = False
DEPTH_PRIORITY = 0
DNSCACHE_ENABLED = True
......
......@@ -17,6 +17,7 @@ response_received = object()
response_downloaded = object()
item_scraped = object()
item_dropped = object()
item_error = object()
# for backwards compatibility
stats_spider_opened = spider_opened
......
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.spider` is deprecated, "
"use `scrapy.spiders` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.spiders import *
"""
Backwards compatibility shim. Use scrapy.spiderloader instead.
"""
from scrapy.spiderloader import SpiderLoader
from scrapy.utils.deprecate import create_deprecated_class
SpiderManager = create_deprecated_class('SpiderManager', SpiderLoader)
......@@ -13,7 +13,7 @@ logger = logging.getLogger(__name__)
class DepthMiddleware(object):
def __init__(self, maxdepth, stats=None, verbose_stats=False, prio=1):
def __init__(self, maxdepth, stats, verbose_stats=False, prio=1):
self.maxdepth = maxdepth
self.stats = stats
self.verbose_stats = verbose_stats
......@@ -41,7 +41,7 @@ class DepthMiddleware(object):
extra={'spider': spider}
)
return False
elif self.stats:
else:
if self.verbose_stats:
self.stats.inc_value('request_depth_count/%s' % depth,
spider=spider)
......@@ -50,7 +50,7 @@ class DepthMiddleware(object):
return True
# base case (depth=0)
if self.stats and 'depth' not in response.meta:
if 'depth' not in response.meta:
response.meta['depth'] = 0
if self.verbose_stats:
self.stats.inc_value('request_depth_count/0', spider=spider)
......
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.squeue` is deprecated, "
"use `scrapy.squeues` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.squeues import *
"""
Obsolete module, kept for giving a meaningful error message when trying to
import.
"""
raise ImportError("scrapy.stats usage has become obsolete, use "
"`crawler.stats` attribute instead")
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.statscol` is deprecated, "
"use `scrapy.statscollectors` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.statscollectors import *
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.utils.decorator` is deprecated, "
"use `scrapy.utils.decorators` instead",
ScrapyDeprecationWarning, stacklevel=2)
from scrapy.utils.decorators import *
......@@ -124,26 +124,7 @@ def _clspath(cls, forced=None):
DEPRECATION_RULES = [
('scrapy.contrib_exp.downloadermiddleware.decompression.', 'scrapy.downloadermiddlewares.decompression.'),
('scrapy.contrib_exp.iterators.', 'scrapy.utils.iterators.'),
('scrapy.contrib.downloadermiddleware.', 'scrapy.downloadermiddlewares.'),
('scrapy.contrib.exporter.', 'scrapy.exporters.'),
('scrapy.contrib.linkextractors.', 'scrapy.linkextractors.'),
('scrapy.contrib.loader.processor.', 'scrapy.loader.processors.'),
('scrapy.contrib.loader.', 'scrapy.loader.'),
('scrapy.contrib.pipeline.', 'scrapy.pipelines.'),
('scrapy.contrib.spidermiddleware.', 'scrapy.spidermiddlewares.'),
('scrapy.contrib.spiders.', 'scrapy.spiders.'),
('scrapy.contrib.', 'scrapy.extensions.'),
('scrapy.command.', 'scrapy.commands.'),
('scrapy.dupefilter.', 'scrapy.dupefilters.'),
('scrapy.linkextractor.', 'scrapy.linkextractors.'),
('scrapy.telnet.', 'scrapy.extensions.telnet.'),
('scrapy.spider.', 'scrapy.spiders.'),
('scrapy.squeue.', 'scrapy.squeues.'),
('scrapy.statscol.', 'scrapy.statscollectors.'),
('scrapy.utils.decorator.', 'scrapy.utils.decorators.'),
('scrapy.spidermanager.SpiderManager', 'scrapy.spiderloader.SpiderLoader'),
]
......
......@@ -98,8 +98,9 @@ def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
"""
encoding = obj.encoding if isinstance(obj, TextResponse) else encoding or 'utf-8'
def _getrow(csv_r):
return [to_unicode(field, encoding) for field in next(csv_r)]
def row_to_unicode(row_):
return [to_unicode(field, encoding) for field in row_]
# Python 3 csv reader input object needs to return strings
if six.PY3:
......@@ -113,10 +114,14 @@ def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
csv_r = csv.reader(lines, **kwargs)
if not headers:
headers = _getrow(csv_r)
while True:
row = _getrow(csv_r)
try:
row = next(csv_r)
except StopIteration:
return
headers = row_to_unicode(row)
for row in csv_r:
row = row_to_unicode(row)
if len(row) != len(headers):
logger.warning("ignoring row %(csvlnum)d (length: %(csvrow)d, "
"should be: %(csvheader)d)",
......
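For context, a hedged sketch of how ``csviter`` is typically called; the
response body is a made-up two-row CSV:

    from scrapy.http import TextResponse
    from scrapy.utils.iterators import csviter

    response = TextResponse(url='http://example.com/data.csv',
                            body=b'id,name\n1,foo\n2,bar\n',
                            encoding='utf-8')
    for row in csviter(response, delimiter=',', headers=None):
        print(row)  # {'id': '1', 'name': 'foo'}, then {'id': '2', 'name': 'bar'}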
Twisted!=18.4.0
lxml!=4.2.2
\ No newline at end of file
......@@ -9,3 +9,9 @@ class ZeroDivisionErrorPipeline(object):
def process_item(self, item, spider):
return item
class ProcessWithZeroDivisionErrorPipiline(object):
def process_item(self, item, spider):
1/0
......@@ -10,4 +10,4 @@ brotlipy
testfixtures
# optional for shell wrapper tests
bpython
ipython
ipython<6.0
pytest==2.9.2
pytest==3.6.3
pytest-twisted
pytest-cov==2.2.1
pytest-cov==2.5.1
testfixtures
jmespath
leveldb
......
import logging
import os
import tempfile
import warnings
import unittest
......@@ -14,8 +13,9 @@ from scrapy.spiderloader import SpiderLoader
from scrapy.utils.log import configure_logging, get_scrapy_root_handler
from scrapy.utils.spider import DefaultSpider
from scrapy.utils.misc import load_object
from scrapy.utils.test import get_crawler
from scrapy.extensions.throttle import AutoThrottle
from scrapy.extensions import telnet
class BaseCrawlerTest(unittest.TestCase):
......@@ -100,6 +100,8 @@ class CrawlerLoggingTestCase(unittest.TestCase):
custom_settings = {
'LOG_LEVEL': 'INFO',
'LOG_FILE': log_file.name,
# disable telnet if not available to avoid an extra warning
'TELNETCONSOLE_ENABLED': telnet.TWISTED_CONCH_AVAILABLE,
}
configure_logging()
......
......@@ -74,6 +74,14 @@ class DictItemsSpider(TestSpider):
item_cls = dict
class ItemZeroDivisionErrorSpider(TestSpider):
custom_settings = {
"ITEM_PIPELINES": {
"tests.pipelines.ProcessWithZeroDivisionErrorPipiline": 300,
}
}
def start_test_site(debug=False):
root_dir = os.path.join(tests_datadir, "test_site")
r = static.File(root_dir)
......@@ -95,6 +103,7 @@ class CrawlerRun(object):
self.respplug = []
self.reqplug = []
self.reqdropped = []
self.itemerror = []
self.itemresp = []
self.signals_catched = {}
self.spider_class = spider_class
......@@ -112,6 +121,7 @@ class CrawlerRun(object):
self.crawler = get_crawler(self.spider_class)
self.crawler.signals.connect(self.item_scraped, signals.item_scraped)
self.crawler.signals.connect(self.item_error, signals.item_error)
self.crawler.signals.connect(self.request_scheduled, signals.request_scheduled)
self.crawler.signals.connect(self.request_dropped, signals.request_dropped)
self.crawler.signals.connect(self.response_downloaded, signals.response_downloaded)
......@@ -136,6 +146,9 @@ class CrawlerRun(object):
u = urlparse(url)
return u.path
def item_error(self, item, response, spider, failure):
self.itemerror.append((item, response, spider, failure))
def item_scraped(self, item, spider, response):
self.itemresp.append((item, response))
......@@ -175,6 +188,10 @@ class EngineTest(unittest.TestCase):
self._assert_scheduled_requests(urls_to_visit=7)
self._assert_dropped_requests()
self.run = CrawlerRun(ItemZeroDivisionErrorSpider)
yield self.run.run()
self._assert_items_error()
def _assert_visited_urls(self):
must_be_visited = ["/", "/redirect", "/redirected",
"/item1.html", "/item2.html", "/item999.html"]
......@@ -209,6 +226,20 @@ class EngineTest(unittest.TestCase):
if self.run.getpath(response.url) == '/redirect':
self.assertEqual(302, response.status)
def _assert_items_error(self):
self.assertEqual(2, len(self.run.itemerror))
for item, response, spider, failure in self.run.itemerror:
self.assertEqual(failure.value.__class__, ZeroDivisionError)
self.assertEqual(spider, self.run.spider)
self.assertEqual(item['url'], response.url)
if 'item1.html' in item['url']:
self.assertEqual('Item 1 name', item['name'])
self.assertEqual('100', item['price'])
if 'item2.html' in item['url']:
self.assertEqual('Item 2 name', item['name'])
self.assertEqual('200', item['price'])
def _assert_scraped_items(self):
self.assertEqual(2, len(self.run.itemresp))
for item, response in self.run.itemresp:
......
......@@ -451,6 +451,17 @@ class Base:
Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
])
def test_ftp_links(self):
body = b"""
<html><body>
<div><a href="ftp://www.external.com/">An Item</a></div>
</body></html>"""
response = HtmlResponse("http://www.example.com/index.html", body=body, encoding='utf8')
lx = self.extractor_cls()
self.assertEqual(lx.extract_links(response), [
Link(url='ftp://www.external.com/', text=u'An Item', fragment='', nofollow=False),
])
class LxmlLinkExtractorTestCase(Base.LinkExtractorTestCase):
extractor_cls = LxmlLinkExtractor
......@@ -471,4 +482,3 @@ class LxmlLinkExtractorTestCase(Base.LinkExtractorTestCase):
@pytest.mark.xfail
def test_restrict_xpaths_with_html_entities(self):
super(LxmlLinkExtractorTestCase, self).test_restrict_xpaths_with_html_entities()
import os
import io
import hashlib
import random
import warnings
from tempfile import mkdtemp, TemporaryFile
from tempfile import mkdtemp
from shutil import rmtree
from twisted.trial import unittest
......@@ -401,8 +401,9 @@ class ImagesPipelineTestCaseCustomSettings(unittest.TestCase):
self.assertEqual(getattr(pipeline_cls, pipe_attr.lower()),
expected_value)
def _create_image(format, *a, **kw):
buf = TemporaryFile()
buf = io.BytesIO()
Image.new(*a, **kw).save(buf, format)
buf.seek(0)
return Image.open(buf)
......
......@@ -3,8 +3,7 @@ import unittest
import warnings
from scrapy.settings import (BaseSettings, Settings, SettingsAttribute,
CrawlerSettings, SETTINGS_PRIORITIES,
get_settings_priority)
SETTINGS_PRIORITIES, get_settings_priority)
from tests import mock
from . import default_settings
......@@ -341,35 +340,6 @@ class BaseSettingsTest(unittest.TestCase):
self.assertTrue(frozencopy.frozen)
self.assertIsNot(frozencopy, self.settings)
def test_deprecated_attribute_overrides(self):
self.settings.set('BAR', 'fuz', priority='cmdline')
with warnings.catch_warnings(record=True) as w:
self.settings.overrides['BAR'] = 'foo'
self.assertIn("Settings.overrides", str(w[0].message))
self.assertEqual(self.settings.get('BAR'), 'foo')
self.assertEqual(self.settings.overrides.get('BAR'), 'foo')
self.assertIn('BAR', self.settings.overrides)
self.settings.overrides.update(BAR='bus')
self.assertEqual(self.settings.get('BAR'), 'bus')
self.assertEqual(self.settings.overrides.get('BAR'), 'bus')
self.settings.overrides.setdefault('BAR', 'fez')
self.assertEqual(self.settings.get('BAR'), 'bus')
self.settings.overrides.setdefault('FOO', 'fez')
self.assertEqual(self.settings.get('FOO'), 'fez')
self.assertEqual(self.settings.overrides.get('FOO'), 'fez')
def test_deprecated_attribute_defaults(self):
self.settings.set('BAR', 'fuz', priority='default')
with warnings.catch_warnings(record=True) as w:
self.settings.defaults['BAR'] = 'foo'
self.assertIn("Settings.defaults", str(w[0].message))
self.assertEqual(self.settings.get('BAR'), 'foo')
self.assertEqual(self.settings.defaults.get('BAR'), 'foo')
self.assertIn('BAR', self.settings.defaults)
class SettingsTest(unittest.TestCase):
......@@ -422,33 +392,5 @@ class SettingsTest(unittest.TestCase):
self.assertEqual(mydict['key'], 'val')
class CrawlerSettingsTest(unittest.TestCase):
def test_deprecated_crawlersettings(self):
def _get_settings(settings_dict=None):
settings_module = type('SettingsModuleMock', (object,), settings_dict or {})
return CrawlerSettings(settings_module)
with warnings.catch_warnings(record=True) as w:
settings = _get_settings()
self.assertIn("CrawlerSettings is deprecated", str(w[0].message))
# test_global_defaults
self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 180)
# test_defaults
settings.defaults['DOWNLOAD_TIMEOUT'] = '99'
self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 99)
# test_settings_module
settings = _get_settings({'DOWNLOAD_TIMEOUT': '3'})
self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 3)
# test_overrides
settings = _get_settings({'DOWNLOAD_TIMEOUT': '3'})
settings.overrides['DOWNLOAD_TIMEOUT'] = '15'
self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 15)
if __name__ == "__main__":
unittest.main()
......@@ -10,6 +10,7 @@ from twisted.python.failure import Failure
from scrapy.utils.log import (failure_to_exc_info, TopLevelFormatter,
LogCounterHandler, StreamLogger)
from scrapy.utils.test import get_crawler
from scrapy.extensions import telnet
class FailureToExcInfoTest(unittest.TestCase):
......@@ -65,10 +66,14 @@ class TopLevelFormatterTest(unittest.TestCase):
class LogCounterHandlerTest(unittest.TestCase):
def setUp(self):
settings = {'LOG_LEVEL': 'WARNING'}
if not telnet.TWISTED_CONCH_AVAILABLE:
# disable it to avoid the extra warning
settings['TELNETCONSOLE_ENABLED'] = False
self.logger = logging.getLogger('test')
self.logger.setLevel(logging.NOTSET)
self.logger.propagate = False
self.crawler = get_crawler(settings_dict={'LOG_LEVEL': 'WARNING'})
self.crawler = get_crawler(settings_dict=settings)
self.handler = LogCounterHandler(self.crawler)
self.logger.addHandler(self.handler)
......
......@@ -9,13 +9,13 @@ envlist = py27
[testenv]
deps =
-ctests/constraints.txt
-rrequirements.txt
-rrequirements-py2.txt
# Extras
botocore
google-cloud-storage
Pillow != 3.0.0
leveldb
-rtests/requirements.txt
-rtests/requirements-py2.txt
passenv =
S3_TEST_FILE_URI
AWS_ACCESS_KEY_ID
......@@ -35,7 +35,7 @@ deps =
Pillow==2.3.0
cssselect==0.9.1
zope.interface==4.0.5
-rtests/requirements.txt
-rtests/requirements-py2.txt
[testenv:jessie]
# https://packages.debian.org/en/jessie/python/
......@@ -50,7 +50,7 @@ deps =
Pillow==2.6.1
cssselect==0.9.1
zope.interface==4.1.1
-rtests/requirements.txt
-rtests/requirements-py2.txt
[testenv:trunk]
basepython = python2.7
......@@ -67,6 +67,7 @@ commands =
[testenv:py34]
basepython = python3.4
deps =
-ctests/constraints.txt
-rrequirements-py3.txt
# Extras
Pillow
......@@ -80,6 +81,10 @@ deps = {[testenv:py34]deps}
basepython = python3.6
deps = {[testenv:py34]deps}
[testenv:py37]
basepython = python3.7
deps = {[testenv:py34]deps}
[testenv:pypy3]
basepython = pypy3
deps = {[testenv:py34]deps}
......