Commit c7f82199 authored by Pablo Hoffman

- removed scrapy.conf singleton from scrapy.log, scrapy.responsetypes,
  scrapy.http.response.text, scrapy.selector
- fixed bug with scrapy.conf.settings backwards compatibility support
- added facility to notify (and provide some guidelines) about deprecated/obsolete settings
Parent 391cc060
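For illustration, the backwards-compatibility fix mentioned above means legacy code that imports the old singleton keeps working; a minimal sketch (assuming execute() has populated the settings, as the cmdline change below does):

# Sketch: the legacy access pattern this commit keeps working.
# scrapy.conf.settings is now assigned by scrapy.cmdline.execute()
# rather than acting as a true singleton.
from scrapy.conf import settings

print settings['BOT_NAME']  # resolved from the project settings, as before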
@@ -225,17 +225,6 @@ Default::
 The default headers used for Scrapy HTTP Requests. They're populated in the
 :class:`~scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware`.
-.. setting:: DEFAULT_RESPONSE_ENCODING
-
-DEFAULT_RESPONSE_ENCODING
--------------------------
-
-Default: ``'ascii'``
-
-The default encoding to use for :class:`~scrapy.http.TextResponse` objects (and
-subclasses) when no encoding is declared and no encoding could be inferred from
-the body.
 .. setting:: DEPTH_LIMIT

 DEPTH_LIMIT
@@ -827,7 +816,7 @@ the default value for this setting see: http://www.boutell.com/newfaq/misc/urlle
 USER_AGENT
 ----------

-Default: ``"Scrapy/0.15 (+http://scrapy.org)"``
+Default: ``"Scrapy/VERSION (+http://scrapy.org)"``

 The default User-Agent to use when crawling, unless overridden.
@@ -10,6 +10,7 @@ from scrapy.command import ScrapyCommand
 from scrapy.exceptions import UsageError
 from scrapy.utils.misc import walk_modules
 from scrapy.utils.project import inside_project, get_project_settings
+from scrapy.settings.deprecated import check_deprecated_settings

 def _iter_command_classes(module_name):
     # TODO: add `name` attribute to commands and merge this function with
@@ -80,10 +81,17 @@ def _run_print_help(parser, func, *a, **kw):
     parser.print_help()
     sys.exit(2)

-def execute(argv=None):
+def execute(argv=None, settings=None):
     if argv is None:
         argv = sys.argv
-    settings = get_project_settings()
+    if settings is None:
+        settings = get_project_settings()
+    check_deprecated_settings(settings)
+
+    # backwards compatibility to support scrapy.conf.settings
+    from scrapy import conf
+    conf.settings = settings
+
     crawler = CrawlerProcess(settings)
     crawler.install()
     inproject = inside_project()
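A sketch of what the new optional settings argument to execute() allows: driving the CLI programmatically with pre-built settings. The overrides line assumes the 0.x CrawlerSettings API, and 'example' is a hypothetical spider name:

from scrapy.cmdline import execute
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.overrides['LOG_LEVEL'] = 'DEBUG'  # assumes 0.x CrawlerSettings.overrides

# Passing settings skips get_project_settings() inside execute(); the
# deprecation check and the scrapy.conf shim above still run.
execute(['scrapy', 'crawl', 'example'], settings=settings)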
@@ -28,7 +28,7 @@ class ScrapyCommand(object):
     @property
     def crawler(self):
         if not log.started:
-            log.start()
+            log.start_from_settings(self.settings)
         self._crawler.configure()
         return self._crawler
-# This module is kept for backwards compatibility.
+# This module is kept for backwards compatibility, so users can import
+# scrapy.conf.settings and get the settings they expect
+#
+# TODO: Add deprecation warning once all scrapy.conf instances have been
+# removed from Scrapy codebase.

 from scrapy.utils.project import get_project_settings

 settings = get_project_settings()
@@ -54,6 +54,7 @@ class ExecutionEngine(object):
         self.crawler = crawler
         self.settings = crawler.settings
         self.signals = crawler.signals
+        self.logformatter = crawler.logformatter
         self.slots = {}
         self.running = False
         self.paused = False
@@ -193,7 +194,7 @@ class ExecutionEngine(object):
         assert isinstance(response, (Response, Request))
         if isinstance(response, Response):
             response.request = request # tie request to response received
-            logkws = log.formatter.crawled(request, response, spider)
+            logkws = self.logformatter.crawled(request, response, spider)
             log.msg(level=log.DEBUG, spider=spider, **logkws)
             self.signals.send_catch_log(signal=signals.response_received, \
                 response=response, request=request, spider=spider)
@@ -67,6 +67,7 @@ class Scraper(object):
         self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
         self.crawler = crawler
         self.signals = crawler.signals
+        self.logformatter = crawler.logformatter

     @defer.inlineCallbacks
     def open_spider(self, spider):
@@ -198,14 +199,14 @@ class Scraper(object):
         if isinstance(output, Failure):
             ex = output.value
             if isinstance(ex, DropItem):
-                logkws = log.formatter.dropped(item, ex, response, spider)
+                logkws = self.logformatter.dropped(item, ex, response, spider)
                 log.msg(level=log.WARNING, spider=spider, **logkws)
                 return self.signals.send_catch_log_deferred(signal=signals.item_dropped, \
                     item=item, spider=spider, exception=output.value)
             else:
                 log.err(output, 'Error processing %(item)s', item=item, spider=spider)
         else:
-            logkws = log.formatter.scraped(output, response, spider)
+            logkws = self.logformatter.scraped(output, response, spider)
             log.msg(level=log.DEBUG, spider=spider, **logkws)
             return self.signals.send_catch_log_deferred(signal=signals.item_scraped, \
                 item=output, response=response, spider=spider)
@@ -33,6 +33,8 @@ class Crawler(object):
         if self.configured:
             return
         self.configured = True
+        lf_cls = load_object(self.settings['LOG_FORMATTER'])
+        self.logformatter = lf_cls.from_crawler(self)
         self.extensions = ExtensionManager.from_crawler(self)
         spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
         self.spiders = spman_cls.from_crawler(self)
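With the formatter now loaded per crawler from the LOG_FORMATTER setting, swapping it becomes a one-line project setting; a sketch (the dotted path is hypothetical):

# settings.py (sketch; 'myproject.logformatter.QuietFormatter' is a
# hypothetical dotted path, not part of this commit)
LOG_FORMATTER = 'myproject.logformatter.QuietFormatter'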
@@ -9,12 +9,11 @@ from w3lib.encoding import html_to_unicode, resolve_encoding, \
     html_body_declared_encoding, http_content_type_encoding
 from scrapy.http.response import Response
 from scrapy.utils.python import memoizemethod_noargs
-from scrapy.conf import settings

 class TextResponse(Response):

-    _DEFAULT_ENCODING = settings['DEFAULT_RESPONSE_ENCODING']
+    _DEFAULT_ENCODING = 'ascii'

     def __init__(self, *args, **kwargs):
         self._encoding = kwargs.pop('encoding', None)
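A sketch of where the now hard-coded 'ascii' fallback applies: it is only used when no encoding is declared (constructor argument or headers) and none can be inferred from the body:

from scrapy.http import TextResponse

# A declared encoding always wins over the class-level default...
r1 = TextResponse("http://example.com/", body="hola",
                  headers={"Content-Type": "text/html; charset=utf-8"})

# ...the hard-coded TextResponse._DEFAULT_ENCODING ('ascii') is only the
# last resort when nothing is declared and nothing can be inferred.
r2 = TextResponse("http://example.com/", body="plain")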
@@ -10,9 +10,7 @@ import warnings
 from twisted.python import log

 import scrapy
-from scrapy.conf import settings
 from scrapy.utils.python import unicode_to_str
-from scrapy.utils.misc import load_object

 # Logging levels
 DEBUG = logging.DEBUG
@@ -93,36 +91,23 @@ def _adapt_eventdict(eventDict, log_level=INFO, encoding='utf-8', prepend_level=
     return ev

-def _get_log_level(level_name_or_id=None):
-    if level_name_or_id is None:
-        lvlname = settings['LOG_LEVEL']
-        return globals()[lvlname]
-    elif isinstance(level_name_or_id, int):
+def _get_log_level(level_name_or_id):
+    if isinstance(level_name_or_id, int):
         return level_name_or_id
     elif isinstance(level_name_or_id, basestring):
         return globals()[level_name_or_id]
     else:
         raise ValueError("Unknown log level: %r" % level_name_or_id)

-def start(logfile=None, loglevel=None, logstdout=None):
-    global started
-    if started or not settings.getbool('LOG_ENABLED'):
-        return
-    started = True
-    if log.defaultObserver: # check twisted log not already started
+def start(logfile=None, loglevel='INFO', logstdout=True, logencoding='utf-8'):
+    if log.defaultObserver or True: # check twisted log not already started
         loglevel = _get_log_level(loglevel)
-        logfile = logfile or settings['LOG_FILE']
         file = open(logfile, 'a') if logfile else sys.stderr
-        if logstdout is None:
-            logstdout = settings.getbool('LOG_STDOUT')
-        sflo = ScrapyFileLogObserver(file, loglevel, settings['LOG_ENCODING'])
+        sflo = ScrapyFileLogObserver(file, loglevel, logencoding)
         _oldshowwarning = warnings.showwarning
         log.startLoggingWithObserver(sflo.emit, setStdout=logstdout)
         # restore warnings, wrongly silenced by Twisted
         warnings.showwarning = _oldshowwarning
-        msg("Scrapy %s started (bot: %s)" % (scrapy.__version__, \
-            settings['BOT_NAME']))

 def msg(message=None, _level=INFO, **kw):
     kw['logLevel'] = kw.pop('level', _level)
@@ -137,4 +122,13 @@ def err(_stuff=None, _why=None, **kw):
     kw.setdefault('system', 'scrapy')
     log.err(_stuff, _why, **kw)

-formatter = load_object(settings['LOG_FORMATTER'])()
+def start_from_settings(settings):
+    global started
+    if started or not settings.getbool('LOG_ENABLED'):
+        return
+    started = True
+    start(settings['LOG_FILE'], settings['LOG_LEVEL'], settings['LOG_STDOUT'],
+        settings['LOG_ENCODING'])
+    msg("Scrapy %s started (bot: %s)" % (scrapy.__version__, \
+        settings['BOT_NAME']))
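After this split, start() takes only explicit arguments while start_from_settings() keeps the settings handling and the startup banner; a sketch of each entry point (the log file path is illustrative):

from scrapy import log

# Settings-free entry point: explicit arguments only (new signature above).
log.start(logfile='/tmp/scrapy.log', loglevel='DEBUG', logstdout=False)

# Settings-driven entry point: honors LOG_ENABLED, LOG_FILE, LOG_LEVEL,
# LOG_STDOUT and LOG_ENCODING, then logs the "Scrapy ... started" banner.
# log.start_from_settings(settings)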
@@ -37,3 +37,7 @@ class LogFormatter(object):
             'exception': exception,
             'item': item,
         }

+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls()
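The new from_crawler hook gives formatter subclasses access to the crawler (and thus the settings) at construction time; a minimal sketch (VerboseFormatter and its use of BOT_NAME are hypothetical):

from scrapy.logformatter import LogFormatter

class VerboseFormatter(LogFormatter):
    """Hypothetical formatter that remembers the bot name at construction."""

    def __init__(self, botname):
        self.botname = botname

    @classmethod
    def from_crawler(cls, crawler):
        # Unlike the default implementation above, read a setting off the crawler.
        return cls(crawler.settings['BOT_NAME'])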
@@ -11,7 +11,6 @@ from cStringIO import StringIO
 from scrapy.http import Response
 from scrapy.utils.misc import load_object
 from scrapy.utils.python import isbinarytext
-from scrapy.conf import settings

 class ResponseTypes(object):
@@ -31,7 +30,6 @@ class ResponseTypes(object):
     }

     def __init__(self):
-        self.CLASSES.update(settings.get('RESPONSE_CLASSES', {}))
         self.classes = {}
         self.mimetypes = MimeTypes()
         mimedata = get_data('scrapy', 'mime.types')
"""
XPath selectors
To select the backend explicitly use the SELECTORS_BACKEND variable in your
project settings.
To select the backend explicitly use the SCRAPY_SELECTORS_BACKEND environment
variable.
Two backends are currently available: lxml (default) and libxml2.
"""
from scrapy.conf import settings
import os
if settings['SELECTORS_BACKEND'] == 'lxml':
from scrapy.selector.lxmlsel import *
elif settings['SELECTORS_BACKEND'] == 'libxml2':
backend = os.environ.get('SCRAPY_SELECTORS_BACKEND')
if backend == 'libxml2':
from scrapy.selector.libxml2sel import *
elif settings['SELECTORS_BACKEND'] == 'dummy':
from scrapy.selector.dummysel import *
elif backend == 'lxml':
from scrapy.selector.lxmlsel import *
else:
try:
import lxml
except ImportError:
try:
import libxml2
except ImportError:
from scrapy.selector.dummysel import *
else:
from scrapy.selector.libxml2sel import *
import libxml2
from scrapy.selector.libxml2sel import *
else:
from scrapy.selector.lxmlsel import *
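Since backend selection now happens at import time from the environment, the variable must be set before scrapy.selector is first imported; a sketch:

import os

# Must run before the first `import scrapy.selector`, because the backend
# is chosen at module import time (see above).
os.environ['SCRAPY_SELECTORS_BACKEND'] = 'libxml2'

from scrapy.selector import HtmlXPathSelector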
"""
Dummy selectors
"""
from .list import XPathSelectorList as XPathSelectorList
__all__ = ['HtmlXPathSelector', 'XmlXPathSelector', 'XPathSelector', \
'XPathSelectorList']
class XPathSelector(object):
def __init__(self, *a, **kw):
pass
def _raise(self, *a, **kw):
raise RuntimeError("No selectors backend available. " \
"Please install libxml2 or lxml")
select = re = extract = register_namespace = __nonzero__ = _raise
XmlXPathSelector = XPathSelector
HtmlXPathSelector = XPathSelector
@@ -41,8 +41,6 @@ DEFAULT_REQUEST_HEADERS = {
     'Accept-Language': 'en',
 }

-DEFAULT_RESPONSE_ENCODING = 'ascii'
-
 DEPTH_LIMIT = 0
 DEPTH_STATS = True
 DEPTH_PRIORITY = 0
@@ -197,8 +195,6 @@ SCHEDULER = 'scrapy.core.scheduler.Scheduler'
 SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleLifoDiskQueue'
 SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.LifoMemoryQueue'

-SELECTORS_BACKEND = None # possible values: libxml2, lxml
-
 SPIDER_MANAGER_CLASS = 'scrapy.spidermanager.SpiderManager'
 SPIDER_MIDDLEWARES = {}
@@ -224,7 +220,7 @@ TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates'))
 URLLENGTH_LIMIT = 2083

-USER_AGENT = 'Scrapy/0.15 (+http://scrapy.org)'
+USER_AGENT = 'Scrapy/%s (+http://scrapy.org)' % __import__('scrapy').__version__

 TELNETCONSOLE_ENABLED = 1
 TELNETCONSOLE_PORT = [6023, 6073]
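The effect of the USER_AGENT change is that the default header now tracks the installed package version instead of a hard-coded release; e.g.:

import scrapy

# Reproduces the new default; the exact version depends on the install.
print 'Scrapy/%s (+http://scrapy.org)' % scrapy.__version__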
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning

DEPRECATED_SETTINGS = [
    ('TRACK_REFS', 'no longer needed (trackref is always enabled)'),
    ('RESPONSE_CLASSES', 'no longer supported'),
    ('DEFAULT_RESPONSE_ENCODING', 'no longer supported'),
    ('BOT_VERSION', 'no longer used (user agent defaults to Scrapy now)'),
    ('ENCODING_ALIASES', 'no longer needed (encoding discovery uses w3lib now)'),
    ('STATS_ENABLED', 'no longer supported (change STATS_CLASS instead)'),
    ('SQLITE_DB', 'no longer supported'),
    ('SELECTORS_BACKEND', 'use SCRAPY_SELECTORS_BACKEND environment variable instead'),
]

def check_deprecated_settings(settings):
    deprecated = [x for x in DEPRECATED_SETTINGS if settings[x[0]] is not None]
    if deprecated:
        msg = "You are using the following settings which are deprecated or obsolete"
        msg += " (ask scrapy-users@googlegroups.com for alternatives):"
        msg = msg + "\n    " + "\n    ".join("%s: %s" % x for x in deprecated)
        warnings.warn(msg, ScrapyDeprecationWarning)
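A sketch of the new facility in action: a project that still defines one of the listed settings gets a ScrapyDeprecationWarning naming the setting and the suggested alternative (the CrawlerSettings usage assumes the 0.x settings API; the override value is illustrative):

import warnings
from scrapy.settings import CrawlerSettings
from scrapy.settings.deprecated import check_deprecated_settings

settings = CrawlerSettings()
settings.overrides['STATS_ENABLED'] = True  # deprecated per the list above

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    check_deprecated_settings(settings)

# caught[0].message should mention:
#   STATS_ENABLED: no longer supported (change STATS_CLASS instead)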
@@ -11,7 +11,6 @@ class LogTest(unittest.TestCase):

     def test_get_log_level(self):
-        default_log_level = getattr(log, default_settings.LOG_LEVEL)
-        self.assertEqual(log._get_log_level(), default_log_level)
         self.assertEqual(log._get_log_level('WARNING'), log.WARNING)
         self.assertEqual(log._get_log_level(log.WARNING), log.WARNING)
         self.assertRaises(ValueError, log._get_log_level, object())
import unittest

from scrapy.http import TextResponse
from scrapy.selector.dummysel import XmlXPathSelector, HtmlXPathSelector, \
    XPathSelector

class XPathSelectorTestCase(unittest.TestCase):

    def test_raises(self):
        response = TextResponse(url="http://example.com", body='test')
        for cls in [XmlXPathSelector, HtmlXPathSelector, XPathSelector]:
            sel = cls(response)
            self.assertRaises(RuntimeError, sel.select, '//h2')
            self.assertRaises(RuntimeError, sel.re, 'lala')
            self.assertRaises(RuntimeError, sel.extract)
            self.assertRaises(RuntimeError, sel.register_namespace, 'a', 'b')
            self.assertRaises(RuntimeError, sel.__nonzero__, 'a', 'b')