Commit d7038b2a authored by Julia Medina

SpiderManager interface cleanup

Parent 39c6a80f
......@@ -77,8 +77,7 @@ how you :ref:`configure the downloader middlewares
   .. attribute:: spiders

-      The spider manager which takes care of loading and instantiating
-      spiders.
+      The spider manager which takes care of loading spiders.

       Most extensions won't need to access this attribute.
......@@ -300,6 +299,54 @@ Settings API
Alias for a :meth:`~freeze` call in the object returned by :meth:`copy`
+.. _topics-api-spidermanager:
+
+SpiderManager API
+=================
+
+.. module:: scrapy.spidermanager
+   :synopsis: The spider manager
+
+.. class:: SpiderManager
+
+   This class is in charge of retrieving and handling the spider classes
+   defined across the project.
+
+   Custom spider managers can be employed by specifying their path in the
+   :setting:`SPIDER_MANAGER_CLASS` project setting. They must fully implement
+   the :class:`scrapy.interfaces.ISpiderManager` interface to guarantee
+   errorless execution.
+
+   .. method:: from_settings(settings)
+
+      This class method is used by Scrapy to create an instance of the class.
+      It's called with the current project settings, and it loads the spiders
+      found in the modules of the :setting:`SPIDER_MODULES` setting.
+
+      :param settings: project settings
+      :type settings: :class:`~scrapy.settings.Settings` instance
+
+   .. method:: load(spider_name)
+
+      Get the Spider class with the given name. It looks up the spider class
+      named `spider_name` among the previously loaded spiders, and raises a
+      KeyError if it is not found.
+
+      :param spider_name: spider class name
+      :type spider_name: str
+
+   .. method:: list()
+
+      Get the names of the available spiders in the project.
+
+   .. method:: find_by_request(request)
+
+      List the names of the spiders that can handle the given request. It
+      tries to match the request's URL against the domains of the spiders.
+
+      :param request: queried request
+      :type request: :class:`~scrapy.http.Request` instance
.. _topics-api-signals:
Signals API
......
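The new section above documents four entry points (`from_settings`, `load`, `list`, `find_by_request`). A minimal sketch of how they fit together, assuming a project with `SPIDER_MODULES` configured; the `myproject.spiders` module and the `myspider` name are placeholders, not part of this commit:

    from scrapy.settings import Settings
    from scrapy.spidermanager import SpiderManager
    from scrapy.http import Request

    # Loading happens at construction time: every module listed in
    # SPIDER_MODULES is walked and its Spider classes are registered.
    settings = Settings({'SPIDER_MODULES': ['myproject.spiders']})
    manager = SpiderManager.from_settings(settings)

    print(manager.list())                 # names of all registered spiders
    spidercls = manager.load('myspider')  # the Spider *class*, not an instance
    names = manager.find_by_request(Request('http://example.com'))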
......@@ -768,6 +768,16 @@ A dict containing the scrapy contracts enabled by default in Scrapy. You should
never modify this setting in your project, modify :setting:`SPIDER_CONTRACTS`
instead. For more info see :ref:`topics-contracts`.
+.. setting:: SPIDER_MANAGER_CLASS
+
+SPIDER_MANAGER_CLASS
+--------------------
+
+Default: ``'scrapy.spidermanager.SpiderManager'``
+
+The class that will be used for handling spiders, which must implement the
+:ref:`topics-api-spidermanager`.
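To plug in a custom manager, a project would point this setting at its own class in ``settings.py``; the dotted path below is hypothetical:

    # settings.py (illustrative path, not part of this commit)
    SPIDER_MANAGER_CLASS = 'myproject.managers.CustomSpiderManager'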
.. setting:: SPIDER_MIDDLEWARES
SPIDER_MIDDLEWARES
......
......@@ -2,10 +2,12 @@ from zope.interface import Interface
class ISpiderManager(Interface):
-    def create(spider_name, **spider_args):
-        """Returns a new Spider instance for the given spider name, and using
-        the given spider arguments. If the spider name is not found, it must
-        raise a KeyError."""
+    def from_settings(settings):
+        """Returns an instance of the class for the given settings"""
+
+    def load(spider_name):
+        """Returns the Spider class for the given spider name. If the spider
+        name is not found, it must raise a KeyError."""

    def list():
        """Return a list with the names of all spiders available in the
......
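A custom manager must provide exactly these methods. A minimal conforming sketch, assuming the interface as shown above; the dict-backed registry is illustrative, not Scrapy code:

    from zope.interface import implementer
    from scrapy.interfaces import ISpiderManager

    @implementer(ISpiderManager)
    class DictSpiderManager(object):
        # Toy manager backed by a plain dict; how the registry gets
        # populated is left out of this sketch.

        def __init__(self, settings):
            self._spiders = {}  # name -> Spider class

        @classmethod
        def from_settings(cls, settings):
            return cls(settings)

        def load(self, spider_name):
            try:
                return self._spiders[spider_name]
            except KeyError:
                raise KeyError("Spider not found: {}".format(spider_name))

        def list(self):
            return list(self._spiders.keys())

        def find_by_request(self, request):
            return [name for name, cls in self._spiders.items()
                    if cls.handles_request(request)]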
......@@ -6,7 +6,6 @@ spiders
from zope.interface import implementer
import six
-from scrapy import signals
from scrapy.interfaces import ISpiderManager
from scrapy.utils.misc import walk_modules
from scrapy.utils.spider import iter_spider_classes
......@@ -15,8 +14,8 @@ from scrapy.utils.spider import iter_spider_classes
@implementer(ISpiderManager)
class SpiderManager(object):
-    def __init__(self, spider_modules):
-        self.spider_modules = spider_modules
+    def __init__(self, settings):
+        self.spider_modules = settings['SPIDER_MODULES']
        self._spiders = {}
        for name in self.spider_modules:
            for module in walk_modules(name):
......@@ -28,33 +27,17 @@ class SpiderManager(object):
    @classmethod
    def from_settings(cls, settings):
-        return cls(settings.getlist('SPIDER_MODULES'))
+        return cls(settings)

-    @classmethod
-    def from_crawler(cls, crawler):
-        sm = cls.from_settings(crawler.settings)
-        sm.crawler = crawler
-        crawler.signals.connect(sm.close_spider, signals.spider_closed)
-        return sm
-
-    def create(self, spider_name, **spider_kwargs):
+    def load(self, spider_name):
        try:
-            spcls = self._spiders[spider_name]
+            return self._spiders[spider_name]
        except KeyError:
-            raise KeyError("Spider not found: %s" % spider_name)
-        if hasattr(self, 'crawler') and hasattr(spcls, 'from_crawler'):
-            return spcls.from_crawler(self.crawler, **spider_kwargs)
-        else:
-            return spcls(**spider_kwargs)
+            raise KeyError("Spider not found: {}".format(spider_name))

    def find_by_request(self, request):
        return [name for name, cls in six.iteritems(self._spiders)
                if cls.handles_request(request)]

    def list(self):
-        return self._spiders.keys()
-
-    def close_spider(self, spider, reason):
-        closed = getattr(spider, 'closed', None)
-        if callable(closed):
-            return closed(reason)
+        return list(self._spiders.keys())
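With `create` gone, instantiation is now the caller's job: the manager only hands back the class. Roughly, the caller-side change looks like this (sketch only; `spiderman` is assumed to be a SpiderManager instance and the crawler wiring is simplified):

    # before: spider = spiderman.create('myspider', foo='bar')
    # after:
    spidercls = spiderman.load('myspider')  # returns the Spider class
    spider = spidercls(foo='bar')           # caller instantiates it
    # or spidercls.from_crawler(crawler, foo='bar') when a crawler is available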
......@@ -10,6 +10,7 @@ from twisted.trial import unittest
# alone
from scrapy.interfaces import ISpiderManager
from scrapy.spidermanager import SpiderManager
+from scrapy.settings import Settings
from scrapy.http import Request
module_dir = os.path.dirname(os.path.abspath(__file__))
......@@ -23,7 +24,8 @@ class SpiderManagerTest(unittest.TestCase):
        self.spiders_dir = os.path.join(self.tmpdir, 'test_spiders_xxx')
        shutil.copytree(orig_spiders_dir, self.spiders_dir)
        sys.path.append(self.tmpdir)
-        self.spiderman = SpiderManager(['test_spiders_xxx'])
+        settings = Settings({'SPIDER_MODULES': ['test_spiders_xxx']})
+        self.spiderman = SpiderManager.from_settings(settings)

    def tearDown(self):
        del self.spiderman
......@@ -35,14 +37,11 @@ class SpiderManagerTest(unittest.TestCase):
    def test_list(self):
        self.assertEqual(set(self.spiderman.list()),
-                         set(['spider1', 'spider2', 'spider3', 'spider4']))
+                         set(['spider1', 'spider2', 'spider3']))

-    def test_create(self):
-        spider1 = self.spiderman.create("spider1")
-        self.assertEqual(spider1.__class__.__name__, 'Spider1')
-        spider2 = self.spiderman.create("spider2", foo="bar")
-        self.assertEqual(spider2.__class__.__name__, 'Spider2')
-        self.assertEqual(spider2.foo, 'bar')
+    def test_load(self):
+        spider1 = self.spiderman.load("spider1")
+        self.assertEqual(spider1.__name__, 'Spider1')

    def test_find_by_request(self):
        self.assertEqual(self.spiderman.find_by_request(Request('http://scrapy1.org/test')),
......@@ -59,13 +58,13 @@ class SpiderManagerTest(unittest.TestCase):
                         ['spider3'])

    def test_load_spider_module(self):
-        self.spiderman = SpiderManager(['tests.test_spidermanager.test_spiders.spider1'])
+        module = 'tests.test_spidermanager.test_spiders.spider1'
+        settings = Settings({'SPIDER_MODULES': [module]})
+        self.spiderman = SpiderManager.from_settings(settings)
        assert len(self.spiderman._spiders) == 1

    def test_load_base_spider(self):
-        self.spiderman = SpiderManager(['tests.test_spidermanager.test_spiders.spider0'])
+        module = 'tests.test_spidermanager.test_spiders.spider0'
+        settings = Settings({'SPIDER_MODULES': [module]})
+        self.spiderman = SpiderManager.from_settings(settings)
        assert len(self.spiderman._spiders) == 0

-    def test_load_from_crawler(self):
-        spider = self.spiderman.create('spider4', a='OK')
-        self.assertEqual(spider.a, 'OK')
-from scrapy.spider import Spider
-
-class Spider4(Spider):
-    name = "spider4"
-
-    @classmethod
-    def from_crawler(cls, crawler, **kwargs):
-        o = cls(**kwargs)
-        o.crawler = crawler
-        return o