Commit d7038b2a authored by Julia Medina

SpiderManager interface cleanup

Parent 39c6a80f
......@@ -77,8 +77,7 @@ how you :ref:`configure the downloader middlewares
   .. attribute:: spiders

-      The spider manager which takes care of loading and instantiating
-      spiders.
+      The spider manager which takes care of loading spiders.

       Most extensions won't need to access this attribute.
......@@ -300,6 +299,54 @@ Settings API
Alias for a :meth:`~freeze` call in the object returned by :meth:`copy`
+.. _topics-api-spidermanager:
+
+SpiderManager API
+=================
+
+.. module:: scrapy.spidermanager
+   :synopsis: The spider manager
+
+.. class:: SpiderManager
+
+   This class is in charge of retrieving and handling the spider classes
+   defined across the project.
+
+   Custom spider managers can be employed by specifying their path in the
+   :setting:`SPIDER_MANAGER_CLASS` project setting. They must fully implement
+   the :class:`scrapy.interfaces.ISpiderManager` interface to guarantee
+   errorless execution.
+
+   .. method:: from_settings(settings)
+
+      This class method is used by Scrapy to create an instance of the class.
+      It's called with the current project settings, and it loads the spiders
+      found in the modules of the :setting:`SPIDER_MODULES` setting.
+
+      :param settings: project settings
+      :type settings: :class:`~scrapy.settings.Settings` instance
+
+   .. method:: load(spider_name)
+
+      Get the Spider class with the given name. It looks up the spider class
+      named `spider_name` among the previously loaded spiders, and raises a
+      KeyError if it is not found.
+
+      :param spider_name: spider class name
+      :type spider_name: str
+
+   .. method:: list()
+
+      Get the names of the available spiders in the project.
+
+   .. method:: find_by_request(request)
+
+      List the names of the spiders that can handle the given request. It
+      tries to match the request's URL against the domains of the spiders.
+
+      :param request: queried request
+      :type request: :class:`~scrapy.http.Request` instance
.. _topics-api-signals:
Signals API
......
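The new section above documents four entry points (`from_settings`, `load`, `list`, `find_by_request`). A minimal sketch of how they fit together, assuming a project with `SPIDER_MODULES` configured; the `myproject.spiders` module and the `myspider` name are placeholders, not part of this commit:

    from scrapy.settings import Settings
    from scrapy.spidermanager import SpiderManager
    from scrapy.http import Request

    # Loading happens at construction time: every module listed in
    # SPIDER_MODULES is walked and its Spider classes are registered.
    settings = Settings({'SPIDER_MODULES': ['myproject.spiders']})
    manager = SpiderManager.from_settings(settings)

    print(manager.list())                 # names of all registered spiders
    spidercls = manager.load('myspider')  # the Spider *class*, not an instance
    names = manager.find_by_request(Request('http://example.com'))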
......@@ -768,6 +768,16 @@ A dict containing the scrapy contracts enabled by default in Scrapy. You should
never modify this setting in your project, modify :setting:`SPIDER_CONTRACTS`
instead. For more info see :ref:`topics-contracts`.
+.. setting:: SPIDER_MANAGER_CLASS
+
+SPIDER_MANAGER_CLASS
+--------------------
+
+Default: ``'scrapy.spidermanager.SpiderManager'``
+
+The class that will be used for handling spiders, which must implement the
+:ref:`topics-api-spidermanager`.
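To plug in a custom manager, a project would point this setting at its own class in ``settings.py``; the dotted path below is hypothetical:

    # settings.py (illustrative path, not part of this commit)
    SPIDER_MANAGER_CLASS = 'myproject.managers.CustomSpiderManager'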
.. setting:: SPIDER_MIDDLEWARES
SPIDER_MIDDLEWARES
......
......@@ -2,10 +2,12 @@ from zope.interface import Interface
class ISpiderManager(Interface):
-    def create(spider_name, **spider_args):
-        """Returns a new Spider instance for the given spider name, and using
-        the given spider arguments. If the spider name is not found, it must
-        raise a KeyError."""
+    def from_settings(settings):
+        """Returns an instance of the class for the given settings"""
+
+    def load(spider_name):
+        """Returns the Spider class for the given spider name. If the spider
+        name is not found, it must raise a KeyError."""

    def list():
        """Return a list with the names of all spiders available in the
......
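A custom manager must provide exactly these methods. A minimal conforming sketch, assuming the interface as shown above; the dict-backed registry is illustrative, not Scrapy code:

    from zope.interface import implementer
    from scrapy.interfaces import ISpiderManager

    @implementer(ISpiderManager)
    class DictSpiderManager(object):
        # Toy manager backed by a plain dict; how the registry gets
        # populated is left out of this sketch.

        def __init__(self, settings):
            self._spiders = {}  # name -> Spider class

        @classmethod
        def from_settings(cls, settings):
            return cls(settings)

        def load(self, spider_name):
            try:
                return self._spiders[spider_name]
            except KeyError:
                raise KeyError("Spider not found: {}".format(spider_name))

        def list(self):
            return list(self._spiders.keys())

        def find_by_request(self, request):
            return [name for name, cls in self._spiders.items()
                    if cls.handles_request(request)]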
......@@ -6,7 +6,6 @@ spiders
from zope.interface import implementer
import six
-from scrapy import signals
from scrapy.interfaces import ISpiderManager
from scrapy.utils.misc import walk_modules
from scrapy.utils.spider import iter_spider_classes
......@@ -15,8 +14,8 @@ from scrapy.utils.spider import iter_spider_classes
@implementer(ISpiderManager)
class SpiderManager(object):
-    def __init__(self, spider_modules):
-        self.spider_modules = spider_modules
+    def __init__(self, settings):
+        self.spider_modules = settings['SPIDER_MODULES']
        self._spiders = {}
        for name in self.spider_modules:
            for module in walk_modules(name):
......@@ -28,33 +27,17 @@ class SpiderManager(object):
    @classmethod
    def from_settings(cls, settings):
-        return cls(settings.getlist('SPIDER_MODULES'))
+        return cls(settings)

-    @classmethod
-    def from_crawler(cls, crawler):
-        sm = cls.from_settings(crawler.settings)
-        sm.crawler = crawler
-        crawler.signals.connect(sm.close_spider, signals.spider_closed)
-        return sm
-
-    def create(self, spider_name, **spider_kwargs):
+    def load(self, spider_name):
        try:
-            spcls = self._spiders[spider_name]
+            return self._spiders[spider_name]
        except KeyError:
-            raise KeyError("Spider not found: %s" % spider_name)
-        if hasattr(self, 'crawler') and hasattr(spcls, 'from_crawler'):
-            return spcls.from_crawler(self.crawler, **spider_kwargs)
-        else:
-            return spcls(**spider_kwargs)
+            raise KeyError("Spider not found: {}".format(spider_name))

    def find_by_request(self, request):
        return [name for name, cls in six.iteritems(self._spiders)
                if cls.handles_request(request)]

    def list(self):
-        return self._spiders.keys()
-
-    def close_spider(self, spider, reason):
-        closed = getattr(spider, 'closed', None)
-        if callable(closed):
-            return closed(reason)
+        return list(self._spiders.keys())
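With `create` gone, instantiation is now the caller's job: the manager only hands back the class. Roughly, the caller-side change looks like this (sketch only; `spiderman` is assumed to be a SpiderManager instance and the crawler wiring is simplified):

    # before: spider = spiderman.create('myspider', foo='bar')
    # after:
    spidercls = spiderman.load('myspider')  # returns the Spider class
    spider = spidercls(foo='bar')           # caller instantiates it
    # or spidercls.from_crawler(crawler, foo='bar') when a crawler is available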
......@@ -10,6 +10,7 @@ from twisted.trial import unittest
# alone
from scrapy.interfaces import ISpiderManager
from scrapy.spidermanager import SpiderManager
+from scrapy.settings import Settings
from scrapy.http import Request
module_dir = os.path.dirname(os.path.abspath(__file__))
......@@ -23,7 +24,8 @@ class SpiderManagerTest(unittest.TestCase):
        self.spiders_dir = os.path.join(self.tmpdir, 'test_spiders_xxx')
        shutil.copytree(orig_spiders_dir, self.spiders_dir)
        sys.path.append(self.tmpdir)
-        self.spiderman = SpiderManager(['test_spiders_xxx'])
+        settings = Settings({'SPIDER_MODULES': ['test_spiders_xxx']})
+        self.spiderman = SpiderManager.from_settings(settings)

    def tearDown(self):
        del self.spiderman
......@@ -35,14 +37,11 @@ class SpiderManagerTest(unittest.TestCase):
    def test_list(self):
        self.assertEqual(set(self.spiderman.list()),
-                         set(['spider1', 'spider2', 'spider3', 'spider4']))
+                         set(['spider1', 'spider2', 'spider3']))

-    def test_create(self):
-        spider1 = self.spiderman.create("spider1")
-        self.assertEqual(spider1.__class__.__name__, 'Spider1')
-        spider2 = self.spiderman.create("spider2", foo="bar")
-        self.assertEqual(spider2.__class__.__name__, 'Spider2')
-        self.assertEqual(spider2.foo, 'bar')
+    def test_load(self):
+        spider1 = self.spiderman.load("spider1")
+        self.assertEqual(spider1.__name__, 'Spider1')

    def test_find_by_request(self):
        self.assertEqual(self.spiderman.find_by_request(Request('http://scrapy1.org/test')),
......@@ -59,13 +58,13 @@ class SpiderManagerTest(unittest.TestCase):
                         ['spider3'])

    def test_load_spider_module(self):
-        self.spiderman = SpiderManager(['tests.test_spidermanager.test_spiders.spider1'])
+        module = 'tests.test_spidermanager.test_spiders.spider1'
+        settings = Settings({'SPIDER_MODULES': [module]})
+        self.spiderman = SpiderManager.from_settings(settings)
        assert len(self.spiderman._spiders) == 1

    def test_load_base_spider(self):
-        self.spiderman = SpiderManager(['tests.test_spidermanager.test_spiders.spider0'])
+        module = 'tests.test_spidermanager.test_spiders.spider0'
+        settings = Settings({'SPIDER_MODULES': [module]})
+        self.spiderman = SpiderManager.from_settings(settings)
        assert len(self.spiderman._spiders) == 0

-    def test_load_from_crawler(self):
-        spider = self.spiderman.create('spider4', a='OK')
-        self.assertEqual(spider.a, 'OK')
-from scrapy.spider import Spider
-
-class Spider4(Spider):
-    name = "spider4"
-
-    @classmethod
-    def from_crawler(cls, crawler, **kwargs):
-        o = cls(**kwargs)
-        o.crawler = crawler
-        return o