Commit d4027356 authored by: J Julia Medina

CrawlerProcess cleanup changes

Parent 980e30a1
......@@ -105,6 +105,53 @@ how you :ref:`configure the downloader middlewares
Returns a deferred that is fired when the crawl is finished.
.. class:: CrawlerRunner(settings)

    This is a convenient helper class that creates, configures and runs
    crawlers inside an already set up Twisted `reactor`_.

    The CrawlerRunner object must be instantiated with a
    :class:`~scrapy.settings.Settings` object.

    This class shouldn't be needed (since Scrapy is responsible for using it
    accordingly) unless writing scripts that manually handle the crawling
    process. See :ref:`run-from-script` for an example.

    .. attribute:: crawlers

        Set of :class:`crawlers <scrapy.crawler.Crawler>` created by the
        :meth:`crawl` method.

    .. attribute:: crawl_deferreds

        Set of the `deferreds`_ returned by the :meth:`crawl` method. This
        collection is useful for keeping track of the current crawling state.

    .. method:: crawl(spidercls, \*args, \**kwargs)

        This method sets up the crawling of the given `spidercls` with the
        provided arguments.

        It takes care of loading the spider class while configuring and
        starting a crawler for it.

        Returns a deferred that is fired when the crawl is finished.

        :param spidercls: spider class or spider's name inside the project
        :type spidercls: :class:`~scrapy.spider.Spider` subclass or str

        :param args: arguments to initialize the spider
        :type args: list

        :param kwargs: keyword arguments to initialize the spider
        :type kwargs: dict

    .. method:: stop()

        Simultaneously stops all the crawling jobs taking place.

        Returns a deferred that is fired when they all have ended.
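
As a rough sketch of how these pieces fit together (illustrative only: it
assumes a Scrapy project that defines a spider named ``followall``, as in the
:ref:`run-from-script` examples, and the 60 second delay is arbitrary)::

    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.project import get_project_settings

    runner = CrawlerRunner(get_project_settings())

    # each crawl() call returns a deferred that fires when that crawl ends
    runner.crawl('followall', domain='scrapinghub.com')
    runner.crawl('followall', domain='insophia.com')

    # stop() returns a deferred that fires once every crawl has ended; here
    # it is used to shut everything down after an arbitrary 60 seconds
    reactor.callLater(60, lambda: runner.stop().addBoth(lambda _: reactor.stop()))
    reactor.run()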
.. _topics-api-settings:
Settings API
......@@ -470,3 +517,4 @@ class (which they all inherit from).
.. _deferreds: http://twistedmatrix.com/documents/current/core/howto/defer.html
.. _deferred: http://twistedmatrix.com/documents/current/core/howto/defer.html
.. _reactor: http://twistedmatrix.com/documents/current/core/howto/reactor-basics.html
......@@ -19,8 +19,9 @@ Remember that Scrapy is built on top of the Twisted
asynchronous networking library, so you need to run it inside the Twisted reactor.
Note that you will also have to shut down the Twisted reactor yourself after the
spider is finished. This can be achieved by connecting a handler to the
``signals.spider_closed`` signal.
spider is finished. This can be achieved by adding callbacks to the deferred
returned by the :meth:`CrawlerRunner.crawl
<scrapy.crawler.CrawlerRunner.crawl>` method.
What follows is a working example of how to do that, using the `testspiders`_
project as an example.
......@@ -28,20 +29,21 @@ project as example.
::
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from testspiders.spiders.followall import FollowAllSpider
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
spider = FollowAllSpider(domain='scrapinghub.com')
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run() # the script will block here until the spider_closed signal was sent
# If you aren't inside a Scrapy project, you could use an instance of the
# Settings class in scrapy.settings instead of the configuration returned
# by get_project_settings
runner = CrawlerRunner(get_project_settings())
# 'followall' is the name of one of the spiders of the project. If you
# aren't working in a Scrapy project, use the spider class as the first
# argument instead of its name (or set the SPIDER_MODULES setting so Scrapy
# knows where to look)
d = runner.crawl('followall', domain='scrapinghub.com')
d.addBoth(lambda _: reactor.stop())
reactor.run() # the script will block here until the crawling is finished
.. seealso:: `Twisted Reactor Overview`_.
......@@ -52,28 +54,42 @@ By default, Scrapy runs a single spider per process when you run ``scrapy
crawl``. However, Scrapy supports running multiple spiders per process using
the :ref:`internal API <topics-api>`.
Here is an example, using the `testspiders`_ project:
Here is an example that runs multiple spiders simultaneously, using the
`testspiders`_ project:
::
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log
from testspiders.spiders.followall import FollowAllSpider
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
def setup_crawler(domain):
spider = FollowAllSpider(domain=domain)
settings = get_project_settings()
crawler = Crawler(settings)
crawler.configure()
crawler.crawl(spider)
crawler.start()
runner = CrawlerRunner(get_project_settings())
dfs = set()
for domain in ['scrapinghub.com', 'insophia.com']:
setup_crawler(domain)
log.start()
reactor.run()
d = runner.crawl('followall', domain=domain)
dfs.add(d)
defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
reactor.run() # the script will block here until all crawling jobs are finished
Same example but running the spiders sequentially by chaining the deferreds:
::
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
runner = CrawlerRunner(get_project_settings())
@defer.inlineCallbacks
def crawl():
    for domain in ['scrapinghub.com', 'insophia.com']:
        yield runner.crawl('followall', domain=domain)
    reactor.stop()
crawl()
reactor.run() # the script will block here until the last crawl call is finished
.. seealso:: :ref:`run-from-script`.
......
import six
import signal
from twisted.internet import reactor, defer
......@@ -70,31 +71,50 @@ class Crawler(object):
yield defer.maybeDeferred(self.engine.stop)
class CrawlerProcess(object):
""" A class to run multiple scrapy crawlers in a process sequentially"""
class CrawlerRunner(object):
def __init__(self, settings):
install_shutdown_handlers(self._signal_shutdown)
self.settings = settings
self.crawlers = {}
self.stopping = False
self._started = None
smcls = load_object(settings['SPIDER_MANAGER_CLASS'])
self.spiders = smcls.from_settings(settings.frozencopy())
self.crawlers = set()
self.crawl_deferreds = set()
def create_crawler(self, name=None):
if name not in self.crawlers:
self.crawlers[name] = Crawler(self.settings)
def crawl(self, spidercls, *args, **kwargs):
crawler = self._create_logged_crawler(spidercls)
self.crawlers.add(crawler)
return self.crawlers[name]
crawler.install()
crawler.signals.connect(crawler.uninstall, signals.engine_stopped)
def start(self):
if self.start_crawling():
self.start_reactor()
d = crawler.crawl(*args, **kwargs)
self.crawl_deferreds.add(d)
return d
def _create_logged_crawler(self, spidercls):
crawler = self._create_crawler(spidercls)
log_observer = log.start_from_crawler(crawler)
if log_observer:
crawler.signals.connect(log_observer.stop, signals.engine_stopped)
return crawler
def _create_crawler(self, spidercls):
if isinstance(spidercls, six.string_types):
spidercls = self.spiders.load(spidercls)
crawler = Crawler(spidercls, self.settings.frozencopy())
return crawler
@defer.inlineCallbacks
def stop(self):
self.stopping = True
if self._active_crawler:
yield self._active_crawler.stop()
return defer.DeferredList(c.stop() for c in self.crawlers)
class CrawlerProcess(CrawlerRunner):
"""A class to run multiple scrapy crawlers in a process simultaneously"""
def __init__(self, settings):
super(CrawlerProcess, self).__init__(settings)
install_shutdown_handlers(self._signal_shutdown)
self.stopping = False
def _signal_shutdown(self, signum, _):
install_shutdown_handlers(self._signal_kill)
......@@ -110,43 +130,25 @@ class CrawlerProcess(object):
level=log.INFO, signame=signame)
reactor.callFromThread(self._stop_reactor)
# ------------------------------------------------------------------------#
# The following public methods can't be considered stable and may change at
# any moment.
#
# start_crawling and start_reactor are called from scrapy.commands.shell
# They are splitted because reactor is started on a different thread than IPython shell.
#
def start_crawling(self):
def start(self, stop_after_crawl=True):
self._start_logging()
self._start_reactor(stop_after_crawl)
def _start_logging(self):
log.scrapy_info(self.settings)
return self._start_crawler() is not None
def start_reactor(self):
def _start_reactor(self, stop_after_crawl=True):
if stop_after_crawl:
d = defer.DeferredList(self.crawl_deferreds)
if d.called:
# Don't start the reactor if the deferreds are already fired
return
d.addBoth(lambda _: self._stop_reactor())
if self.settings.getbool('DNSCACHE_ENABLED'):
reactor.installResolver(CachingThreadedResolver(reactor))
reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
reactor.run(installSignalHandlers=False) # blocking call
def _start_crawler(self):
if not self.crawlers or self.stopping:
return
name, crawler = self.crawlers.popitem()
self._active_crawler = crawler
log_observer = log.start_from_crawler(crawler)
crawler.configure()
crawler.install()
crawler.signals.connect(crawler.uninstall, signals.engine_stopped)
if log_observer:
crawler.signals.connect(log_observer.stop, signals.engine_stopped)
crawler.signals.connect(self._check_done, signals.engine_stopped)
crawler.start()
return name, crawler
def _check_done(self, **kwargs):
if not self._start_crawler():
self._stop_reactor()
def _stop_reactor(self, _=None):
try:
reactor.stop()
......
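
For context, a minimal sketch of how the reworked ``CrawlerProcess`` could be
driven after this change (illustrative only; it assumes a Scrapy project that
defines a ``followall`` spider, as in the documentation examples above)::

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())

    # crawl() is inherited from CrawlerRunner; the returned deferred is also
    # tracked in process.crawl_deferreds
    process.crawl('followall', domain='scrapinghub.com')

    # start() sets up logging and runs the Twisted reactor; with the default
    # stop_after_crawl=True the reactor is stopped once all crawls finish
    process.start()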