Commit c1f81986 authored by Pablo Hoffman

Added RANDOMIZE_DOWNLOAD_DELAY setting

Parent 23fcf48a
......@@ -418,6 +418,15 @@ supported. Example::
DOWNLOAD_DELAY = 0.25 # 250 ms of delay
This setting is also affected by the :setting:`RANDOMIZE_DOWNLOAD_DELAY`
setting (which is enabled by default). By default, Scrapy doesn't wait a fixed
amount of time between requests, but uses a random interval between 0.5 and 1.5
* :setting:`DOWNLOAD_DELAY`.
Another way to change the download delay (per spider, instead of globally) is
by using the ``download_delay`` spider attribute, which takes precedence over
this setting.
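For illustration (a minimal sketch, not part of this commit, assuming the
``BaseSpider`` base class of this Scrapy version), a spider could override the
project-wide delay like this::

    from scrapy.spider import BaseSpider

    class SlowSpider(BaseSpider):
        """Hypothetical spider that waits about 2 seconds between requests,
        regardless of the project-wide DOWNLOAD_DELAY."""
        name = 'slowspider'
        download_delay = 2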
.. setting:: DOWNLOAD_TIMEOUT
DOWNLOAD_TIMEOUT
......@@ -677,6 +686,27 @@ Example::
NEWSPIDER_MODULE = 'mybot.spiders_dev'
.. setting:: RANDOMIZE_DOWNLOAD_DELAY
RANDOMIZE_DOWNLOAD_DELAY
------------------------
Default: ``True``
If enabled, Scrapy will wait a random amount of time (between 0.5 and 1.5
* :setting:`DOWNLOAD_DELAY`) while fetching requests from the same
spider.
This randomization decreases the chance of the crawler being detected (and
subsequently blocked) by sites which analyze requests looking for statistically
significant similarities in the time between their requests.
The randomization policy is the same one used by the `wget`_ ``--random-wait`` option.
If :setting:`DOWNLOAD_DELAY` is zero (the default), this option has no effect.
.. _wget: http://www.gnu.org/software/wget/manual/wget.html
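As a rough, standalone illustration (not code from this commit), the effective
per-request delay under this policy can be computed like this::

    import random

    def effective_delay(download_delay, randomize=True):
        # wget --random-wait policy: uniform sample in [0.5 * delay, 1.5 * delay]
        if download_delay and randomize:
            return random.uniform(0.5 * download_delay, 1.5 * download_delay)
        return download_delay

    # e.g. with DOWNLOAD_DELAY = 2, each request waits between 1 and 3 seconds
    print(effective_delay(2))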
.. setting:: REDIRECT_MAX_TIMES
REDIRECT_MAX_TIMES
......
......@@ -122,6 +122,8 @@ MYSQL_CONNECTION_SETTINGS = {}
NEWSPIDER_MODULE = ''
RANDOMIZE_DOWNLOAD_DELAY = True
REDIRECT_MAX_METAREFRESH_DELAY = 100
REDIRECT_MAX_TIMES = 20 # uses Firefox default setting
REDIRECT_PRIORITY_ADJUST = +2
......
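For reference, a project wanting a fixed (non-randomized) delay could override
these defaults in its own settings module; the values below are hypothetical,
not part of this commit:

    # settings.py of a hypothetical project
    DOWNLOAD_DELAY = 0.25             # 250 ms between requests
    RANDOMIZE_DOWNLOAD_DELAY = False  # disable the random 0.5x-1.5x factor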
......@@ -2,6 +2,7 @@
Download web pages using asynchronous IO
"""
import random
from time import time
from twisted.internet import reactor, defer
......@@ -20,15 +21,21 @@ class SpiderInfo(object):
def __init__(self, download_delay=None, max_concurrent_requests=None):
if download_delay is None:
self.download_delay = settings.getfloat('DOWNLOAD_DELAY')
self._download_delay = settings.getfloat('DOWNLOAD_DELAY')
else:
self.download_delay = download_delay
if self.download_delay:
self._download_delay = float(download_delay)
if self._download_delay:
self.max_concurrent_requests = 1
elif max_concurrent_requests is None:
self.max_concurrent_requests = settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
else:
self.max_concurrent_requests = max_concurrent_requests
if self._download_delay and settings.getbool('RANDOMIZE_DOWNLOAD_DELAY'):
# same policy as wget --random-wait
self.random_delay_interval = (0.5*self._download_delay, \
1.5*self._download_delay)
else:
self.random_delay_interval = None
self.active = set()
self.queue = []
......@@ -44,6 +51,12 @@ class SpiderInfo(object):
# use self.active to include requests in the downloader middleware
return len(self.active) > 2 * self.max_concurrent_requests
def download_delay(self):
if self.random_delay_interval:
return random.uniform(*self.random_delay_interval)
else:
return self._download_delay
def cancel_request_calls(self):
for call in self.next_request_calls:
call.cancel()
......@@ -99,8 +112,9 @@ class Downloader(object):
# Delay queue processing if a download_delay is configured
now = time()
if site.download_delay:
penalty = site.download_delay - now + site.lastseen
delay = site.download_delay()
if delay:
penalty = delay - now + site.lastseen
if penalty > 0:
d = defer.Deferred()
d.addCallback(self._process_queue)
......
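Taken together, the change can be summarised by a simplified, standalone sketch
(class and function names here are illustrative, not the actual
Downloader/SpiderInfo code):

    import random
    import time

    class DelayPolicy(object):
        """Simplified stand-in for SpiderInfo's per-spider delay handling."""

        def __init__(self, download_delay, randomize=True):
            self._download_delay = float(download_delay)
            if self._download_delay and randomize:
                # same policy as wget --random-wait
                self.random_delay_interval = (0.5 * self._download_delay,
                                              1.5 * self._download_delay)
            else:
                self.random_delay_interval = None

        def download_delay(self):
            # a fresh random delay on every call, or the fixed delay
            if self.random_delay_interval:
                return random.uniform(*self.random_delay_interval)
            return self._download_delay

    def queue_penalty(policy, lastseen, now=None):
        """Seconds to postpone queue processing; <= 0 means process immediately."""
        now = time.time() if now is None else now
        return policy.download_delay() - now + lastseen

    # e.g. with a 2 s base delay and a site last seen 1 s ago, the penalty is
    # roughly between 0 and 2 seconds:
    # queue_penalty(DelayPolicy(2.0), lastseen=time.time() - 1.0)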