Commit c1f81986 authored by Pablo Hoffman

Added RANDOMIZE_DOWNLOAD_DELAY setting

Parent 23fcf48a
......@@ -418,6 +418,15 @@ supported. Example::
DOWNLOAD_DELAY = 0.25 # 250 ms of delay
This setting is also affected by the :setting:`RANDOMIZE_DOWNLOAD_DELAY`
setting (which is enabled by default). By default, Scrapy doesn't wait a fixed
amount of time between requests, but uses a random interval between 0.5 and 1.5
* :setting:`DOWNLOAD_DELAY`.
Another way to change the download delay (per spider, instead of globally) is
by using the ``download_delay`` spider attribute, which takes precedence over
this setting.
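For illustration (a minimal sketch, not part of this commit, assuming the
``BaseSpider`` base class of this Scrapy version), a spider could override the
project-wide delay like this::

    from scrapy.spider import BaseSpider

    class SlowSpider(BaseSpider):
        """Hypothetical spider that waits about 2 seconds between requests,
        regardless of the project-wide DOWNLOAD_DELAY."""
        name = 'slowspider'
        download_delay = 2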
.. setting:: DOWNLOAD_TIMEOUT
DOWNLOAD_TIMEOUT
......@@ -677,6 +686,27 @@ Example::
NEWSPIDER_MODULE = 'mybot.spiders_dev'
.. setting:: RANDOMIZE_DOWNLOAD_DELAY
RANDOMIZE_DOWNLOAD_DELAY
------------------------
Default: ``True``
If enabled, Scrapy will wait a random amount of time (between 0.5 and 1.5
* :setting:`DOWNLOAD_DELAY`) while fetching requests from the same
spider.
This randomization decreases the chance of the crawler being detected (and
subsequently blocked) by sites which analyze requests looking for statistically
significant similarities in the time between their requests.
The randomization policy is the same one used by the `wget`_ ``--random-wait`` option.
If :setting:`DOWNLOAD_DELAY` is zero (the default), this option has no effect.
.. _wget: http://www.gnu.org/software/wget/manual/wget.html
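As a rough, standalone illustration (not code from this commit), the effective
per-request delay under this policy can be computed like this::

    import random

    def effective_delay(download_delay, randomize=True):
        # wget --random-wait policy: uniform sample in [0.5 * delay, 1.5 * delay]
        if download_delay and randomize:
            return random.uniform(0.5 * download_delay, 1.5 * download_delay)
        return download_delay

    # e.g. with DOWNLOAD_DELAY = 2, each request waits between 1 and 3 seconds
    print(effective_delay(2))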
.. setting:: REDIRECT_MAX_TIMES
REDIRECT_MAX_TIMES
......
......@@ -122,6 +122,8 @@ MYSQL_CONNECTION_SETTINGS = {}
NEWSPIDER_MODULE = ''
RANDOMIZE_DOWNLOAD_DELAY = True
REDIRECT_MAX_METAREFRESH_DELAY = 100
REDIRECT_MAX_TIMES = 20 # uses Firefox default setting
REDIRECT_PRIORITY_ADJUST = +2
......
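For reference, a project wanting a fixed (non-randomized) delay could override
these defaults in its own settings module; the values below are hypothetical,
not part of this commit:

    # settings.py of a hypothetical project
    DOWNLOAD_DELAY = 0.25             # 250 ms between requests
    RANDOMIZE_DOWNLOAD_DELAY = False  # disable the random 0.5x-1.5x factor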
......@@ -2,6 +2,7 @@
Download web pages using asynchronous IO
"""
import random
from time import time
from twisted.internet import reactor, defer
......@@ -20,15 +21,21 @@ class SpiderInfo(object):
def __init__(self, download_delay=None, max_concurrent_requests=None):
if download_delay is None:
self.download_delay = settings.getfloat('DOWNLOAD_DELAY')
self._download_delay = settings.getfloat('DOWNLOAD_DELAY')
else:
self.download_delay = download_delay
if self.download_delay:
self._download_delay = float(download_delay)
if self._download_delay:
self.max_concurrent_requests = 1
elif max_concurrent_requests is None:
self.max_concurrent_requests = settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
else:
self.max_concurrent_requests = max_concurrent_requests
if self._download_delay and settings.getbool('RANDOMIZE_DOWNLOAD_DELAY'):
# same policy as wget --random-wait
self.random_delay_interval = (0.5*self._download_delay, \
1.5*self._download_delay)
else:
self.random_delay_interval = None
self.active = set()
self.queue = []
......@@ -44,6 +51,12 @@ class SpiderInfo(object):
# use self.active to include requests in the downloader middleware
return len(self.active) > 2 * self.max_concurrent_requests
def download_delay(self):
if self.random_delay_interval:
return random.uniform(*self.random_delay_interval)
else:
return self._download_delay
def cancel_request_calls(self):
for call in self.next_request_calls:
call.cancel()
......@@ -99,8 +112,9 @@ class Downloader(object):
# Delay queue processing if a download_delay is configured
now = time()
if site.download_delay:
penalty = site.download_delay - now + site.lastseen
delay = site.download_delay()
if delay:
penalty = delay - now + site.lastseen
if penalty > 0:
d = defer.Deferred()
d.addCallback(self._process_queue)
......
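Taken together, the change can be summarised by a simplified, standalone sketch
(class and function names here are illustrative, not the actual
Downloader/SpiderInfo code):

    import random
    import time

    class DelayPolicy(object):
        """Simplified stand-in for SpiderInfo's per-spider delay handling."""

        def __init__(self, download_delay, randomize=True):
            self._download_delay = float(download_delay)
            if self._download_delay and randomize:
                # same policy as wget --random-wait
                self.random_delay_interval = (0.5 * self._download_delay,
                                              1.5 * self._download_delay)
            else:
                self.random_delay_interval = None

        def download_delay(self):
            # a fresh random delay on every call, or the fixed delay
            if self.random_delay_interval:
                return random.uniform(*self.random_delay_interval)
            return self._download_delay

    def queue_penalty(policy, lastseen, now=None):
        """Seconds to postpone queue processing; <= 0 means process immediately."""
        now = time.time() if now is None else now
        return policy.download_delay() - now + lastseen

    # e.g. with a 2 s base delay and a site last seen 1 s ago, the penalty is
    # roughly between 0 and 2 seconds:
    # queue_penalty(DelayPolicy(2.0), lastseen=time.time() - 1.0)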