Commit 5835224e authored by Pablo Hoffman

Merge pull request #896 from scrapy/robotstxt-once

[MRG] process robots.txt once
@@ -785,11 +785,19 @@ RobotsTxtMiddleware
     and the :setting:`ROBOTSTXT_OBEY` setting is enabled.
 
     .. warning:: Keep in mind that, if you crawl using multiple concurrent
        requests per domain, Scrapy could still download some forbidden pages
        if they were requested before the robots.txt file was downloaded. This
        is a known limitation of the current robots.txt middleware and will
        be fixed in the future.
 
+    .. reqmeta:: dont_obey_robotstxt
+
+    If :attr:`Request.meta <scrapy.http.Request.meta>` has
+    ``dont_obey_robotstxt`` key set to True
+    the request will be ignored by this middleware even if
+    :setting:`ROBOTSTXT_OBEY` is enabled.
+
 DownloaderStats
 ---------------
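
For readers skimming the documentation change above: the new meta key is a per-request opt-out. A minimal sketch of how it would be used from spider code (the URLs are illustrative, not taken from this commit):

    from scrapy.http import Request

    # With ROBOTSTXT_OBEY enabled, this request bypasses RobotsTxtMiddleware
    # entirely because of the meta flag documented above.
    exempt = Request('http://site.local/admin/main',
                     meta={'dont_obey_robotstxt': True})

    # A request without the flag is still checked against the site's robots.txt.
    checked = Request('http://site.local/allowed')

Project-wide behaviour is still governed by the :setting:`ROBOTSTXT_OBEY` setting; the meta key only exempts the individual request that carries it.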
......
@@ -228,6 +228,7 @@ Those are:
 * :reqmeta:`cookiejar`
 * :reqmeta:`redirect_urls`
 * :reqmeta:`bindaddress`
+* :reqmeta:`dont_obey_robotstxt`
 
 .. reqmeta:: bindaddress
......
@@ -22,16 +22,16 @@ class RobotsTxtMiddleware(object):
         self.crawler = crawler
         self._useragent = crawler.settings.get('USER_AGENT')
         self._parsers = {}
-        self._spider_netlocs = set()
 
     @classmethod
     def from_crawler(cls, crawler):
         return cls(crawler)
 
     def process_request(self, request, spider):
-        useragent = self._useragent
+        if request.meta.get('dont_obey_robotstxt'):
+            return
         rp = self.robot_parser(request, spider)
-        if rp and not rp.can_fetch(useragent, request.url):
+        if rp and not rp.can_fetch(self._useragent, request.url):
             log.msg(format="Forbidden by robots.txt: %(request)s",
                     level=log.DEBUG, request=request)
             raise IgnoreRequest
@@ -42,10 +42,13 @@ class RobotsTxtMiddleware(object):
         if netloc not in self._parsers:
             self._parsers[netloc] = None
             robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
-            robotsreq = Request(robotsurl, priority=self.DOWNLOAD_PRIORITY)
+            robotsreq = Request(
+                robotsurl,
+                priority=self.DOWNLOAD_PRIORITY,
+                meta={'dont_obey_robotstxt': True}
+            )
             dfd = self.crawler.engine.download(robotsreq, spider)
             dfd.addCallback(self._parse_robots)
-            self._spider_netlocs.add(netloc)
         return self._parsers[netloc]
 
     def _parse_robots(self, response):
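
Two details of the middleware change are worth spelling out. First, the robots.txt request itself is created with ``meta={'dont_obey_robotstxt': True}``, so ``process_request`` returns immediately for it and never consults or builds a parser for that request. Second, the ``rp.can_fetch(self._useragent, request.url)`` check is ordinary robots.txt matching; the self-contained sketch below uses Python's standard ``robotparser`` module (an assumption for illustration, with rules chosen to mirror what the tests below appear to expect for ``site.local``) to show which URLs would end up raising ``IgnoreRequest``:

    try:  # Python 3
        from urllib.robotparser import RobotFileParser
    except ImportError:  # Python 2
        from robotparser import RobotFileParser

    # Hypothetical rules matching what the test assertions below imply.
    ROBOTS = """\
    User-Agent: *
    Disallow: /admin/
    Disallow: /static/
    """

    rp = RobotFileParser()
    rp.parse(ROBOTS.splitlines())

    print(rp.can_fetch('CustomAgent', 'http://site.local/allowed'))     # True  -> request passes
    print(rp.can_fetch('CustomAgent', 'http://site.local/admin/main'))  # False -> IgnoreRequest
    print(rp.can_fetch('CustomAgent', 'http://site.local/static/'))     # False -> IgnoreRequest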
......
+from __future__ import absolute_import
 import re
 import mock
 from twisted.internet import reactor
@@ -11,7 +12,44 @@ from scrapy.settings import Settings
 
 class RobotsTxtMiddlewareTest(unittest.TestCase):
 
-    def test(self):
+    def test_robotstxt(self):
+        middleware = self._get_middleware()
+        # There is a bit of neglect in robotstxt.py: robots.txt is fetched asynchronously,
+        # and it is actually fetched only *after* first process_request completes.
+        # So, first process_request will always succeed.
+        # We defer test() because otherwise robots.txt download mock will be called after assertRaises failure.
+        self.assertNotIgnored(Request('http://site.local'), middleware)
+        def test(r):
+            self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
+            self.assertIgnored(Request('http://site.local/admin/main'), middleware)
+            self.assertIgnored(Request('http://site.local/static/'), middleware)
+        deferred = Deferred()
+        deferred.addCallback(test)
+        reactor.callFromThread(deferred.callback, None)
+        return deferred
+
+    def test_robotstxt_meta(self):
+        meta = {'dont_obey_robotstxt': True}
+        middleware = self._get_middleware()
+        self.assertNotIgnored(Request('http://site.local', meta=meta), middleware)
+        def test(r):
+            self.assertNotIgnored(Request('http://site.local/allowed', meta=meta), middleware)
+            self.assertNotIgnored(Request('http://site.local/admin/main', meta=meta), middleware)
+            self.assertNotIgnored(Request('http://site.local/static/', meta=meta), middleware)
+        deferred = Deferred()
+        deferred.addCallback(test)
+        reactor.callFromThread(deferred.callback, None)
+        return deferred
+
+    def assertNotIgnored(self, request, middleware):
+        spider = None  # not actually used
+        self.assertIsNone(middleware.process_request(request, spider))
+
+    def assertIgnored(self, request, middleware):
+        spider = None  # not actually used
+        self.assertRaises(IgnoreRequest, middleware.process_request, request, spider)
+
+    def _get_crawler(self):
         crawler = mock.MagicMock()
         crawler.settings = Settings()
         crawler.settings.set('USER_AGENT', 'CustomAgent')
@@ -29,18 +67,8 @@ class RobotsTxtMiddlewareTest(unittest.TestCase):
             reactor.callFromThread(deferred.callback, response)
             return deferred
         crawler.engine.download.side_effect = return_response
-        middleware = RobotsTxtMiddleware(crawler)
-        spider = None  # not actually used
-        # There is a bit of neglect in robotstxt.py: robots.txt is fetched asynchronously,
-        # and it is actually fetched only *after* first process_request completes.
-        # So, first process_request will always succeed.
-        # We defer test() because otherwise robots.txt download mock will be called after assertRaises failure.
-        self.assertIsNone(middleware.process_request(Request('http://site.local'), spider))  # not affected by robots.txt
-        def test(r):
-            self.assertIsNone(middleware.process_request(Request('http://site.local/allowed'), spider))
-            self.assertRaises(IgnoreRequest, middleware.process_request, Request('http://site.local/admin/main'), spider)
-            self.assertRaises(IgnoreRequest, middleware.process_request, Request('http://site.local/static/'), spider)
-        deferred = Deferred()
-        deferred.addCallback(test)
-        reactor.callFromThread(deferred.callback, None)
-        return deferred
+        return crawler
+
+    def _get_middleware(self):
+        crawler = self._get_crawler()
+        return RobotsTxtMiddleware(crawler)
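
The refactoring above makes further cases cheap to express. As an illustration only, a hypothetical extra test (not part of this commit) could be added to the ``RobotsTxtMiddlewareTest`` class shown above, reusing its ``_get_middleware``/``assertIgnored`` helpers and the same deferred pattern, to pin down the documented limitation that the very first request to a host is never filtered. It assumes the mocked robots.txt disallows ``/admin/``, as the assertions above imply:

    def test_robotstxt_first_request_not_filtered(self):
        middleware = self._get_middleware()
        # robots.txt for site.local is still being fetched at this point,
        # so even a disallowed URL goes through on the first call.
        self.assertNotIgnored(Request('http://site.local/admin/main'), middleware)
        def test(r):
            # Once the mocked robots.txt has been parsed, the same URL is filtered.
            self.assertIgnored(Request('http://site.local/admin/main'), middleware)
        deferred = Deferred()
        deferred.addCallback(test)
        reactor.callFromThread(deferred.callback, None)
        return deferred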