未验证 提交 a21b8004 编写于 作者: K Konstantin Lopuhin 提交者: GitHub

Merge pull request #3011 from Jane222/master

[MRG+1] Issues a warning when user puts a URL into allowed_domains (#2250)
......@@ -6,6 +6,7 @@ See documentation in docs/topics/spider-middleware.rst
import re
import logging
import warnings
from scrapy import signals
from scrapy.http import Request
......@@ -52,9 +53,18 @@ class OffsiteMiddleware(object):
allowed_domains = getattr(spider, 'allowed_domains', None)
if not allowed_domains:
return re.compile('') # allow all by default
url_pattern = re.compile("^https?://.*$")
for domain in allowed_domains:
if url_pattern.match(domain):
warnings.warn("allowed_domains accepts only domains, not URLs. Ignoring URL entry %s in allowed_domains." % domain, URLWarning)
regex = r'^(.*\.)?(%s)$' % '|'.join(re.escape(d) for d in allowed_domains if d is not None)
return re.compile(regex)
def spider_opened(self, spider):
    """Initialise the per-spider filtering state.

    Stores the pattern returned by ``self.get_host_regex(spider)`` on
    ``self.host_regex`` and resets ``self.domains_seen`` to an empty set.
    """
    host_regex = self.get_host_regex(spider)
    self.host_regex = host_regex
    self.domains_seen = set()
class URLWarning(Warning):
    """Warning category issued when an ``allowed_domains`` entry is a URL."""
......@@ -5,7 +5,9 @@ from six.moves.urllib.parse import urlparse
from scrapy.http import Response, Request
from scrapy.spiders import Spider
from scrapy.spidermiddlewares.offsite import OffsiteMiddleware
from scrapy.spidermiddlewares.offsite import URLWarning
from scrapy.utils.test import get_crawler
import warnings
class TestOffsiteMiddleware(TestCase):
......@@ -68,3 +70,13 @@ class TestOffsiteMiddleware4(TestOffsiteMiddleware3):
reqs = [Request('http://scrapytest.org/1')]
out = list(self.mw.process_spider_output(res, reqs, self.spider))
self.assertEqual(out, reqs)
class TestOffsiteMiddleware5(TestOffsiteMiddleware4):
    """Check that a URL entry in ``allowed_domains`` produces a URLWarning."""

    def test_get_host_regex(self):
        """``get_host_regex`` must warn when ``allowed_domains`` holds a URL."""
        # NOTE(review): self.spider and self.mw are presumably created by an
        # inherited setUp that is not visible here -- confirm.
        self.spider.allowed_domains = ['http://scrapytest.org', 'scrapy.org', 'scrapy.test.org']
        with warnings.catch_warnings(record=True) as caught:
            # "always" makes sure the warning is recorded even if an identical
            # one was already issued earlier in the test run.
            warnings.simplefilter("always")
            self.mw.get_host_regex(self.spider)
        categories = [entry.category for entry in caught]
        # Look at every recorded warning rather than only the last one: an
        # empty list then fails the test cleanly instead of raising
        # IndexError, and an unrelated trailing warning cannot mask the
        # URLWarning.
        self.assertTrue(
            any(issubclass(category, URLWarning) for category in categories),
            "expected a URLWarning, got %r" % (categories,)
        )
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册