Commit 5ff64ad0 authored by Eugenio Lacuesta

Handle relative sitemap URLs in robots.txt

Parent 2086ff40
@@ -32,7 +32,7 @@ class SitemapSpider(Spider):
     def _parse_sitemap(self, response):
         if response.url.endswith('/robots.txt'):
-            for url in sitemap_urls_from_robots(response.text):
+            for url in sitemap_urls_from_robots(response.text, base_url=response.url):
                 yield Request(url, callback=self._parse_sitemap)
         else:
             body = self._get_sitemap_body(response)
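For context, here is a minimal usage sketch (not part of this commit) of the code path the hunk above changes. When a SitemapSpider is started from a robots.txt URL, _parse_sitemap() handles that response and now forwards response.url as base_url, so relative "Sitemap:" entries are resolved against the robots.txt location before being turned into requests. The spider name, domain, and callback below are placeholders.

from scrapy.spiders import SitemapSpider


class ExampleSitemapSpider(SitemapSpider):
    # Hypothetical spider, used only to illustrate the patched branch.
    name = 'example_sitemap'
    # Starting from robots.txt triggers the robots.txt branch of
    # _parse_sitemap(); with this commit, an entry such as
    # "Sitemap: /sitemap.xml" is requested as
    # "http://www.example.com/sitemap.xml" instead of the bare relative path.
    sitemap_urls = ['http://www.example.com/robots.txt']

    def parse(self, response):
        # Placeholder callback for pages discovered through the sitemaps.
        self.logger.info('visited %s', response.url)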
@@ -4,7 +4,9 @@ Module for processing Sitemaps.
 Note: The main purpose of this module is to provide support for the
 SitemapSpider, its API is subject to change without notice.
 """
 import lxml.etree
+from six.moves.urllib.parse import urljoin

 class Sitemap(object):
@@ -34,10 +36,11 @@ class Sitemap(object):
                 yield d


-def sitemap_urls_from_robots(robots_text):
+def sitemap_urls_from_robots(robots_text, base_url=None):
     """Return an iterator over all sitemap urls contained in the given
     robots.txt file
     """
     for line in robots_text.splitlines():
         if line.lstrip().lower().startswith('sitemap:'):
-            yield line.split(':', 1)[1].strip()
+            url = line.split(':', 1)[1].strip()
+            yield urljoin(base_url, url)
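A quick stdlib-only illustration (not part of the patch) of why urljoin() fits this helper: absolute URLs pass through unchanged, relative URLs are resolved against the robots.txt location, and a None or empty base preserves the previous behaviour for callers that never pass base_url. The URLs below are placeholders taken from the tests.

from six.moves.urllib.parse import urljoin

base = 'http://www.example.com/robots.txt'

# Absolute URL: returned untouched.
print(urljoin(base, 'http://example.com/sitemap.xml'))
# -> http://example.com/sitemap.xml

# Relative URL: resolved against the robots.txt location.
print(urljoin(base, '/sitemap-relative-url.xml'))
# -> http://www.example.com/sitemap-relative-url.xml

# Uppercase scheme: normalised to lowercase, matching the new test cases.
print(urljoin(base, 'HTTP://example.com/sitemap-uppercase.xml'))
# -> http://example.com/sitemap-uppercase.xml

# No base (the default base_url=None): the URL is returned as-is,
# which keeps the old behaviour for absolute sitemap URLs.
print(urljoin(None, 'http://example.com/sitemap.xml'))
# -> http://example.com/sitemap.xml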
@@ -332,13 +332,17 @@ class SitemapSpiderTest(SpiderTest):
         robots = b"""# Sitemap files
 Sitemap: http://example.com/sitemap.xml
 Sitemap: http://example.com/sitemap-product-index.xml
+Sitemap: HTTP://example.com/sitemap-uppercase.xml
+Sitemap: /sitemap-relative-url.xml
 """
         r = TextResponse(url="http://www.example.com/robots.txt", body=robots)
         spider = self.spider_class("example.com")
         self.assertEqual([req.url for req in spider._parse_sitemap(r)],
                          ['http://example.com/sitemap.xml',
-                          'http://example.com/sitemap-product-index.xml'])
+                          'http://example.com/sitemap-product-index.xml',
+                          'http://example.com/sitemap-uppercase.xml',
+                          'http://www.example.com/sitemap-relative-url.xml'])


 class BaseSpiderDeprecationTest(unittest.TestCase):
@@ -119,13 +119,18 @@ Disallow: /s*/*tags

 # Sitemap files
 Sitemap: http://example.com/sitemap.xml
 Sitemap: http://example.com/sitemap-product-index.xml
+Sitemap: HTTP://example.com/sitemap-uppercase.xml
+Sitemap: /sitemap-relative-url.xml

 # Forums
 Disallow: /forum/search/
 Disallow: /forum/active/
 """
-        self.assertEqual(list(sitemap_urls_from_robots(robots)),
-                         ['http://example.com/sitemap.xml', 'http://example.com/sitemap-product-index.xml'])
+        self.assertEqual(list(sitemap_urls_from_robots(robots, base_url='http://example.com')),
+                         ['http://example.com/sitemap.xml',
+                          'http://example.com/sitemap-product-index.xml',
+                          'http://example.com/sitemap-uppercase.xml',
+                          'http://example.com/sitemap-relative-url.xml'])

     def test_sitemap_blanklines(self):
         """Assert we can deal with starting blank lines before <xml> tag"""