Commit 5ff64ad0 authored by Eugenio Lacuesta

Handle relative sitemap URLs in robots.txt

Parent 2086ff40
@@ -32,7 +32,7 @@ class SitemapSpider(Spider):
     def _parse_sitemap(self, response):
         if response.url.endswith('/robots.txt'):
-            for url in sitemap_urls_from_robots(response.text):
+            for url in sitemap_urls_from_robots(response.text, base_url=response.url):
                 yield Request(url, callback=self._parse_sitemap)
         else:
             body = self._get_sitemap_body(response)
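For context, here is a minimal usage sketch (not part of this commit) of the code path the hunk above changes. When a SitemapSpider is started from a robots.txt URL, _parse_sitemap() handles that response and now forwards response.url as base_url, so relative "Sitemap:" entries are resolved against the robots.txt location before being turned into requests. The spider name, domain, and callback below are placeholders.

from scrapy.spiders import SitemapSpider


class ExampleSitemapSpider(SitemapSpider):
    # Hypothetical spider, used only to illustrate the patched branch.
    name = 'example_sitemap'
    # Starting from robots.txt triggers the robots.txt branch of
    # _parse_sitemap(); with this commit, an entry such as
    # "Sitemap: /sitemap.xml" is requested as
    # "http://www.example.com/sitemap.xml" instead of the bare relative path.
    sitemap_urls = ['http://www.example.com/robots.txt']

    def parse(self, response):
        # Placeholder callback for pages discovered through the sitemaps.
        self.logger.info('visited %s', response.url)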
@@ -4,7 +4,9 @@ Module for processing Sitemaps.
 Note: The main purpose of this module is to provide support for the
 SitemapSpider, its API is subject to change without notice.
 """
 import lxml.etree
+from six.moves.urllib.parse import urljoin

 class Sitemap(object):
@@ -34,10 +36,11 @@ class Sitemap(object):
                 yield d


-def sitemap_urls_from_robots(robots_text):
+def sitemap_urls_from_robots(robots_text, base_url=None):
     """Return an iterator over all sitemap urls contained in the given
     robots.txt file
     """
     for line in robots_text.splitlines():
         if line.lstrip().lower().startswith('sitemap:'):
-            yield line.split(':', 1)[1].strip()
+            url = line.split(':', 1)[1].strip()
+            yield urljoin(base_url, url)
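A quick stdlib-only illustration (not part of the patch) of why urljoin() fits this helper: absolute URLs pass through unchanged, relative URLs are resolved against the robots.txt location, and a None or empty base preserves the previous behaviour for callers that never pass base_url. The URLs below are placeholders taken from the tests.

from six.moves.urllib.parse import urljoin

base = 'http://www.example.com/robots.txt'

# Absolute URL: returned untouched.
print(urljoin(base, 'http://example.com/sitemap.xml'))
# -> http://example.com/sitemap.xml

# Relative URL: resolved against the robots.txt location.
print(urljoin(base, '/sitemap-relative-url.xml'))
# -> http://www.example.com/sitemap-relative-url.xml

# Uppercase scheme: normalised to lowercase, matching the new test cases.
print(urljoin(base, 'HTTP://example.com/sitemap-uppercase.xml'))
# -> http://example.com/sitemap-uppercase.xml

# No base (the default base_url=None): the URL is returned as-is,
# which keeps the old behaviour for absolute sitemap URLs.
print(urljoin(None, 'http://example.com/sitemap.xml'))
# -> http://example.com/sitemap.xml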
@@ -332,13 +332,17 @@ class SitemapSpiderTest(SpiderTest):
         robots = b"""# Sitemap files
 Sitemap: http://example.com/sitemap.xml
 Sitemap: http://example.com/sitemap-product-index.xml
+Sitemap: HTTP://example.com/sitemap-uppercase.xml
+Sitemap: /sitemap-relative-url.xml
 """
         r = TextResponse(url="http://www.example.com/robots.txt", body=robots)
         spider = self.spider_class("example.com")
         self.assertEqual([req.url for req in spider._parse_sitemap(r)],
                          ['http://example.com/sitemap.xml',
-                          'http://example.com/sitemap-product-index.xml'])
+                          'http://example.com/sitemap-product-index.xml',
+                          'http://example.com/sitemap-uppercase.xml',
+                          'http://www.example.com/sitemap-relative-url.xml'])


 class BaseSpiderDeprecationTest(unittest.TestCase):
@@ -119,13 +119,18 @@ Disallow: /s*/*tags

 # Sitemap files
 Sitemap: http://example.com/sitemap.xml
 Sitemap: http://example.com/sitemap-product-index.xml
+Sitemap: HTTP://example.com/sitemap-uppercase.xml
+Sitemap: /sitemap-relative-url.xml

 # Forums
 Disallow: /forum/search/
 Disallow: /forum/active/
 """
-        self.assertEqual(list(sitemap_urls_from_robots(robots)),
-                         ['http://example.com/sitemap.xml', 'http://example.com/sitemap-product-index.xml'])
+        self.assertEqual(list(sitemap_urls_from_robots(robots, base_url='http://example.com')),
+                         ['http://example.com/sitemap.xml',
+                          'http://example.com/sitemap-product-index.xml',
+                          'http://example.com/sitemap-uppercase.xml',
+                          'http://example.com/sitemap-relative-url.xml'])

     def test_sitemap_blanklines(self):
         """Assert we can deal with starting blank lines before <xml> tag"""