Merge pull request #1767 from orangain/sitemap-robotstxt

[MRG+1] PY3: Fix SitemapSpider to extract sitemap urls from robots.txt properly

Merge pull request #1767 from orangain/sitemap-robotstxt
[MRG+1] PY3: Fix SitemapSpider to extract sitemap urls from robots.txt properly
44bc4c06 · Mikhail Korobov · e328a9b9 · 25c56159 · 44bc4c06 · 44bc4c06
Hide whitespace changes
Inline · Side-by-side

Showing 2 changed files with 13 additions and 1 deletion

scrapy/spiders/sitemap.py +1 -1

tests/test_spider.py +12 -0

未找到文件。
--- a/scrapy/spiders/sitemap.py
+++ b/scrapy/spiders/sitemap.py
@@ -32,7 +32,7 @@ class SitemapSpider(Spider):

    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
-            for url in sitemap_urls_from_robots(response.body):
+            for url in sitemap_urls_from_robots(response.text):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)

--- a/tests/test_spider.py
+++ b/tests/test_spider.py
@@ -328,6 +328,18 @@ class SitemapSpiderTest(SpiderTest):
        r = Response(url="http://www.example.com/sitemap.xml.gz", body=self.GZBODY)
        self.assertSitemapBody(r, self.BODY)

+    def test_get_sitemap_urls_from_robotstxt(self):
+        robots = b"""# Sitemap files
+Sitemap: http://example.com/sitemap.xml
+Sitemap: http://example.com/sitemap-product-index.xml
+"""
+
+        r = TextResponse(url="http://www.example.com/robots.txt", body=robots)
+        spider = self.spider_class("example.com")
+        self.assertEqual([req.url for req in spider._parse_sitemap(r)],
+                         ['http://example.com/sitemap.xml',
+                          'http://example.com/sitemap-product-index.xml'])
+

 class BaseSpiderDeprecationTest(unittest.TestCase):