提交 44bc4c06 编写于 作者: M Mikhail Korobov

Merge pull request #1767 from orangain/sitemap-robotstxt

[MRG+1] PY3: Fix SitemapSpider to extract sitemap urls from robots.txt properly
......@@ -32,7 +32,7 @@ class SitemapSpider(Spider):
def _parse_sitemap(self, response):
if response.url.endswith('/robots.txt'):
-for url in sitemap_urls_from_robots(response.body):
+for url in sitemap_urls_from_robots(response.text):
yield Request(url, callback=self._parse_sitemap)
else:
body = self._get_sitemap_body(response)
......
......@@ -328,6 +328,18 @@ class SitemapSpiderTest(SpiderTest):
r = Response(url="http://www.example.com/sitemap.xml.gz", body=self.GZBODY)
self.assertSitemapBody(r, self.BODY)
def test_get_sitemap_urls_from_robotstxt(self):
    """Check that Sitemap entries in a robots.txt response become requests.

    Builds a ``TextResponse`` whose URL ends in ``/robots.txt`` and whose
    body lists two ``Sitemap:`` entries, passes it to ``_parse_sitemap``,
    and asserts that the yielded requests carry those URLs in the same
    order.
    """
    # The continuation lines of the literal stay at column 0 so the bytes
    # contain no leading whitespace before each "Sitemap:" directive.
    robots_body = b"""# Sitemap files
Sitemap: http://example.com/sitemap.xml
Sitemap: http://example.com/sitemap-product-index.xml
"""
    response = TextResponse(url="http://www.example.com/robots.txt",
                            body=robots_body)
    spider = self.spider_class("example.com")
    requested_urls = [request.url
                      for request in spider._parse_sitemap(response)]
    expected_urls = [
        'http://example.com/sitemap.xml',
        'http://example.com/sitemap-product-index.xml',
    ]
    self.assertEqual(requested_urls, expected_urls)
class BaseSpiderDeprecationTest(unittest.TestCase):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册