提交 ade7662f 编写于 作者: D Daniel Graña

Merge pull request #661 from ananana/sgml-attrs-tuple

Fixed default value of attrs argument in SgmlLinkExtractor to be tuple
......@@ -69,8 +69,9 @@ SgmlLinkExtractor
domains which won't be considered for extracting the links
:type deny_domains: str or list
:param deny_extensions: a list of extensions that should be ignored when
extracting links. If not given, it will default to the
:param deny_extensions: a single value or list of strings containing
extensions that should be ignored when extracting links.
If not given, it will default to the
``IGNORED_EXTENSIONS`` list defined in the `scrapy.linkextractor`_
module.
:type deny_extensions: str or list
......@@ -85,7 +86,7 @@ SgmlLinkExtractor
Defaults to ``('a', 'area')``.
:type tags: str or list
:param attrs: list of attributes which should be considered when looking
:param attrs: an attribute or list of attributes which should be considered when looking
for links to extract (only for those tags specified in the ``tags``
parameter). Defaults to ``('href',)``
:type attrs: str or list
......
......@@ -95,7 +95,7 @@ _is_valid_url = lambda url: url.split('://', 1)[0] in set(['http', 'https', 'fil
class SgmlLinkExtractor(BaseSgmlLinkExtractor):
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None,
tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
deny_extensions=None):
self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
......@@ -105,9 +105,9 @@ class SgmlLinkExtractor(BaseSgmlLinkExtractor):
self.canonicalize = canonicalize
if deny_extensions is None:
deny_extensions = IGNORED_EXTENSIONS
self.deny_extensions = set(['.' + e for e in deny_extensions])
tag_func = lambda x: x in tags
attr_func = lambda x: x in attrs
self.deny_extensions = {'.' + e for e in arg_to_iter(deny_extensions)}
tag_func = lambda x: x in arg_to_iter(tags)
attr_func = lambda x: x in arg_to_iter(attrs)
BaseSgmlLinkExtractor.__init__(self,
tag=tag_func,
attr=attr_func,
......
......@@ -277,6 +277,11 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
Link(url='http://example.org/page.html', text=u'asd'),
])
lx = SgmlLinkExtractor(deny_extensions="jpg")
self.assertEqual(lx.extract_links(response), [
Link(url='http://example.org/page.html', text=u'asd'),
])
def test_process_value(self):
"""Test restrict_xpaths with encodings"""
html = """
......@@ -304,6 +309,67 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
[Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])
def test_attrs(self):
    """Exercise the ``attrs`` argument with a string, a tuple and ``None``."""
    # Links expected from the shared fixture when only ``href`` is inspected.
    href_links = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ]
    # Same fixture, but with ``img``/``src`` considered as well and no
    # extension filtering, so the .jpg link is expected too.
    href_and_src_links = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample2.jpg', text=u''),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ]

    # A single attribute name passed as a bare string.
    extractor = SgmlLinkExtractor(attrs="href")
    self.assertEqual(extractor.extract_links(self.response), href_links)

    # Several attribute names passed as a tuple.
    extractor = SgmlLinkExtractor(
        attrs=("href", "src"),
        tags=("a", "area", "img"),
        deny_extensions=(),
    )
    self.assertEqual(extractor.extract_links(self.response), href_and_src_links)

    # ``None`` means no attribute is looked at, so nothing is extracted.
    extractor = SgmlLinkExtractor(attrs=None)
    self.assertEqual(extractor.extract_links(self.response), [])

    # ``("href")`` is a plain string, not a 1-tuple; the <a> element below
    # carries ``ref`` rather than ``href``, so only the <area> link matches.
    markup = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
    page = HtmlResponse("http://example.com/index.html", body=markup)
    extractor = SgmlLinkExtractor(attrs=("href"))
    self.assertEqual(extractor.extract_links(page), [
        Link(url='http://example.com/sample1.html', text=u''),
    ])
def test_tags(self):
    """Exercise the ``tags`` argument with ``None``, the default, a string and a tuple."""
    markup = """<html><area href="sample1.html"></area><a href="sample2.html">sample 2</a><img src="sample2.jpg"/></html>"""
    page = HtmlResponse("http://example.com/index.html", body=markup)

    # Expected links, one per element of the document above.
    area_link = Link(url='http://example.com/sample1.html', text=u'')
    anchor_link = Link(url='http://example.com/sample2.html', text=u'sample 2')
    image_link = Link(url='http://example.com/sample2.jpg', text=u'')

    # ``None`` means no tag is looked at, so nothing is extracted.
    extractor = SgmlLinkExtractor(tags=None)
    self.assertEqual(extractor.extract_links(page), [])

    # Default tags: the <area> and <a> links are returned, the <img> is not.
    extractor = SgmlLinkExtractor()
    self.assertEqual(extractor.extract_links(page), [area_link, anchor_link])

    # A single tag name passed as a bare string.
    extractor = SgmlLinkExtractor(tags="area")
    self.assertEqual(extractor.extract_links(page), [area_link])

    extractor = SgmlLinkExtractor(tags="a")
    self.assertEqual(extractor.extract_links(page), [anchor_link])

    # Several tags as a tuple; ``deny_extensions=()`` turns off extension
    # filtering so the .jpg image link is kept.
    extractor = SgmlLinkExtractor(
        tags=("a", "img"),
        attrs=("href", "src"),
        deny_extensions=(),
    )
    self.assertEqual(extractor.extract_links(page), [anchor_link, image_link])
class HtmlParserLinkExtractorTestCase(unittest.TestCase):
def setUp(self):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册