Merge pull request #661 from ananana/sgml-attrs-tuple

Fixed default value of attrs argument in SgmlLinkExtractor to be tuple

Merge pull request #661 from ananana/sgml-attrs-tuple
Fixed default value of attrs argument in SgmlLinkExtractor to be tuple
ade7662f · Daniel Graña · f6874550 · 73109bf9 · ade7662f · ade7662f
3 changed files
--- a/docs/topics/link-extractors.rst
+++ b/docs/topics/link-extractors.rst
@@ -69,8 +69,9 @@ SgmlLinkExtractor
        domains which won't be considered for extracting the links
    :type deny_domains: str or list

-    :param deny_extensions: a list of extensions that should be ignored when
-        extracting links. If not given, it will default to the
+    :param deny_extensions: a single value or list of strings containing
+        extensions that should be ignored when extracting links. 
+        If not given, it will default to the
        ``IGNORED_EXTENSIONS`` list defined in the `scrapy.linkextractor`_
        module.
    :type deny_extensions: list
@@ -85,7 +86,7 @@ SgmlLinkExtractor
        Defaults to ``('a', 'area')``.
    :type tags: str or list

-    :param attrs: list of attributes which should be considered when looking
+    :param attrs: an attribute or list of attributes which should be considered when looking
        for links to extract (only for those tags specified in the ``tags``
        parameter). Defaults to ``('href',)``
    :type attrs: list

--- a/scrapy/contrib/linkextractors/sgml.py
+++ b/scrapy/contrib/linkextractors/sgml.py
@@ -95,7 +95,7 @@ _is_valid_url = lambda url: url.split('://', 1)[0] in set(['http', 'https', 'fil
 class SgmlLinkExtractor(BaseSgmlLinkExtractor):

    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
-                 tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None,
+                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
                 deny_extensions=None):
        self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
        self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
@@ -105,9 +105,9 @@ class SgmlLinkExtractor(BaseSgmlLinkExtractor):
        self.canonicalize = canonicalize
        if deny_extensions is None:
            deny_extensions = IGNORED_EXTENSIONS
-        self.deny_extensions = set(['.' + e for e in deny_extensions])
-        tag_func = lambda x: x in tags
-        attr_func = lambda x: x in attrs
+        self.deny_extensions = {'.' + e for e in arg_to_iter(deny_extensions)}
+        tag_func = lambda x: x in arg_to_iter(tags)
+        attr_func = lambda x: x in arg_to_iter(attrs)
        BaseSgmlLinkExtractor.__init__(self,
                                       tag=tag_func,
                                       attr=attr_func,

--- a/scrapy/tests/test_contrib_linkextractors.py
+++ b/scrapy/tests/test_contrib_linkextractors.py
@@ -277,6 +277,11 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
            Link(url='http://example.org/page.html', text=u'asd'),
        ])

+        lx = SgmlLinkExtractor(deny_extensions="jpg")
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.org/page.html', text=u'asd'),
+        ])
+
    def test_process_value(self):
        """Test restrict_xpaths with encodings"""
        html = """
@@ -304,6 +309,67 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
                         [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])


+    def test_attrs(self):
+        lx = SgmlLinkExtractor(attrs="href")
+        self.assertEqual(lx.extract_links(self.response), [
+            Link(url='http://example.com/sample1.html', text=u''),
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
+            Link(url='http://www.google.com/something', text=u''),
+            Link(url='http://example.com/innertag.html', text=u'inner tag'),
+        ])
+
+        lx = SgmlLinkExtractor(attrs=("href","src"), tags=("a","area","img"), deny_extensions=())
+        self.assertEqual(lx.extract_links(self.response), [
+            Link(url='http://example.com/sample1.html', text=u''),
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+            Link(url='http://example.com/sample2.jpg', text=u''),
+            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
+            Link(url='http://www.google.com/something', text=u''),
+            Link(url='http://example.com/innertag.html', text=u'inner tag'),
+        ])
+
+        lx = SgmlLinkExtractor(attrs=None)
+        self.assertEqual(lx.extract_links(self.response), [])
+
+        html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
+        response = HtmlResponse("http://example.com/index.html", body=html)
+        lx = SgmlLinkExtractor(attrs=("href"))
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.com/sample1.html', text=u''),
+        ])
+
+
+    def test_tags(self):
+        html = """<html><area href="sample1.html"></area><a href="sample2.html">sample 2</a><img src="sample2.jpg"/></html>"""
+        response = HtmlResponse("http://example.com/index.html", body=html)
+
+        lx = SgmlLinkExtractor(tags=None)
+        self.assertEqual(lx.extract_links(response), [])
+
+        lx = SgmlLinkExtractor()
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.com/sample1.html', text=u''),
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+        ])
+
+        lx = SgmlLinkExtractor(tags="area")
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.com/sample1.html', text=u''),
+        ])
+
+        lx = SgmlLinkExtractor(tags="a")
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+        ])
+
+        lx = SgmlLinkExtractor(tags=("a","img"), attrs=("href", "src"), deny_extensions=())
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+            Link(url='http://example.com/sample2.jpg', text=u''),
+        ])
+
+
 class HtmlParserLinkExtractorTestCase(unittest.TestCase):

    def setUp(self):