From c2a424daaeca851c9d4a6b930eabf2a0422fdfe3 Mon Sep 17 00:00:00 2001 From: Claudio Salazar Date: Sat, 5 Apr 2014 00:13:27 +0800 Subject: [PATCH] Fixed XML selector against XXE attacks --- scrapy/selector/unified.py | 7 ++++++- scrapy/tests/test_selector.py | 10 ++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/scrapy/selector/unified.py b/scrapy/selector/unified.py index babb65070..60598a24d 100644 --- a/scrapy/selector/unified.py +++ b/scrapy/selector/unified.py @@ -15,11 +15,16 @@ from .csstranslator import ScrapyHTMLTranslator, ScrapyGenericTranslator __all__ = ['Selector', 'SelectorList'] + +class SafeXMLParser(etree.XMLParser): + def __init__(self, *args, **kwargs): + super(SafeXMLParser, self).__init__(*args, resolve_entities=False, **kwargs) + _ctgroup = { 'html': {'_parser': etree.HTMLParser, '_csstranslator': ScrapyHTMLTranslator(), '_tostring_method': 'html'}, - 'xml': {'_parser': etree.XMLParser, + 'xml': {'_parser': SafeXMLParser, '_csstranslator': ScrapyGenericTranslator(), '_tostring_method': 'xml'}, } diff --git a/scrapy/tests/test_selector.py b/scrapy/tests/test_selector.py index 489a163a0..95a44be27 100644 --- a/scrapy/tests/test_selector.py +++ b/scrapy/tests/test_selector.py @@ -332,6 +332,16 @@ class SelectorTestCase(unittest.TestCase): div_class = x.xpath('//div/@class') self.assertTrue(all(map(lambda e: hasattr(e._root, 'getparent'), div_class))) + def test_xml_entity_expansion(self): + malicious_xml = '<?xml version="1.0" encoding="ISO-8859-1"?>'\ + '<!DOCTYPE foo [ <!ELEMENT foo ANY > <!ENTITY xxe SYSTEM '\ + '"file:///etc/passwd" >]><foo>&xxe;</foo>' + + response = XmlResponse('http://example.com', body=malicious_xml) + sel = self.sscls(response=response) + + self.assertEqual(sel.extract(), '<foo>&xxe;</foo>') + class DeprecatedXpathSelectorTest(unittest.TestCase): -- GitLab