Minimize effect of http://bugs.python.org/issue8271 on TextResponses by...

Minimize the effect of http://bugs.python.org/issue8271 on TextResponses by switching the str.decode error policy to a custom `replace`-like error handler

Minimize effect of http://bugs.python.org/issue8271 on TextResponses by...
Minimize the effect of http://bugs.python.org/issue8271 on TextResponses by switching the str.decode error policy to a custom `replace`-like error handler
8b86e1d0 · Daniel Grana · 3fcd69c3 · 8b86e1d0 · 8b86e1d0
隐藏空白更改
内联并排

Showing 2 changed files with 16 additions and 4 deletions

scrapy/http/response/text.py scrapy/http/response/text.py +8 -3

scrapy/tests/test_http_response.py scrapy/tests/test_http_response.py +8 -1

未找到文件。
--- a/scrapy/http/response/text.py
+++ b/scrapy/http/response/text.py
@@ -6,14 +6,19 @@ See documentation in docs/topics/request-response.rst
 """

 import re
-
+import codecs
 from scrapy.xlib.BeautifulSoup import UnicodeDammit
-
 from scrapy.http.response import Response
 from scrapy.utils.python import memoizemethod_noargs
 from scrapy.utils.encoding import encoding_exists, resolve_encoding
 from scrapy.conf import settings

+
+# Python decoder doesn't follow unicode standard when handling
+# bad utf-8 encoded strings. see http://bugs.python.org/issue8271
+codecs.register_error('scrapy_replace', lambda exc: (u'\ufffd', exc.start+1))
+
+
 class TextResponse(Response):

    _DEFAULT_ENCODING = settings['DEFAULT_RESPONSE_ENCODING']
@@ -77,7 +82,7 @@ class TextResponse(Response):
    def body_as_unicode(self):
        """Return body as unicode"""
        if self._cached_ubody is None:
-            self._cached_ubody = self.body.decode(self.encoding, 'replace')
+            self._cached_ubody = self.body.decode(self.encoding, 'scrapy_replace')
        return self._cached_ubody

    @memoizemethod_noargs

--- a/scrapy/tests/test_http_response.py
+++ b/scrapy/tests/test_http_response.py
@@ -215,7 +215,7 @@ class TextResponseTest(BaseResponseTest):
    def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self):
        r6 = self.response_class("http://www.example.com", headers={"Content-type": ["text/html; charset=utf-8"]}, body="\xef\xbb\xbfWORD\xe3\xab")
        self.assertEqual(r6.encoding, 'utf-8')
-        self.assertEqual(r6.body_as_unicode(), u'\ufeffWORD\ufffd')
+        self.assertEqual(r6.body_as_unicode(), u'\ufeffWORD\ufffd\ufffd')

    def test_replace_wrong_encoding(self):
        """Test invalid chars are replaced properly"""
@@ -223,6 +223,13 @@ class TextResponseTest(BaseResponseTest):
        # XXX: Policy for replacing invalid chars may suffer minor variations
        # but it should always contain the unicode replacement char (u'\ufffd')
        assert u'\ufffd' in r.body_as_unicode(), repr(r.body_as_unicode())
+        assert u'PREFIX' in r.body_as_unicode(), repr(r.body_as_unicode())
+        assert u'SUFFIX' in r.body_as_unicode(), repr(r.body_as_unicode())
+
+        # Do not destroy html tags due to encoding bugs
+        r = self.response_class("http://example.com", encoding='utf-8', \
+                body='\xf0<span>value</span>')
+        assert u'<span>value</span>' in r.body_as_unicode(), repr(r.body_as_unicode())

        # FIXME: This test should pass once we stop using BeautifulSoup's UnicodeDammit in TextResponse
        #r = self.response_class("http://www.example.com", body='PREFIX\xe3\xabSUFFIX')