提交 8b86e1d0 编写于 作者: D Daniel Grana

Minimize effect of http://bugs.python.org/issue8271 on TextResponses by...

Minimize the effect of http://bugs.python.org/issue8271 on TextResponses by replacing the `replace` error policy passed to str.decode with a custom `replace`-like error handler
上级 3fcd69c3
......@@ -6,14 +6,19 @@ See documentation in docs/topics/request-response.rst
"""
import re
import codecs
from scrapy.xlib.BeautifulSoup import UnicodeDammit
from scrapy.http.response import Response
from scrapy.utils.python import memoizemethod_noargs
from scrapy.utils.encoding import encoding_exists, resolve_encoding
from scrapy.conf import settings
# Python's decoder does not follow the Unicode standard when it handles
# badly encoded UTF-8 strings (see http://bugs.python.org/issue8271), so a
# dedicated error handler is registered under the name 'scrapy_replace'.
def _scrapy_replace(exc):
    """Emit U+FFFD and resume decoding one position after the error start."""
    resume_at = exc.start + 1
    return (u'\ufffd', resume_at)


codecs.register_error('scrapy_replace', _scrapy_replace)
class TextResponse(Response):
_DEFAULT_ENCODING = settings['DEFAULT_RESPONSE_ENCODING']
......@@ -77,7 +82,7 @@ class TextResponse(Response):
# Decodes self.body with self.encoding on first use and keeps the result in
# self._cached_ubody, which is what every call returns.
def body_as_unicode(self):
"""Return body as unicode"""
if self._cached_ubody is None:
# NOTE(review): this text comes from a diff view. The 'replace' line below
# appears to be the removed version and the 'scrapy_replace' line its
# replacement -- confirm against the actual file.
self._cached_ubody = self.body.decode(self.encoding, 'replace')
self._cached_ubody = self.body.decode(self.encoding, 'scrapy_replace')
return self._cached_ubody
@memoizemethod_noargs
......
......@@ -215,7 +215,7 @@ class TextResponseTest(BaseResponseTest):
# Builds a response declared as utf-8 whose body is a UTF-8 BOM, the text
# WORD, and the trailing bytes \xe3\xab, then checks the detected encoding
# and the decoded body.
def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self):
r6 = self.response_class("http://www.example.com", headers={"Content-type": ["text/html; charset=utf-8"]}, body="\xef\xbb\xbfWORD\xe3\xab")
self.assertEqual(r6.encoding, 'utf-8')
# NOTE(review): this text comes from a diff view. The first assertion (one
# replacement char) appears to be the removed expectation and the second
# (two replacement chars) the new one -- confirm against the actual file.
self.assertEqual(r6.body_as_unicode(), u'\ufeffWORD\ufffd')
self.assertEqual(r6.body_as_unicode(), u'\ufeffWORD\ufffd\ufffd')
def test_replace_wrong_encoding(self):
"""Test invalid chars are replaced properly"""
......@@ -223,6 +223,13 @@ class TextResponseTest(BaseResponseTest):
# XXX: Policy for replacing invalid chars may suffer minor variations
# but it should always contain the unicode replacement char (u'\ufffd')
assert u'\ufffd' in r.body_as_unicode(), repr(r.body_as_unicode())
assert u'PREFIX' in r.body_as_unicode(), repr(r.body_as_unicode())
assert u'SUFFIX' in r.body_as_unicode(), repr(r.body_as_unicode())
# Do not destroy html tags due to encoding bugs
r = self.response_class("http://example.com", encoding='utf-8', \
body='\xf0<span>value</span>')
assert u'<span>value</span>' in r.body_as_unicode(), repr(r.body_as_unicode())
# FIXME: This test should pass once we stop using BeautifulSoup's UnicodeDammit in TextResponse
#r = self.response_class("http://www.example.com", body='PREFIX\xe3\xabSUFFIX')
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册