diff --git a/scrapy/http/response/text.py b/scrapy/http/response/text.py
index c294ade639a035785540eac4f25cc7621dc8ea4b..59f44fb5666f566157800c4f873d97366dce5786 100644
--- a/scrapy/http/response/text.py
+++ b/scrapy/http/response/text.py
@@ -6,14 +6,19 @@ See documentation in docs/topics/request-response.rst
 """
 
 import re
-
+import codecs
 from scrapy.xlib.BeautifulSoup import UnicodeDammit
-
 from scrapy.http.response import Response
 from scrapy.utils.python import memoizemethod_noargs
 from scrapy.utils.encoding import encoding_exists, resolve_encoding
 from scrapy.conf import settings
 
+
+# Python decoder doesn't follow unicode standard when handling
+# bad utf-8 encoded strings. see http://bugs.python.org/issue8271
+codecs.register_error('scrapy_replace', lambda exc: (u'\ufffd', exc.start+1))
+
+
 class TextResponse(Response):
 
     _DEFAULT_ENCODING = settings['DEFAULT_RESPONSE_ENCODING']
@@ -77,7 +82,7 @@ class TextResponse(Response):
     def body_as_unicode(self):
         """Return body as unicode"""
         if self._cached_ubody is None:
-            self._cached_ubody = self.body.decode(self.encoding, 'replace')
+            self._cached_ubody = self.body.decode(self.encoding, 'scrapy_replace')
         return self._cached_ubody
 
     @memoizemethod_noargs
diff --git a/scrapy/tests/test_http_response.py b/scrapy/tests/test_http_response.py
index 87138219b5442aabfc5ac2420fc6aa34ace2269f..437d75b98f4d5473bd6eadd29ba699524e9b58c4 100644
--- a/scrapy/tests/test_http_response.py
+++ b/scrapy/tests/test_http_response.py
@@ -215,7 +215,7 @@ class TextResponseTest(BaseResponseTest):
     def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self):
         r6 = self.response_class("http://www.example.com", headers={"Content-type": ["text/html; charset=utf-8"]}, body="\xef\xbb\xbfWORD\xe3\xab")
         self.assertEqual(r6.encoding, 'utf-8')
-        self.assertEqual(r6.body_as_unicode(), u'\ufeffWORD\ufffd')
+        self.assertEqual(r6.body_as_unicode(), u'\ufeffWORD\ufffd\ufffd')
 
     def test_replace_wrong_encoding(self):
         """Test invalid chars are replaced properly"""
@@ -223,6 +223,13 @@ class TextResponseTest(BaseResponseTest):
         # XXX: Policy for replacing invalid chars may suffer minor variations
         # but it should always contain the unicode replacement char (u'\ufffd')
         assert u'\ufffd' in r.body_as_unicode(), repr(r.body_as_unicode())
+        assert u'PREFIX' in r.body_as_unicode(), repr(r.body_as_unicode())
+        assert u'SUFFIX' in r.body_as_unicode(), repr(r.body_as_unicode())
+
+        # Do not destroy html tags due to encoding bugs
+        r = self.response_class("http://example.com", encoding='utf-8', \
+            body='\xf0value')
+        assert u'value' in r.body_as_unicode(), repr(r.body_as_unicode())
 
         # FIXME: This test should pass once we stop using BeautifulSoup's UnicodeDammit in TextResponse
         #r = self.response_class("http://www.example.com", body='PREFIX\xe3\xabSUFFIX')
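
Below is a minimal illustrative sketch (not part of the patch) of what the error handler registered in text.py does. It returns U+FFFD for the offending byte and resumes decoding at exc.start + 1, so the valid bytes that follow a bad one (for example HTML tags) are kept; per the linked bug report, the built-in 'replace' handler on affected interpreters may consume some of those following bytes instead.

# Illustration only -- mirrors the handler added above (Python 2 style, as in the patch)
import codecs

codecs.register_error('scrapy_replace', lambda exc: (u'\ufffd', exc.start + 1))

# An invalid UTF-8 start byte followed by valid ASCII, as in the new test case:
decoded = b'\xf0value'.decode('utf-8', 'scrapy_replace')
assert decoded == u'\ufffdvalue'  # only the bad byte is replaced; 'value' survives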