diff --git a/scrapy/http/response/text.py b/scrapy/http/response/text.py
index c294ade639a035785540eac4f25cc7621dc8ea4b..59f44fb5666f566157800c4f873d97366dce5786 100644
--- a/scrapy/http/response/text.py
+++ b/scrapy/http/response/text.py
@@ -6,14 +6,19 @@ See documentation in docs/topics/request-response.rst
"""
import re
-
+import codecs
from scrapy.xlib.BeautifulSoup import UnicodeDammit
-
from scrapy.http.response import Response
from scrapy.utils.python import memoizemethod_noargs
from scrapy.utils.encoding import encoding_exists, resolve_encoding
from scrapy.conf import settings
+
+# The Python decoder doesn't follow the Unicode standard when handling
+# invalid UTF-8 byte sequences; see http://bugs.python.org/issue8271
+codecs.register_error('scrapy_replace', lambda exc: (u'\ufffd', exc.start+1))
+
+
class TextResponse(Response):
_DEFAULT_ENCODING = settings['DEFAULT_RESPONSE_ENCODING']
@@ -77,7 +82,7 @@ class TextResponse(Response):
def body_as_unicode(self):
"""Return body as unicode"""
if self._cached_ubody is None:
- self._cached_ubody = self.body.decode(self.encoding, 'replace')
+ self._cached_ubody = self.body.decode(self.encoding, 'scrapy_replace')
return self._cached_ubody
@memoizemethod_noargs
diff --git a/scrapy/tests/test_http_response.py b/scrapy/tests/test_http_response.py
index 87138219b5442aabfc5ac2420fc6aa34ace2269f..437d75b98f4d5473bd6eadd29ba699524e9b58c4 100644
--- a/scrapy/tests/test_http_response.py
+++ b/scrapy/tests/test_http_response.py
@@ -215,7 +215,7 @@ class TextResponseTest(BaseResponseTest):
def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self):
r6 = self.response_class("http://www.example.com", headers={"Content-type": ["text/html; charset=utf-8"]}, body="\xef\xbb\xbfWORD\xe3\xab")
self.assertEqual(r6.encoding, 'utf-8')
- self.assertEqual(r6.body_as_unicode(), u'\ufeffWORD\ufffd')
+ self.assertEqual(r6.body_as_unicode(), u'\ufeffWORD\ufffd\ufffd')
def test_replace_wrong_encoding(self):
"""Test invalid chars are replaced properly"""
@@ -223,6 +223,13 @@ class TextResponseTest(BaseResponseTest):
# XXX: Policy for replacing invalid chars may suffer minor variations
# but it should always contain the unicode replacement char (u'\ufffd')
assert u'\ufffd' in r.body_as_unicode(), repr(r.body_as_unicode())
+ assert u'PREFIX' in r.body_as_unicode(), repr(r.body_as_unicode())
+ assert u'SUFFIX' in r.body_as_unicode(), repr(r.body_as_unicode())
+
+ # Do not destroy html tags due to encoding bugs
+ r = self.response_class("http://example.com", encoding='utf-8', \
+ body='\xf0value')
+ assert u'value' in r.body_as_unicode(), repr(r.body_as_unicode())
# FIXME: This test should pass once we stop using BeautifulSoup's UnicodeDammit in TextResponse
#r = self.response_class("http://www.example.com", body='PREFIX\xe3\xabSUFFIX')