提交 83d5eff0 编写于 作者: P Pablo Hoffman

More refactoring to encoding handling in TextResponse and subclasses

上级 de896fa6
......@@ -19,12 +19,13 @@ class TextResponse(Response):
_DEFAULT_ENCODING = settings['DEFAULT_RESPONSE_ENCODING']
_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)
__slots__ = ['_encoding', '_cached_benc']
__slots__ = ['_encoding', '_cached_benc', '_cached_ubody']
def __init__(self, url, status=200, headers=None, body=None, meta=None, \
flags=None, encoding=None):
self._encoding = encoding
self._cached_benc = None
self._cached_ubody = None
super(TextResponse, self).__init__(url, status, headers, body, meta, flags)
def _get_url(self):
......@@ -57,24 +58,27 @@ class TextResponse(Response):
@property
def encoding(self):
return self._get_encoding(infer=True)
def _get_encoding(self, infer=False):
enc = self._declared_encoding()
if not (enc and encoding_exists(enc)):
enc = self._body_inferred_encoding() or self._DEFAULT_ENCODING
if enc and not encoding_exists(enc):
enc = None
if not enc and infer:
enc = self._body_inferred_encoding()
if not enc:
enc = self._DEFAULT_ENCODING
return resolve_encoding(enc)
def _declared_encoding(self):
return self._encoding or self._headers_encoding() \
or self._body_declared_encoding()
@memoizemethod_noargs
def body_as_unicode(self):
"""Return body as unicode"""
denc = self._declared_encoding()
dencs = [resolve_encoding(denc)] if denc else []
dammit = UnicodeDammit(self.body, dencs)
benc = dammit.originalEncoding
self._cached_benc = benc if benc != 'ascii' else None
return self.body.decode(benc) if benc == 'utf-16' else dammit.unicode
if self._cached_ubody is None:
self._cached_ubody = self.body.decode(self.encoding, 'replace')
return self._cached_ubody
@memoizemethod_noargs
def _headers_encoding(self):
......@@ -88,7 +92,13 @@ class TextResponse(Response):
def _body_inferred_encoding(self):
if self._cached_benc is None:
self.body_as_unicode()
enc = self._get_encoding()
dammit = UnicodeDammit(self.body, [enc])
benc = dammit.originalEncoding
self._cached_benc = benc
# UnicodeDammit is buggy when decoding utf-16, so its output is not cached for that encoding
if self._cached_ubody is None and benc != 'utf-16':
self._cached_ubody = dammit.unicode
return self._cached_benc
def _body_declared_encoding(self):
......
......@@ -171,13 +171,6 @@ class RequestTest(unittest.TestCase):
self.assertEqual(r4.meta, {})
assert r4.dont_filter is False
# __init__ and replace() signatures must be equal unless *args, **kwargs are used
i_args, i_varargs, i_varkwargs, _ = getargspec(self.request_class.__init__)
self.assertFalse(bool(i_varargs) ^ bool(i_varkwargs))
if not i_varargs:
r_args, _, _, _ = getargspec(self.request_class.replace)
self.assertEqual(i_args, r_args)
def test_weakref_slots(self):
"""Check that classes are using slots and are weak-referenceable"""
x = self.request_class('http://www.example.com')
......
......@@ -3,7 +3,6 @@ import weakref
from scrapy.http import Response, TextResponse, HtmlResponse, XmlResponse, Headers
from scrapy.utils.encoding import resolve_encoding
from scrapy.conf import settings
class BaseResponseTest(unittest.TestCase):
......@@ -145,7 +144,7 @@ class TextResponseTest(BaseResponseTest):
def test_unicode_url(self):
# instantiate with unicode url without encoding (should set default encoding)
resp = self.response_class(u"http://www.example.com/")
self._assert_response_encoding(resp, settings['DEFAULT_RESPONSE_ENCODING'])
self._assert_response_encoding(resp, self.response_class._DEFAULT_ENCODING)
# make sure urls are converted to str
resp = self.response_class(url=u"http://www.example.com/", encoding='utf-8')
......@@ -198,6 +197,32 @@ class TextResponseTest(BaseResponseTest):
# TextResponse (and subclasses) must be passed an encoding when instantiating with unicode bodies
self.assertRaises(TypeError, self.response_class, "http://www.example.com", body=u"\xa3")
def test_declared_encoding_invalid(self):
"""Check that unknown declared encodings are ignored"""
r = self.response_class("http://www.example.com", headers={"Content-type": ["text/html; charset=UKNOWN"]}, body="\xc2\xa3")
self.assertEqual(r._declared_encoding(), None)
self._assert_response_values(r, 'utf-8', u"\xa3")
def test_utf16(self):
"""Test utf-16 because UnicodeDammit is known to have problems with"""
r = self.response_class("http://www.example.com", body='\xff\xfeh\x00i\x00', encoding='utf-16')
self._assert_response_values(r, 'utf-16', u"hi")
def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self):
r6 = self.response_class("http://www.example.com", headers={"Content-type": ["text/html; charset=utf-8"]}, body="\xef\xbb\xbfWORD\xe3\xab")
self.assertEqual(r6.encoding, 'utf-8')
self.assertEqual(r6.body_as_unicode(), u'\ufeffWORD\ufffd')
def test_replace_wrong_encoding(self):
"""Test invalid chars are replaced properly"""
# XXX: Policy for replacing invalid chars may change without prior notice
r = self.response_class("http://www.example.com", encoding='utf-8', body='PREFIX\xe3\xabSUFFIX')
assert u'\ufffd' in r.body_as_unicode(), repr(r.body_as_unicode())
# FIXME: This test should pass once we stop using BeautifulSoup's UnicodeDammit in TextResponse
#r = self.response_class("http://www.example.com", body='PREFIX\xe3\xabSUFFIX')
#assert u'\ufffd' in r.body_as_unicode(), repr(r.body_as_unicode())
class HtmlResponseTest(TextResponseTest):
response_class = HtmlResponse
......@@ -239,7 +264,7 @@ class XmlResponseTest(TextResponseTest):
body = "<xml></xml>"
r1 = self.response_class("http://www.example.com", body=body)
self._assert_response_values(r1, settings['DEFAULT_RESPONSE_ENCODING'], body)
self._assert_response_values(r1, self.response_class._DEFAULT_ENCODING, body)
body = """<?xml version="1.0" encoding="iso-8859-1"?><xml></xml>"""
r2 = self.response_class("http://www.example.com", body=body)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册