提交 180c091f 编写于 作者: P Pablo Hoffman

Fixed encoding issue (reported in #135) when the encoding declared in the HTTP...

Fixed encoding issue (reported in #135) when the encoding declared in the HTTP header is unknown. This is the patch proposed by Rolando, with an update to the Request/Response documentation.
上级 bbef0fe8
......@@ -466,12 +466,14 @@ TextResponse objects
.. attribute:: TextResponse.encoding
A string with the encoding of this response. The encoding is resolved in the
following order:
A string with the encoding of this response. The encoding is resolved by
trying the following mechanisms, in order:
1. the encoding passed in the constructor `encoding` argument
2. the encoding declared in the Content-Type HTTP header
2. the encoding declared in the Content-Type HTTP header. If this
encoding is not valid (i.e. unknown), it is ignored and the next
resolution mechanism is tried.
3. the encoding declared in the response body. The TextResponse class
doesn't provide any special functionality for this. However, the
......
......@@ -5,6 +5,7 @@ discovering (through HTTP headers) to base Response class.
See documentation in docs/topics/request-response.rst
"""
import codecs
import re
from scrapy.xlib.BeautifulSoup import UnicodeDammit
......@@ -64,7 +65,12 @@ class TextResponse(Response):
if content_type:
encoding = self._ENCODING_RE.search(content_type)
if encoding:
return encoding.group(1)
enc = encoding.group(1)
try:
codecs.lookup(enc) # check if the encoding is valid
return enc
except LookupError:
pass
@memoizemethod_noargs
def body_as_unicode(self):
......
......@@ -175,6 +175,8 @@ class TextResponseTest(BaseResponseTest):
r2 = self.response_class("http://www.example.com", encoding='utf-8', body=u"\xa3")
r3 = self.response_class("http://www.example.com", headers={"Content-type": ["text/html; charset=iso-8859-1"]}, body="\xa3")
r4 = self.response_class("http://www.example.com", body="\xa2\xa3")
r5 = self.response_class("http://www.example.com",
headers={"Content-type": ["text/html; charset=None"]}, body="\xc2\xa3")
self.assertEqual(r1.headers_encoding(), "utf-8")
self.assertEqual(r2.headers_encoding(), None)
......@@ -182,6 +184,8 @@ class TextResponseTest(BaseResponseTest):
self.assertEqual(r3.headers_encoding(), "iso-8859-1")
self.assertEqual(r3.encoding, 'iso-8859-1')
self.assertEqual(r4.headers_encoding(), None)
self.assertEqual(r5.headers_encoding(), None)
self.assertEqual(r5.encoding, "utf-8")
assert r4.body_encoding() is not None and r4.body_encoding() != 'ascii'
self._assert_response_values(r1, 'utf-8', u"\xa3")
self._assert_response_values(r2, 'utf-8', u"\xa3")
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册