diff --git a/docs/topics/request-response.rst b/docs/topics/request-response.rst index 3472b1937ad88a295764903b2e11287e117240cc..9f4dcdc46556d68743041946dc4b9c38f45bba97 100644 --- a/docs/topics/request-response.rst +++ b/docs/topics/request-response.rst @@ -466,12 +466,14 @@ TextResponse objects .. attribute:: TextResponse.encoding - A string with the encoding of this response. The encoding is resolved in the - following order: + A string with the encoding of this response. The encoding is resolved by + trying the following mechanisms, in order: 1. the encoding passed in the constructor `encoding` argument - 2. the encoding declared in the Content-Type HTTP header + 2. the encoding declared in the Content-Type HTTP header. If this + encoding is not valid (i.e. unknown), it is ignored and the next + resolution mechanism is tried. 3. the encoding declared in the response body. The TextResponse class doesn't provide any special functionality for this. However, the diff --git a/scrapy/http/response/text.py b/scrapy/http/response/text.py index 1c13e729cfdb2bbab8316600108db287c42de3b3..d42a891972716011538cee0e5b0d676514802196 100644 --- a/scrapy/http/response/text.py +++ b/scrapy/http/response/text.py @@ -5,6 +5,7 @@ discovering (through HTTP headers) to base Response class. 
See documentation in docs/topics/request-response.rst """ +import codecs import re from scrapy.xlib.BeautifulSoup import UnicodeDammit @@ -64,7 +65,12 @@ class TextResponse(Response): if content_type: encoding = self._ENCODING_RE.search(content_type) if encoding: - return encoding.group(1) + enc = encoding.group(1) + try: + codecs.lookup(enc) # check if the encoding is valid + return enc + except LookupError: + pass @memoizemethod_noargs def body_as_unicode(self): diff --git a/scrapy/tests/test_http_response.py b/scrapy/tests/test_http_response.py index 3b0a144d2532c75d00d899d038c232d4deca8d52..5851572ac017410dd13c85b74e276e332b613b4f 100644 --- a/scrapy/tests/test_http_response.py +++ b/scrapy/tests/test_http_response.py @@ -175,6 +175,8 @@ class TextResponseTest(BaseResponseTest): r2 = self.response_class("http://www.example.com", encoding='utf-8', body=u"\xa3") r3 = self.response_class("http://www.example.com", headers={"Content-type": ["text/html; charset=iso-8859-1"]}, body="\xa3") r4 = self.response_class("http://www.example.com", body="\xa2\xa3") + r5 = self.response_class("http://www.example.com", + headers={"Content-type": ["text/html; charset=None"]}, body="\xc2\xa3") self.assertEqual(r1.headers_encoding(), "utf-8") self.assertEqual(r2.headers_encoding(), None) @@ -182,6 +184,8 @@ class TextResponseTest(BaseResponseTest): self.assertEqual(r3.headers_encoding(), "iso-8859-1") self.assertEqual(r3.encoding, 'iso-8859-1') self.assertEqual(r4.headers_encoding(), None) + self.assertEqual(r5.headers_encoding(), None) + self.assertEqual(r5.encoding, "utf-8") assert r4.body_encoding() is not None and r4.body_encoding() != 'ascii' self._assert_response_values(r1, 'utf-8', u"\xa3") self._assert_response_values(r2, 'utf-8', u"\xa3")