From 798169805ad13a01c057cda0b66a18605a59a529 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Gra=C3=B1a?= Date: Tue, 28 Feb 2012 14:32:55 -0200 Subject: [PATCH] Adapt response encoding detection to pass test cases --- scrapy/http/response/text.py | 34 +++++++++++++----------------- scrapy/tests/test_http_response.py | 4 ++-- scrapy/utils/encoding.py | 11 ---------- 3 files changed, 17 insertions(+), 32 deletions(-) delete mode 100644 scrapy/utils/encoding.py diff --git a/scrapy/http/response/text.py b/scrapy/http/response/text.py index 00053f5ec..175ba02b3 100644 --- a/scrapy/http/response/text.py +++ b/scrapy/http/response/text.py @@ -6,10 +6,9 @@ See documentation in docs/topics/request-response.rst """ from w3lib.encoding import html_to_unicode, resolve_encoding, \ - html_body_declared_encoding, http_content_type_encoding + html_body_declared_encoding, http_content_type_encoding, to_unicode from scrapy.http.response import Response from scrapy.utils.python import memoizemethod_noargs -from scrapy.utils.encoding import encoding_exists from scrapy.conf import settings @@ -48,17 +47,7 @@ class TextResponse(Response): @property def encoding(self): - return self._get_encoding(infer=True) - - def _get_encoding(self, infer=False): - enc = self._declared_encoding() - if enc and not encoding_exists(enc): - enc = None - if not enc and infer: - enc = self._body_inferred_encoding() - if not enc: - enc = self._DEFAULT_ENCODING - return resolve_encoding(enc) + return self._declared_encoding() or self._body_inferred_encoding() def _declared_encoding(self): return self._encoding or self._headers_encoding() \ @@ -67,7 +56,7 @@ class TextResponse(Response): def body_as_unicode(self): """Return body as unicode""" if self._cached_ubody is None: - self._cached_ubody = self.body.decode(self.encoding, 'scrapy_replace') + self._cached_ubody = to_unicode(self.body, self.encoding) return self._cached_ubody @memoizemethod_noargs @@ -78,14 +67,21 @@ class TextResponse(Response): def _body_inferred_encoding(self): if self._cached_benc is None: content_type = self.headers.get('Content-Type') - benc, _ = html_to_unicode(content_type, self.body, default_encoding=self._DEFAULT_ENCODING) + benc, ubody = html_to_unicode(content_type, self.body, \ + auto_detect_fun=self._auto_detect_fun, \ + default_encoding=self._DEFAULT_ENCODING) self._cached_benc = benc - # XXX: is this needed? - # UnicodeDammit is buggy decoding utf-16 - #if self._cached_ubody is None and benc != 'utf-16': - # self._cached_ubody = dammit.unicode + self._cached_ubody = ubody return self._cached_benc + def _auto_detect_fun(self, text): + for enc in (self._DEFAULT_ENCODING, 'utf-8', 'cp1252'): + try: + text.decode(enc) + except UnicodeError: + continue + return resolve_encoding(enc) + @memoizemethod_noargs def _body_declared_encoding(self): return html_body_declared_encoding(self.body) diff --git a/scrapy/tests/test_http_response.py b/scrapy/tests/test_http_response.py index fdf923996..2d3ca5ecf 100644 --- a/scrapy/tests/test_http_response.py +++ b/scrapy/tests/test_http_response.py @@ -171,8 +171,8 @@ class TextResponseTest(BaseResponseTest): self.assertEqual(r2._headers_encoding(), None) self.assertEqual(r2._declared_encoding(), 'utf-8') self._assert_response_encoding(r2, 'utf-8') - self.assertEqual(r3._headers_encoding(), "iso-8859-1") - self.assertEqual(r3._declared_encoding(), "iso-8859-1") + self.assertEqual(r3._headers_encoding(), "cp1252") + self.assertEqual(r3._declared_encoding(), "cp1252") self.assertEqual(r4._headers_encoding(), None) self.assertEqual(r5._headers_encoding(), None) self._assert_response_encoding(r5, "utf-8") diff --git a/scrapy/utils/encoding.py b/scrapy/utils/encoding.py deleted file mode 100644 index 7e33943cd..000000000 --- a/scrapy/utils/encoding.py +++ /dev/null @@ -1,11 +0,0 @@ -import codecs - -from w3lib.encoding import resolve_encoding - -def encoding_exists(encoding): - """Returns ``True`` if encoding is valid, otherwise returns ``False``""" - try: - codecs.lookup(resolve_encoding(encoding)) - except LookupError: - return False - return True -- GitLab