Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
looyolo
scrapy
提交
83d5eff0
S
scrapy
项目概览
looyolo
/
scrapy
与 Fork 源项目一致
从无法访问的项目Fork
通知
2
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
S
scrapy
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
83d5eff0
编写于
3月 31, 2010
作者:
P
Pablo Hoffman
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
More refactoring to encoding handling in TextResponse and subclasses
上级
de896fa6
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
49 addition
and
21 deletion
+49
-21
scrapy/http/response/text.py
scrapy/http/response/text.py
+21
-11
scrapy/tests/test_http_request.py
scrapy/tests/test_http_request.py
+0
-7
scrapy/tests/test_http_response.py
scrapy/tests/test_http_response.py
+28
-3
未找到文件。
scrapy/http/response/text.py
浏览文件 @
83d5eff0
...
...
@@ -19,12 +19,13 @@ class TextResponse(Response):
_DEFAULT_ENCODING
=
settings
[
'DEFAULT_RESPONSE_ENCODING'
]
_ENCODING_RE
=
re
.
compile
(
r
'charset=([\w-]+)'
,
re
.
I
)
__slots__
=
[
'_encoding'
,
'_cached_benc'
]
__slots__
=
[
'_encoding'
,
'_cached_benc'
,
'_cached_ubody'
]
def
__init__
(
self
,
url
,
status
=
200
,
headers
=
None
,
body
=
None
,
meta
=
None
,
\
flags
=
None
,
encoding
=
None
):
self
.
_encoding
=
encoding
self
.
_cached_benc
=
None
self
.
_cached_ubody
=
None
super
(
TextResponse
,
self
).
__init__
(
url
,
status
,
headers
,
body
,
meta
,
flags
)
def
_get_url
(
self
):
...
...
@@ -57,24 +58,27 @@ class TextResponse(Response):
@
property
def
encoding
(
self
):
return
self
.
_get_encoding
(
infer
=
True
)
def
_get_encoding
(
self
,
infer
=
False
):
enc
=
self
.
_declared_encoding
()
if
not
(
enc
and
encoding_exists
(
enc
)):
enc
=
self
.
_body_inferred_encoding
()
or
self
.
_DEFAULT_ENCODING
if
enc
and
not
encoding_exists
(
enc
):
enc
=
None
if
not
enc
and
infer
:
enc
=
self
.
_body_inferred_encoding
()
if
not
enc
:
enc
=
self
.
_DEFAULT_ENCODING
return
resolve_encoding
(
enc
)
def
_declared_encoding
(
self
):
return
self
.
_encoding
or
self
.
_headers_encoding
()
\
or
self
.
_body_declared_encoding
()
@
memoizemethod_noargs
def
body_as_unicode
(
self
):
"""Return body as unicode"""
denc
=
self
.
_declared_encoding
()
dencs
=
[
resolve_encoding
(
denc
)]
if
denc
else
[]
dammit
=
UnicodeDammit
(
self
.
body
,
dencs
)
benc
=
dammit
.
originalEncoding
self
.
_cached_benc
=
benc
if
benc
!=
'ascii'
else
None
return
self
.
body
.
decode
(
benc
)
if
benc
==
'utf-16'
else
dammit
.
unicode
if
self
.
_cached_ubody
is
None
:
self
.
_cached_ubody
=
self
.
body
.
decode
(
self
.
encoding
,
'replace'
)
return
self
.
_cached_ubody
@
memoizemethod_noargs
def
_headers_encoding
(
self
):
...
...
@@ -88,7 +92,13 @@ class TextResponse(Response):
def
_body_inferred_encoding
(
self
):
if
self
.
_cached_benc
is
None
:
self
.
body_as_unicode
()
enc
=
self
.
_get_encoding
()
dammit
=
UnicodeDammit
(
self
.
body
,
[
enc
])
benc
=
dammit
.
originalEncoding
self
.
_cached_benc
=
benc
# UnicodeDammit is buggy decoding utf-16
if
self
.
_cached_ubody
is
None
and
benc
!=
'utf-16'
:
self
.
_cached_ubody
=
dammit
.
unicode
return
self
.
_cached_benc
def
_body_declared_encoding
(
self
):
...
...
scrapy/tests/test_http_request.py
浏览文件 @
83d5eff0
...
...
@@ -171,13 +171,6 @@ class RequestTest(unittest.TestCase):
self
.
assertEqual
(
r4
.
meta
,
{})
assert
r4
.
dont_filter
is
False
# __init__ and replace() signatures must be equal unles *args,**kwargs is used
i_args
,
i_varargs
,
i_varkwargs
,
_
=
getargspec
(
self
.
request_class
.
__init__
)
self
.
assertFalse
(
bool
(
i_varargs
)
^
bool
(
i_varkwargs
))
if
not
i_varargs
:
r_args
,
_
,
_
,
_
=
getargspec
(
self
.
request_class
.
replace
)
self
.
assertEqual
(
i_args
,
r_args
)
def
test_weakref_slots
(
self
):
"""Check that classes are using slots and are weak-referenceable"""
x
=
self
.
request_class
(
'http://www.example.com'
)
...
...
scrapy/tests/test_http_response.py
浏览文件 @
83d5eff0
...
...
@@ -3,7 +3,6 @@ import weakref
from
scrapy.http
import
Response
,
TextResponse
,
HtmlResponse
,
XmlResponse
,
Headers
from
scrapy.utils.encoding
import
resolve_encoding
from
scrapy.conf
import
settings
class
BaseResponseTest
(
unittest
.
TestCase
):
...
...
@@ -145,7 +144,7 @@ class TextResponseTest(BaseResponseTest):
def
test_unicode_url
(
self
):
# instantiate with unicode url without encoding (should set default encoding)
resp
=
self
.
response_class
(
u
"http://www.example.com/"
)
self
.
_assert_response_encoding
(
resp
,
se
ttings
[
'DEFAULT_RESPONSE_ENCODING'
]
)
self
.
_assert_response_encoding
(
resp
,
se
lf
.
response_class
.
_DEFAULT_ENCODING
)
# make sure urls are converted to str
resp
=
self
.
response_class
(
url
=
u
"http://www.example.com/"
,
encoding
=
'utf-8'
)
...
...
@@ -198,6 +197,32 @@ class TextResponseTest(BaseResponseTest):
# TextResponse (and subclasses) must be passed a encoding when instantiating with unicode bodies
self
.
assertRaises
(
TypeError
,
self
.
response_class
,
"http://www.example.com"
,
body
=
u
"
\xa3
"
)
def
test_declared_encoding_invalid
(
self
):
"""Check that unknown declared encodings are ignored"""
r
=
self
.
response_class
(
"http://www.example.com"
,
headers
=
{
"Content-type"
:
[
"text/html; charset=UKNOWN"
]},
body
=
"
\xc2\xa3
"
)
self
.
assertEqual
(
r
.
_declared_encoding
(),
None
)
self
.
_assert_response_values
(
r
,
'utf-8'
,
u
"
\xa3
"
)
def
test_utf16
(
self
):
"""Test utf-16 because UnicodeDammit is known to have problems with"""
r
=
self
.
response_class
(
"http://www.example.com"
,
body
=
'
\xff\xfe
h
\x00
i
\x00
'
,
encoding
=
'utf-16'
)
self
.
_assert_response_values
(
r
,
'utf-16'
,
u
"hi"
)
def
test_invalid_utf8_encoded_body_with_valid_utf8_BOM
(
self
):
r6
=
self
.
response_class
(
"http://www.example.com"
,
headers
=
{
"Content-type"
:
[
"text/html; charset=utf-8"
]},
body
=
"
\xef\xbb\xbf
WORD
\xe3\xab
"
)
self
.
assertEqual
(
r6
.
encoding
,
'utf-8'
)
self
.
assertEqual
(
r6
.
body_as_unicode
(),
u
'
\ufeff
WORD
\ufffd
'
)
def
test_replace_wrong_encoding
(
self
):
"""Test invalid chars are replaced properly"""
# XXX: Policy for replacing invalid chars may change without prior notice
r
=
self
.
response_class
(
"http://www.example.com"
,
encoding
=
'utf-8'
,
body
=
'PREFIX
\xe3\xab
SUFFIX'
)
assert
u
'
\ufffd
'
in
r
.
body_as_unicode
(),
repr
(
r
.
body_as_unicode
())
# FIXME: This test should pass once we stop using BeautifulSoup's UnicodeDammit in TextResponse
#r = self.response_class("http://www.example.com", body='PREFIX\xe3\xabSUFFIX')
#assert u'\ufffd' in r.body_as_unicode(), repr(r.body_as_unicode())
class
HtmlResponseTest
(
TextResponseTest
):
response_class
=
HtmlResponse
...
...
@@ -239,7 +264,7 @@ class XmlResponseTest(TextResponseTest):
body
=
"<xml></xml>"
r1
=
self
.
response_class
(
"http://www.example.com"
,
body
=
body
)
self
.
_assert_response_values
(
r1
,
se
ttings
[
'DEFAULT_RESPONSE_ENCODING'
]
,
body
)
self
.
_assert_response_values
(
r1
,
se
lf
.
response_class
.
_DEFAULT_ENCODING
,
body
)
body
=
"""<?xml version="1.0" encoding="iso-8859-1"?><xml></xml>"""
r2
=
self
.
response_class
(
"http://www.example.com"
,
body
=
body
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录