Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
looyolo
scrapy
提交
12f2006b
S
scrapy
项目概览
looyolo
/
scrapy
与 Fork 源项目一致
从无法访问的项目Fork
通知
2
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
S
scrapy
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
12f2006b
编写于
4月 01, 2021
作者:
M
Mikhail Korobov
提交者:
GitHub
4月 01, 2021
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #4799 from GeorgeA92/patch-2
httpcompression stats added
上级
f0c8d311
f3064254
变更
2
显示空白变更内容
内联
并排
Showing
2 changed file
with
124 addition
and
7 deletion
+124
-7
scrapy/downloadermiddlewares/httpcompression.py
scrapy/downloadermiddlewares/httpcompression.py
+22
-3
tests/test_downloadermiddleware_httpcompression.py
tests/test_downloadermiddleware_httpcompression.py
+102
-4
未找到文件。
scrapy/downloadermiddlewares/httpcompression.py
浏览文件 @
12f2006b
import
io
import
io
import
warnings
import
zlib
import
zlib
from
scrapy.
utils.gz
import
gunzip
from
scrapy.
exceptions
import
NotConfigured
from
scrapy.http
import
Response
,
TextResponse
from
scrapy.http
import
Response
,
TextResponse
from
scrapy.responsetypes
import
responsetypes
from
scrapy.responsetypes
import
responsetypes
from
scrapy.exceptions
import
NotConfigured
from
scrapy.utils.deprecate
import
ScrapyDeprecationWarning
from
scrapy.utils.gz
import
gunzip
ACCEPTED_ENCODINGS
=
[
b
'gzip'
,
b
'deflate'
]
ACCEPTED_ENCODINGS
=
[
b
'gzip'
,
b
'deflate'
]
...
@@ -25,11 +27,25 @@ except ImportError:
...
@@ -25,11 +27,25 @@ except ImportError:
class
HttpCompressionMiddleware
:
class
HttpCompressionMiddleware
:
"""This middleware allows compressed (gzip, deflate) traffic to be
"""This middleware allows compressed (gzip, deflate) traffic to be
sent/received from web sites"""
sent/received from web sites"""
def
__init__
(
self
,
stats
=
None
):
self
.
stats
=
stats
@
classmethod
@
classmethod
def
from_crawler
(
cls
,
crawler
):
def
from_crawler
(
cls
,
crawler
):
if
not
crawler
.
settings
.
getbool
(
'COMPRESSION_ENABLED'
):
if
not
crawler
.
settings
.
getbool
(
'COMPRESSION_ENABLED'
):
raise
NotConfigured
raise
NotConfigured
return
cls
()
try
:
return
cls
(
stats
=
crawler
.
stats
)
except
TypeError
:
warnings
.
warn
(
"HttpCompressionMiddleware subclasses must either modify "
"their '__init__' method to support a 'stats' parameter or "
"reimplement the 'from_crawler' method."
,
ScrapyDeprecationWarning
,
)
result
=
cls
()
result
.
stats
=
crawler
.
stats
return
result
def
process_request
(
self
,
request
,
spider
):
def
process_request
(
self
,
request
,
spider
):
request
.
headers
.
setdefault
(
'Accept-Encoding'
,
request
.
headers
.
setdefault
(
'Accept-Encoding'
,
...
@@ -44,6 +60,9 @@ class HttpCompressionMiddleware:
...
@@ -44,6 +60,9 @@ class HttpCompressionMiddleware:
if
content_encoding
:
if
content_encoding
:
encoding
=
content_encoding
.
pop
()
encoding
=
content_encoding
.
pop
()
decoded_body
=
self
.
_decode
(
response
.
body
,
encoding
.
lower
())
decoded_body
=
self
.
_decode
(
response
.
body
,
encoding
.
lower
())
if
self
.
stats
:
self
.
stats
.
inc_value
(
'httpcompression/response_bytes'
,
len
(
decoded_body
),
spider
=
spider
)
self
.
stats
.
inc_value
(
'httpcompression/response_count'
,
spider
=
spider
)
respcls
=
responsetypes
.
from_args
(
respcls
=
responsetypes
.
from_args
(
headers
=
response
.
headers
,
url
=
response
.
url
,
body
=
decoded_body
headers
=
response
.
headers
,
url
=
response
.
url
,
body
=
decoded_body
)
)
...
...
tests/test_downloadermiddleware_httpcompression.py
浏览文件 @
12f2006b
from
gzip
import
GzipFile
from
io
import
BytesIO
from
io
import
BytesIO
from
unittest
import
TestCase
,
SkipTest
from
os.path
import
join
from
os.path
import
join
from
gzip
import
GzipFile
from
unittest
import
TestCase
,
SkipTest
from
warnings
import
catch_warnings
from
scrapy.spiders
import
Spider
from
scrapy.spiders
import
Spider
from
scrapy.http
import
Response
,
Request
,
HtmlResponse
from
scrapy.http
import
Response
,
Request
,
HtmlResponse
from
scrapy.downloadermiddlewares.httpcompression
import
HttpCompressionMiddleware
,
ACCEPTED_ENCODINGS
from
scrapy.downloadermiddlewares.httpcompression
import
HttpCompressionMiddleware
,
ACCEPTED_ENCODINGS
from
scrapy.exceptions
import
NotConfigured
,
ScrapyDeprecationWarning
from
scrapy.responsetypes
import
responsetypes
from
scrapy.responsetypes
import
responsetypes
from
scrapy.utils.gz
import
gunzip
from
scrapy.utils.gz
import
gunzip
from
scrapy.utils.test
import
get_crawler
from
tests
import
tests_datadir
from
tests
import
tests_datadir
from
w3lib.encoding
import
resolve_encoding
from
w3lib.encoding
import
resolve_encoding
...
@@ -32,8 +35,10 @@ FORMAT = {
...
@@ -32,8 +35,10 @@ FORMAT = {
class
HttpCompressionTest
(
TestCase
):
class
HttpCompressionTest
(
TestCase
):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
spider
=
Spider
(
'foo'
)
self
.
crawler
=
get_crawler
(
Spider
)
self
.
mw
=
HttpCompressionMiddleware
()
self
.
spider
=
self
.
crawler
.
_create_spider
(
'scrapytest.org'
)
self
.
mw
=
HttpCompressionMiddleware
.
from_crawler
(
self
.
crawler
)
self
.
crawler
.
stats
.
open_spider
(
self
.
spider
)
def
_getresponse
(
self
,
coding
):
def
_getresponse
(
self
,
coding
):
if
coding
not
in
FORMAT
:
if
coding
not
in
FORMAT
:
...
@@ -56,6 +61,34 @@ class HttpCompressionTest(TestCase):
...
@@ -56,6 +61,34 @@ class HttpCompressionTest(TestCase):
response
.
request
=
Request
(
'http://scrapytest.org'
,
headers
=
{
'Accept-Encoding'
:
'gzip, deflate'
})
response
.
request
=
Request
(
'http://scrapytest.org'
,
headers
=
{
'Accept-Encoding'
:
'gzip, deflate'
})
return
response
return
response
def
assertStatsEqual
(
self
,
key
,
value
):
self
.
assertEqual
(
self
.
crawler
.
stats
.
get_value
(
key
,
spider
=
self
.
spider
),
value
,
str
(
self
.
crawler
.
stats
.
get_stats
(
self
.
spider
))
)
def
test_setting_false_compression_enabled
(
self
):
self
.
assertRaises
(
NotConfigured
,
HttpCompressionMiddleware
.
from_crawler
,
get_crawler
(
settings_dict
=
{
'COMPRESSION_ENABLED'
:
False
})
)
def
test_setting_default_compression_enabled
(
self
):
self
.
assertIsInstance
(
HttpCompressionMiddleware
.
from_crawler
(
get_crawler
()),
HttpCompressionMiddleware
)
def
test_setting_true_compression_enabled
(
self
):
self
.
assertIsInstance
(
HttpCompressionMiddleware
.
from_crawler
(
get_crawler
(
settings_dict
=
{
'COMPRESSION_ENABLED'
:
True
})
),
HttpCompressionMiddleware
)
def
test_process_request
(
self
):
def
test_process_request
(
self
):
request
=
Request
(
'http://scrapytest.org'
)
request
=
Request
(
'http://scrapytest.org'
)
assert
'Accept-Encoding'
not
in
request
.
headers
assert
'Accept-Encoding'
not
in
request
.
headers
...
@@ -72,6 +105,20 @@ class HttpCompressionTest(TestCase):
...
@@ -72,6 +105,20 @@ class HttpCompressionTest(TestCase):
assert
newresponse
is
not
response
assert
newresponse
is
not
response
assert
newresponse
.
body
.
startswith
(
b
'<!DOCTYPE'
)
assert
newresponse
.
body
.
startswith
(
b
'<!DOCTYPE'
)
assert
'Content-Encoding'
not
in
newresponse
.
headers
assert
'Content-Encoding'
not
in
newresponse
.
headers
self
.
assertStatsEqual
(
'httpcompression/response_count'
,
1
)
self
.
assertStatsEqual
(
'httpcompression/response_bytes'
,
74837
)
def
test_process_response_gzip_no_stats
(
self
):
mw
=
HttpCompressionMiddleware
()
response
=
self
.
_getresponse
(
'gzip'
)
request
=
response
.
request
self
.
assertEqual
(
response
.
headers
[
'Content-Encoding'
],
b
'gzip'
)
newresponse
=
mw
.
process_response
(
request
,
response
,
self
.
spider
)
self
.
assertEqual
(
mw
.
stats
,
None
)
assert
newresponse
is
not
response
assert
newresponse
.
body
.
startswith
(
b
'<!DOCTYPE'
)
assert
'Content-Encoding'
not
in
newresponse
.
headers
def
test_process_response_br
(
self
):
def
test_process_response_br
(
self
):
try
:
try
:
...
@@ -85,6 +132,8 @@ class HttpCompressionTest(TestCase):
...
@@ -85,6 +132,8 @@ class HttpCompressionTest(TestCase):
assert
newresponse
is
not
response
assert
newresponse
is
not
response
assert
newresponse
.
body
.
startswith
(
b
"<!DOCTYPE"
)
assert
newresponse
.
body
.
startswith
(
b
"<!DOCTYPE"
)
assert
'Content-Encoding'
not
in
newresponse
.
headers
assert
'Content-Encoding'
not
in
newresponse
.
headers
self
.
assertStatsEqual
(
'httpcompression/response_count'
,
1
)
self
.
assertStatsEqual
(
'httpcompression/response_bytes'
,
74837
)
def
test_process_response_zstd
(
self
):
def
test_process_response_zstd
(
self
):
try
:
try
:
...
@@ -116,6 +165,8 @@ class HttpCompressionTest(TestCase):
...
@@ -116,6 +165,8 @@ class HttpCompressionTest(TestCase):
assert
newresponse
is
not
response
assert
newresponse
is
not
response
assert
newresponse
.
body
.
startswith
(
b
'<!DOCTYPE'
)
assert
newresponse
.
body
.
startswith
(
b
'<!DOCTYPE'
)
assert
'Content-Encoding'
not
in
newresponse
.
headers
assert
'Content-Encoding'
not
in
newresponse
.
headers
self
.
assertStatsEqual
(
'httpcompression/response_count'
,
1
)
self
.
assertStatsEqual
(
'httpcompression/response_bytes'
,
74840
)
def
test_process_response_zlibdelate
(
self
):
def
test_process_response_zlibdelate
(
self
):
response
=
self
.
_getresponse
(
'zlibdeflate'
)
response
=
self
.
_getresponse
(
'zlibdeflate'
)
...
@@ -126,6 +177,8 @@ class HttpCompressionTest(TestCase):
...
@@ -126,6 +177,8 @@ class HttpCompressionTest(TestCase):
assert
newresponse
is
not
response
assert
newresponse
is
not
response
assert
newresponse
.
body
.
startswith
(
b
'<!DOCTYPE'
)
assert
newresponse
.
body
.
startswith
(
b
'<!DOCTYPE'
)
assert
'Content-Encoding'
not
in
newresponse
.
headers
assert
'Content-Encoding'
not
in
newresponse
.
headers
self
.
assertStatsEqual
(
'httpcompression/response_count'
,
1
)
self
.
assertStatsEqual
(
'httpcompression/response_bytes'
,
74840
)
def
test_process_response_plain
(
self
):
def
test_process_response_plain
(
self
):
response
=
Response
(
'http://scrapytest.org'
,
body
=
b
'<!DOCTYPE...'
)
response
=
Response
(
'http://scrapytest.org'
,
body
=
b
'<!DOCTYPE...'
)
...
@@ -135,6 +188,8 @@ class HttpCompressionTest(TestCase):
...
@@ -135,6 +188,8 @@ class HttpCompressionTest(TestCase):
newresponse
=
self
.
mw
.
process_response
(
request
,
response
,
self
.
spider
)
newresponse
=
self
.
mw
.
process_response
(
request
,
response
,
self
.
spider
)
assert
newresponse
is
response
assert
newresponse
is
response
assert
newresponse
.
body
.
startswith
(
b
'<!DOCTYPE'
)
assert
newresponse
.
body
.
startswith
(
b
'<!DOCTYPE'
)
self
.
assertStatsEqual
(
'httpcompression/response_count'
,
None
)
self
.
assertStatsEqual
(
'httpcompression/response_bytes'
,
None
)
def
test_multipleencodings
(
self
):
def
test_multipleencodings
(
self
):
response
=
self
.
_getresponse
(
'gzip'
)
response
=
self
.
_getresponse
(
'gzip'
)
...
@@ -162,6 +217,8 @@ class HttpCompressionTest(TestCase):
...
@@ -162,6 +217,8 @@ class HttpCompressionTest(TestCase):
assert
isinstance
(
newresponse
,
HtmlResponse
)
assert
isinstance
(
newresponse
,
HtmlResponse
)
self
.
assertEqual
(
newresponse
.
body
,
plainbody
)
self
.
assertEqual
(
newresponse
.
body
,
plainbody
)
self
.
assertEqual
(
newresponse
.
encoding
,
resolve_encoding
(
'gb2312'
))
self
.
assertEqual
(
newresponse
.
encoding
,
resolve_encoding
(
'gb2312'
))
self
.
assertStatsEqual
(
'httpcompression/response_count'
,
1
)
self
.
assertStatsEqual
(
'httpcompression/response_bytes'
,
104
)
def
test_process_response_force_recalculate_encoding
(
self
):
def
test_process_response_force_recalculate_encoding
(
self
):
headers
=
{
headers
=
{
...
@@ -181,6 +238,8 @@ class HttpCompressionTest(TestCase):
...
@@ -181,6 +238,8 @@ class HttpCompressionTest(TestCase):
assert
isinstance
(
newresponse
,
HtmlResponse
)
assert
isinstance
(
newresponse
,
HtmlResponse
)
self
.
assertEqual
(
newresponse
.
body
,
plainbody
)
self
.
assertEqual
(
newresponse
.
body
,
plainbody
)
self
.
assertEqual
(
newresponse
.
encoding
,
resolve_encoding
(
'gb2312'
))
self
.
assertEqual
(
newresponse
.
encoding
,
resolve_encoding
(
'gb2312'
))
self
.
assertStatsEqual
(
'httpcompression/response_count'
,
1
)
self
.
assertStatsEqual
(
'httpcompression/response_bytes'
,
104
)
def
test_process_response_no_content_type_header
(
self
):
def
test_process_response_no_content_type_header
(
self
):
headers
=
{
headers
=
{
...
@@ -196,6 +255,8 @@ class HttpCompressionTest(TestCase):
...
@@ -196,6 +255,8 @@ class HttpCompressionTest(TestCase):
assert
isinstance
(
newresponse
,
respcls
)
assert
isinstance
(
newresponse
,
respcls
)
self
.
assertEqual
(
newresponse
.
body
,
plainbody
)
self
.
assertEqual
(
newresponse
.
body
,
plainbody
)
self
.
assertEqual
(
newresponse
.
encoding
,
resolve_encoding
(
'gb2312'
))
self
.
assertEqual
(
newresponse
.
encoding
,
resolve_encoding
(
'gb2312'
))
self
.
assertStatsEqual
(
'httpcompression/response_count'
,
1
)
self
.
assertStatsEqual
(
'httpcompression/response_bytes'
,
104
)
def
test_process_response_gzipped_contenttype
(
self
):
def
test_process_response_gzipped_contenttype
(
self
):
response
=
self
.
_getresponse
(
'gzip'
)
response
=
self
.
_getresponse
(
'gzip'
)
...
@@ -206,6 +267,8 @@ class HttpCompressionTest(TestCase):
...
@@ -206,6 +267,8 @@ class HttpCompressionTest(TestCase):
self
.
assertIsNot
(
newresponse
,
response
)
self
.
assertIsNot
(
newresponse
,
response
)
self
.
assertTrue
(
newresponse
.
body
.
startswith
(
b
'<!DOCTYPE'
))
self
.
assertTrue
(
newresponse
.
body
.
startswith
(
b
'<!DOCTYPE'
))
self
.
assertNotIn
(
'Content-Encoding'
,
newresponse
.
headers
)
self
.
assertNotIn
(
'Content-Encoding'
,
newresponse
.
headers
)
self
.
assertStatsEqual
(
'httpcompression/response_count'
,
1
)
self
.
assertStatsEqual
(
'httpcompression/response_bytes'
,
74837
)
def
test_process_response_gzip_app_octetstream_contenttype
(
self
):
def
test_process_response_gzip_app_octetstream_contenttype
(
self
):
response
=
self
.
_getresponse
(
'gzip'
)
response
=
self
.
_getresponse
(
'gzip'
)
...
@@ -216,6 +279,8 @@ class HttpCompressionTest(TestCase):
...
@@ -216,6 +279,8 @@ class HttpCompressionTest(TestCase):
self
.
assertIsNot
(
newresponse
,
response
)
self
.
assertIsNot
(
newresponse
,
response
)
self
.
assertTrue
(
newresponse
.
body
.
startswith
(
b
'<!DOCTYPE'
))
self
.
assertTrue
(
newresponse
.
body
.
startswith
(
b
'<!DOCTYPE'
))
self
.
assertNotIn
(
'Content-Encoding'
,
newresponse
.
headers
)
self
.
assertNotIn
(
'Content-Encoding'
,
newresponse
.
headers
)
self
.
assertStatsEqual
(
'httpcompression/response_count'
,
1
)
self
.
assertStatsEqual
(
'httpcompression/response_bytes'
,
74837
)
def
test_process_response_gzip_binary_octetstream_contenttype
(
self
):
def
test_process_response_gzip_binary_octetstream_contenttype
(
self
):
response
=
self
.
_getresponse
(
'x-gzip'
)
response
=
self
.
_getresponse
(
'x-gzip'
)
...
@@ -226,6 +291,8 @@ class HttpCompressionTest(TestCase):
...
@@ -226,6 +291,8 @@ class HttpCompressionTest(TestCase):
self
.
assertIsNot
(
newresponse
,
response
)
self
.
assertIsNot
(
newresponse
,
response
)
self
.
assertTrue
(
newresponse
.
body
.
startswith
(
b
'<!DOCTYPE'
))
self
.
assertTrue
(
newresponse
.
body
.
startswith
(
b
'<!DOCTYPE'
))
self
.
assertNotIn
(
'Content-Encoding'
,
newresponse
.
headers
)
self
.
assertNotIn
(
'Content-Encoding'
,
newresponse
.
headers
)
self
.
assertStatsEqual
(
'httpcompression/response_count'
,
1
)
self
.
assertStatsEqual
(
'httpcompression/response_bytes'
,
74837
)
def
test_process_response_gzipped_gzip_file
(
self
):
def
test_process_response_gzipped_gzip_file
(
self
):
"""Test that a gzip Content-Encoded .gz file is gunzipped
"""Test that a gzip Content-Encoded .gz file is gunzipped
...
@@ -268,6 +335,8 @@ class HttpCompressionTest(TestCase):
...
@@ -268,6 +335,8 @@ class HttpCompressionTest(TestCase):
newresponse
=
self
.
mw
.
process_response
(
request
,
response
,
self
.
spider
)
newresponse
=
self
.
mw
.
process_response
(
request
,
response
,
self
.
spider
)
self
.
assertEqual
(
gunzip
(
newresponse
.
body
),
plainbody
)
self
.
assertEqual
(
gunzip
(
newresponse
.
body
),
plainbody
)
self
.
assertStatsEqual
(
'httpcompression/response_count'
,
1
)
self
.
assertStatsEqual
(
'httpcompression/response_bytes'
,
230
)
def
test_process_response_head_request_no_decode_required
(
self
):
def
test_process_response_head_request_no_decode_required
(
self
):
response
=
self
.
_getresponse
(
'gzip'
)
response
=
self
.
_getresponse
(
'gzip'
)
...
@@ -278,3 +347,32 @@ class HttpCompressionTest(TestCase):
...
@@ -278,3 +347,32 @@ class HttpCompressionTest(TestCase):
newresponse
=
self
.
mw
.
process_response
(
request
,
response
,
self
.
spider
)
newresponse
=
self
.
mw
.
process_response
(
request
,
response
,
self
.
spider
)
self
.
assertIs
(
newresponse
,
response
)
self
.
assertIs
(
newresponse
,
response
)
self
.
assertEqual
(
response
.
body
,
b
''
)
self
.
assertEqual
(
response
.
body
,
b
''
)
self
.
assertStatsEqual
(
'httpcompression/response_count'
,
None
)
self
.
assertStatsEqual
(
'httpcompression/response_bytes'
,
None
)
class
HttpCompressionSubclassTest
(
TestCase
):
def
test_init_missing_stats
(
self
):
class
HttpCompressionMiddlewareSubclass
(
HttpCompressionMiddleware
):
def
__init__
(
self
):
super
().
__init__
()
crawler
=
get_crawler
(
Spider
)
with
catch_warnings
(
record
=
True
)
as
caught_warnings
:
HttpCompressionMiddlewareSubclass
.
from_crawler
(
crawler
)
messages
=
tuple
(
str
(
warning
.
message
)
for
warning
in
caught_warnings
if
warning
.
category
is
ScrapyDeprecationWarning
)
self
.
assertEqual
(
messages
,
(
(
"HttpCompressionMiddleware subclasses must either modify "
"their '__init__' method to support a 'stats' parameter "
"or reimplement the 'from_crawler' method."
),
)
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录