未验证 提交 12f2006b 编写于 作者: M Mikhail Korobov 提交者: GitHub

Merge pull request #4799 from GeorgeA92/patch-2

httpcompression stats added
import io import io
import warnings
import zlib import zlib
from scrapy.utils.gz import gunzip from scrapy.exceptions import NotConfigured
from scrapy.http import Response, TextResponse from scrapy.http import Response, TextResponse
from scrapy.responsetypes import responsetypes from scrapy.responsetypes import responsetypes
from scrapy.exceptions import NotConfigured from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.gz import gunzip
ACCEPTED_ENCODINGS = [b'gzip', b'deflate'] ACCEPTED_ENCODINGS = [b'gzip', b'deflate']
...@@ -25,11 +27,25 @@ except ImportError: ...@@ -25,11 +27,25 @@ except ImportError:
class HttpCompressionMiddleware: class HttpCompressionMiddleware:
"""This middleware allows compressed (gzip, deflate) traffic to be """This middleware allows compressed (gzip, deflate) traffic to be
sent/received from web sites""" sent/received from web sites"""
# Constructor: keeps the optional ``stats`` argument (``None`` when omitted)
# on the instance as ``self.stats``, unchanged.
def __init__(self, stats=None):
self.stats = stats
@classmethod
def from_crawler(cls, crawler):
    """Build the middleware from *crawler*, handing it ``crawler.stats``.

    Raises ``NotConfigured`` when the ``COMPRESSION_ENABLED`` setting
    evaluates to false.

    The class is first called as ``cls(stats=crawler.stats)``. If that
    call raises ``TypeError``, a ``ScrapyDeprecationWarning`` is issued,
    the class is instantiated without arguments and ``crawler.stats`` is
    assigned to the ``stats`` attribute of the new instance.
    """
    if not crawler.settings.getbool('COMPRESSION_ENABLED'):
        raise NotConfigured
    try:
        return cls(stats=crawler.stats)
    except TypeError:
        # NOTE(review): any TypeError raised while constructing the
        # instance ends up here, not only an unexpected 'stats' keyword.
        warnings.warn(
            "HttpCompressionMiddleware subclasses must either modify "
            "their '__init__' method to support a 'stats' parameter or "
            "reimplement the 'from_crawler' method.",
            ScrapyDeprecationWarning,
        )
        result = cls()
        result.stats = crawler.stats
        return result
def process_request(self, request, spider): def process_request(self, request, spider):
request.headers.setdefault('Accept-Encoding', request.headers.setdefault('Accept-Encoding',
...@@ -44,6 +60,9 @@ class HttpCompressionMiddleware: ...@@ -44,6 +60,9 @@ class HttpCompressionMiddleware:
if content_encoding: if content_encoding:
encoding = content_encoding.pop() encoding = content_encoding.pop()
decoded_body = self._decode(response.body, encoding.lower()) decoded_body = self._decode(response.body, encoding.lower())
if self.stats:
self.stats.inc_value('httpcompression/response_bytes', len(decoded_body), spider=spider)
self.stats.inc_value('httpcompression/response_count', spider=spider)
respcls = responsetypes.from_args( respcls = responsetypes.from_args(
headers=response.headers, url=response.url, body=decoded_body headers=response.headers, url=response.url, body=decoded_body
) )
......
from gzip import GzipFile
from io import BytesIO from io import BytesIO
from unittest import TestCase, SkipTest
from os.path import join from os.path import join
from gzip import GzipFile from unittest import TestCase, SkipTest
from warnings import catch_warnings
from scrapy.spiders import Spider from scrapy.spiders import Spider
from scrapy.http import Response, Request, HtmlResponse from scrapy.http import Response, Request, HtmlResponse
from scrapy.downloadermiddlewares.httpcompression import HttpCompressionMiddleware, ACCEPTED_ENCODINGS from scrapy.downloadermiddlewares.httpcompression import HttpCompressionMiddleware, ACCEPTED_ENCODINGS
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
from scrapy.responsetypes import responsetypes from scrapy.responsetypes import responsetypes
from scrapy.utils.gz import gunzip from scrapy.utils.gz import gunzip
from scrapy.utils.test import get_crawler
from tests import tests_datadir from tests import tests_datadir
from w3lib.encoding import resolve_encoding from w3lib.encoding import resolve_encoding
...@@ -32,8 +35,10 @@ FORMAT = { ...@@ -32,8 +35,10 @@ FORMAT = {
class HttpCompressionTest(TestCase): class HttpCompressionTest(TestCase):
def setUp(self):
    """Create the crawler, spider and middleware used by each test.

    The middleware is built through ``from_crawler`` and the crawler's
    stats are opened for the spider before the test body runs.
    """
    self.crawler = get_crawler(Spider)
    self.spider = self.crawler._create_spider('scrapytest.org')
    self.mw = HttpCompressionMiddleware.from_crawler(self.crawler)
    self.crawler.stats.open_spider(self.spider)
def _getresponse(self, coding): def _getresponse(self, coding):
if coding not in FORMAT: if coding not in FORMAT:
...@@ -56,6 +61,34 @@ class HttpCompressionTest(TestCase): ...@@ -56,6 +61,34 @@ class HttpCompressionTest(TestCase):
response.request = Request('http://scrapytest.org', headers={'Accept-Encoding': 'gzip, deflate'}) response.request = Request('http://scrapytest.org', headers={'Accept-Encoding': 'gzip, deflate'})
return response return response
def assertStatsEqual(self, key, value):
    """Assert that the stat stored under *key* for the test spider is *value*.

    The string form of ``get_stats(self.spider)`` is passed as the
    failure message.
    """
    stats = self.crawler.stats
    actual = stats.get_value(key, spider=self.spider)
    failure_message = str(stats.get_stats(self.spider))
    self.assertEqual(actual, value, failure_message)
def test_setting_false_compression_enabled(self):
    """``from_crawler`` raises ``NotConfigured`` when COMPRESSION_ENABLED is False."""
    # Build the crawler outside the assertion so only from_crawler is checked.
    crawler = get_crawler(settings_dict={'COMPRESSION_ENABLED': False})
    with self.assertRaises(NotConfigured):
        HttpCompressionMiddleware.from_crawler(crawler)
def test_setting_default_compression_enabled(self):
    """``from_crawler`` returns a middleware for a crawler built with no settings override."""
    middleware = HttpCompressionMiddleware.from_crawler(get_crawler())
    self.assertIsInstance(middleware, HttpCompressionMiddleware)
def test_setting_true_compression_enabled(self):
    """``from_crawler`` returns a middleware when COMPRESSION_ENABLED is True."""
    crawler = get_crawler(settings_dict={'COMPRESSION_ENABLED': True})
    middleware = HttpCompressionMiddleware.from_crawler(crawler)
    self.assertIsInstance(middleware, HttpCompressionMiddleware)
def test_process_request(self): def test_process_request(self):
request = Request('http://scrapytest.org') request = Request('http://scrapytest.org')
assert 'Accept-Encoding' not in request.headers assert 'Accept-Encoding' not in request.headers
...@@ -72,6 +105,20 @@ class HttpCompressionTest(TestCase): ...@@ -72,6 +105,20 @@ class HttpCompressionTest(TestCase):
assert newresponse is not response assert newresponse is not response
assert newresponse.body.startswith(b'<!DOCTYPE') assert newresponse.body.startswith(b'<!DOCTYPE')
assert 'Content-Encoding' not in newresponse.headers assert 'Content-Encoding' not in newresponse.headers
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74837)
def test_process_response_gzip_no_stats(self):
    """A middleware created without stats still decodes a gzip response."""
    # Instantiate directly (not via from_crawler) so no stats are attached.
    mw = HttpCompressionMiddleware()
    response = self._getresponse('gzip')
    request = response.request
    self.assertEqual(response.headers['Content-Encoding'], b'gzip')
    newresponse = mw.process_response(request, response, self.spider)
    self.assertIsNone(mw.stats)
    # unittest assertions instead of bare ``assert``: they are not removed
    # under ``python -O`` and report the offending values on failure.
    self.assertIsNot(newresponse, response)
    self.assertTrue(newresponse.body.startswith(b'<!DOCTYPE'))
    self.assertNotIn('Content-Encoding', newresponse.headers)
def test_process_response_br(self): def test_process_response_br(self):
try: try:
...@@ -85,6 +132,8 @@ class HttpCompressionTest(TestCase): ...@@ -85,6 +132,8 @@ class HttpCompressionTest(TestCase):
assert newresponse is not response assert newresponse is not response
assert newresponse.body.startswith(b"<!DOCTYPE") assert newresponse.body.startswith(b"<!DOCTYPE")
assert 'Content-Encoding' not in newresponse.headers assert 'Content-Encoding' not in newresponse.headers
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74837)
def test_process_response_zstd(self): def test_process_response_zstd(self):
try: try:
...@@ -116,6 +165,8 @@ class HttpCompressionTest(TestCase): ...@@ -116,6 +165,8 @@ class HttpCompressionTest(TestCase):
assert newresponse is not response assert newresponse is not response
assert newresponse.body.startswith(b'<!DOCTYPE') assert newresponse.body.startswith(b'<!DOCTYPE')
assert 'Content-Encoding' not in newresponse.headers assert 'Content-Encoding' not in newresponse.headers
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74840)
def test_process_response_zlibdelate(self): def test_process_response_zlibdelate(self):
response = self._getresponse('zlibdeflate') response = self._getresponse('zlibdeflate')
...@@ -126,6 +177,8 @@ class HttpCompressionTest(TestCase): ...@@ -126,6 +177,8 @@ class HttpCompressionTest(TestCase):
assert newresponse is not response assert newresponse is not response
assert newresponse.body.startswith(b'<!DOCTYPE') assert newresponse.body.startswith(b'<!DOCTYPE')
assert 'Content-Encoding' not in newresponse.headers assert 'Content-Encoding' not in newresponse.headers
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74840)
def test_process_response_plain(self): def test_process_response_plain(self):
response = Response('http://scrapytest.org', body=b'<!DOCTYPE...') response = Response('http://scrapytest.org', body=b'<!DOCTYPE...')
...@@ -135,6 +188,8 @@ class HttpCompressionTest(TestCase): ...@@ -135,6 +188,8 @@ class HttpCompressionTest(TestCase):
newresponse = self.mw.process_response(request, response, self.spider) newresponse = self.mw.process_response(request, response, self.spider)
assert newresponse is response assert newresponse is response
assert newresponse.body.startswith(b'<!DOCTYPE') assert newresponse.body.startswith(b'<!DOCTYPE')
self.assertStatsEqual('httpcompression/response_count', None)
self.assertStatsEqual('httpcompression/response_bytes', None)
def test_multipleencodings(self): def test_multipleencodings(self):
response = self._getresponse('gzip') response = self._getresponse('gzip')
...@@ -162,6 +217,8 @@ class HttpCompressionTest(TestCase): ...@@ -162,6 +217,8 @@ class HttpCompressionTest(TestCase):
assert isinstance(newresponse, HtmlResponse) assert isinstance(newresponse, HtmlResponse)
self.assertEqual(newresponse.body, plainbody) self.assertEqual(newresponse.body, plainbody)
self.assertEqual(newresponse.encoding, resolve_encoding('gb2312')) self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 104)
def test_process_response_force_recalculate_encoding(self): def test_process_response_force_recalculate_encoding(self):
headers = { headers = {
...@@ -181,6 +238,8 @@ class HttpCompressionTest(TestCase): ...@@ -181,6 +238,8 @@ class HttpCompressionTest(TestCase):
assert isinstance(newresponse, HtmlResponse) assert isinstance(newresponse, HtmlResponse)
self.assertEqual(newresponse.body, plainbody) self.assertEqual(newresponse.body, plainbody)
self.assertEqual(newresponse.encoding, resolve_encoding('gb2312')) self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 104)
def test_process_response_no_content_type_header(self): def test_process_response_no_content_type_header(self):
headers = { headers = {
...@@ -196,6 +255,8 @@ class HttpCompressionTest(TestCase): ...@@ -196,6 +255,8 @@ class HttpCompressionTest(TestCase):
assert isinstance(newresponse, respcls) assert isinstance(newresponse, respcls)
self.assertEqual(newresponse.body, plainbody) self.assertEqual(newresponse.body, plainbody)
self.assertEqual(newresponse.encoding, resolve_encoding('gb2312')) self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 104)
def test_process_response_gzipped_contenttype(self): def test_process_response_gzipped_contenttype(self):
response = self._getresponse('gzip') response = self._getresponse('gzip')
...@@ -206,6 +267,8 @@ class HttpCompressionTest(TestCase): ...@@ -206,6 +267,8 @@ class HttpCompressionTest(TestCase):
self.assertIsNot(newresponse, response) self.assertIsNot(newresponse, response)
self.assertTrue(newresponse.body.startswith(b'<!DOCTYPE')) self.assertTrue(newresponse.body.startswith(b'<!DOCTYPE'))
self.assertNotIn('Content-Encoding', newresponse.headers) self.assertNotIn('Content-Encoding', newresponse.headers)
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74837)
def test_process_response_gzip_app_octetstream_contenttype(self): def test_process_response_gzip_app_octetstream_contenttype(self):
response = self._getresponse('gzip') response = self._getresponse('gzip')
...@@ -216,6 +279,8 @@ class HttpCompressionTest(TestCase): ...@@ -216,6 +279,8 @@ class HttpCompressionTest(TestCase):
self.assertIsNot(newresponse, response) self.assertIsNot(newresponse, response)
self.assertTrue(newresponse.body.startswith(b'<!DOCTYPE')) self.assertTrue(newresponse.body.startswith(b'<!DOCTYPE'))
self.assertNotIn('Content-Encoding', newresponse.headers) self.assertNotIn('Content-Encoding', newresponse.headers)
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74837)
def test_process_response_gzip_binary_octetstream_contenttype(self): def test_process_response_gzip_binary_octetstream_contenttype(self):
response = self._getresponse('x-gzip') response = self._getresponse('x-gzip')
...@@ -226,6 +291,8 @@ class HttpCompressionTest(TestCase): ...@@ -226,6 +291,8 @@ class HttpCompressionTest(TestCase):
self.assertIsNot(newresponse, response) self.assertIsNot(newresponse, response)
self.assertTrue(newresponse.body.startswith(b'<!DOCTYPE')) self.assertTrue(newresponse.body.startswith(b'<!DOCTYPE'))
self.assertNotIn('Content-Encoding', newresponse.headers) self.assertNotIn('Content-Encoding', newresponse.headers)
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74837)
def test_process_response_gzipped_gzip_file(self): def test_process_response_gzipped_gzip_file(self):
"""Test that a gzip Content-Encoded .gz file is gunzipped """Test that a gzip Content-Encoded .gz file is gunzipped
...@@ -268,6 +335,8 @@ class HttpCompressionTest(TestCase): ...@@ -268,6 +335,8 @@ class HttpCompressionTest(TestCase):
newresponse = self.mw.process_response(request, response, self.spider) newresponse = self.mw.process_response(request, response, self.spider)
self.assertEqual(gunzip(newresponse.body), plainbody) self.assertEqual(gunzip(newresponse.body), plainbody)
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 230)
def test_process_response_head_request_no_decode_required(self): def test_process_response_head_request_no_decode_required(self):
response = self._getresponse('gzip') response = self._getresponse('gzip')
...@@ -278,3 +347,32 @@ class HttpCompressionTest(TestCase): ...@@ -278,3 +347,32 @@ class HttpCompressionTest(TestCase):
newresponse = self.mw.process_response(request, response, self.spider) newresponse = self.mw.process_response(request, response, self.spider)
self.assertIs(newresponse, response) self.assertIs(newresponse, response)
self.assertEqual(response.body, b'') self.assertEqual(response.body, b'')
self.assertStatsEqual('httpcompression/response_count', None)
self.assertStatsEqual('httpcompression/response_bytes', None)
class HttpCompressionSubclassTest(TestCase):
    """Checks ``from_crawler`` on a subclass whose ``__init__`` takes no ``stats``."""

    def test_init_missing_stats(self):
        """Exactly one ScrapyDeprecationWarning with the expected text is recorded."""

        class HttpCompressionMiddlewareSubclass(HttpCompressionMiddleware):

            def __init__(self):
                super().__init__()

        crawler = get_crawler(Spider)
        with catch_warnings(record=True) as caught_warnings:
            HttpCompressionMiddlewareSubclass.from_crawler(crawler)

        # Keep only warnings whose category is exactly ScrapyDeprecationWarning.
        deprecation_messages = [
            str(entry.message)
            for entry in caught_warnings
            if entry.category is ScrapyDeprecationWarning
        ]
        expected_message = (
            "HttpCompressionMiddleware subclasses must either modify "
            "their '__init__' method to support a 'stats' parameter "
            "or reimplement the 'from_crawler' method."
        )
        self.assertEqual(tuple(deprecation_messages), (expected_message,))
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册