未验证 提交 12f2006b 编写于 作者: M Mikhail Korobov 提交者: GitHub

Merge pull request #4799 from GeorgeA92/patch-2

httpcompression stats added
import io
import warnings
import zlib
from scrapy.utils.gz import gunzip
from scrapy.exceptions import NotConfigured
from scrapy.http import Response, TextResponse
from scrapy.responsetypes import responsetypes
from scrapy.exceptions import NotConfigured
from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.gz import gunzip
ACCEPTED_ENCODINGS = [b'gzip', b'deflate']
......@@ -25,11 +27,25 @@ except ImportError:
class HttpCompressionMiddleware:
"""This middleware allows compressed (gzip, deflate) traffic to be
sent/received from web sites"""
def __init__(self, stats=None):
self.stats = stats
@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('COMPRESSION_ENABLED'):
raise NotConfigured
return cls()
try:
return cls(stats=crawler.stats)
except TypeError:
warnings.warn(
"HttpCompressionMiddleware subclasses must either modify "
"their '__init__' method to support a 'stats' parameter or "
"reimplement the 'from_crawler' method.",
ScrapyDeprecationWarning,
)
result = cls()
result.stats = crawler.stats
return result
def process_request(self, request, spider):
request.headers.setdefault('Accept-Encoding',
......@@ -44,6 +60,9 @@ class HttpCompressionMiddleware:
if content_encoding:
encoding = content_encoding.pop()
decoded_body = self._decode(response.body, encoding.lower())
if self.stats:
self.stats.inc_value('httpcompression/response_bytes', len(decoded_body), spider=spider)
self.stats.inc_value('httpcompression/response_count', spider=spider)
respcls = responsetypes.from_args(
headers=response.headers, url=response.url, body=decoded_body
)
......
from gzip import GzipFile
from io import BytesIO
from unittest import TestCase, SkipTest
from os.path import join
from gzip import GzipFile
from unittest import TestCase, SkipTest
from warnings import catch_warnings
from scrapy.spiders import Spider
from scrapy.http import Response, Request, HtmlResponse
from scrapy.downloadermiddlewares.httpcompression import HttpCompressionMiddleware, ACCEPTED_ENCODINGS
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
from scrapy.responsetypes import responsetypes
from scrapy.utils.gz import gunzip
from scrapy.utils.test import get_crawler
from tests import tests_datadir
from w3lib.encoding import resolve_encoding
......@@ -32,8 +35,10 @@ FORMAT = {
class HttpCompressionTest(TestCase):
def setUp(self):
self.spider = Spider('foo')
self.mw = HttpCompressionMiddleware()
self.crawler = get_crawler(Spider)
self.spider = self.crawler._create_spider('scrapytest.org')
self.mw = HttpCompressionMiddleware.from_crawler(self.crawler)
self.crawler.stats.open_spider(self.spider)
def _getresponse(self, coding):
if coding not in FORMAT:
......@@ -56,6 +61,34 @@ class HttpCompressionTest(TestCase):
response.request = Request('http://scrapytest.org', headers={'Accept-Encoding': 'gzip, deflate'})
return response
def assertStatsEqual(self, key, value):
self.assertEqual(
self.crawler.stats.get_value(key, spider=self.spider),
value,
str(self.crawler.stats.get_stats(self.spider))
)
def test_setting_false_compression_enabled(self):
self.assertRaises(
NotConfigured,
HttpCompressionMiddleware.from_crawler,
get_crawler(settings_dict={'COMPRESSION_ENABLED': False})
)
def test_setting_default_compression_enabled(self):
self.assertIsInstance(
HttpCompressionMiddleware.from_crawler(get_crawler()),
HttpCompressionMiddleware
)
def test_setting_true_compression_enabled(self):
self.assertIsInstance(
HttpCompressionMiddleware.from_crawler(
get_crawler(settings_dict={'COMPRESSION_ENABLED': True})
),
HttpCompressionMiddleware
)
def test_process_request(self):
request = Request('http://scrapytest.org')
assert 'Accept-Encoding' not in request.headers
......@@ -72,6 +105,20 @@ class HttpCompressionTest(TestCase):
assert newresponse is not response
assert newresponse.body.startswith(b'<!DOCTYPE')
assert 'Content-Encoding' not in newresponse.headers
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74837)
def test_process_response_gzip_no_stats(self):
mw = HttpCompressionMiddleware()
response = self._getresponse('gzip')
request = response.request
self.assertEqual(response.headers['Content-Encoding'], b'gzip')
newresponse = mw.process_response(request, response, self.spider)
self.assertEqual(mw.stats, None)
assert newresponse is not response
assert newresponse.body.startswith(b'<!DOCTYPE')
assert 'Content-Encoding' not in newresponse.headers
def test_process_response_br(self):
try:
......@@ -85,6 +132,8 @@ class HttpCompressionTest(TestCase):
assert newresponse is not response
assert newresponse.body.startswith(b"<!DOCTYPE")
assert 'Content-Encoding' not in newresponse.headers
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74837)
def test_process_response_zstd(self):
try:
......@@ -116,6 +165,8 @@ class HttpCompressionTest(TestCase):
assert newresponse is not response
assert newresponse.body.startswith(b'<!DOCTYPE')
assert 'Content-Encoding' not in newresponse.headers
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74840)
def test_process_response_zlibdelate(self):
response = self._getresponse('zlibdeflate')
......@@ -126,6 +177,8 @@ class HttpCompressionTest(TestCase):
assert newresponse is not response
assert newresponse.body.startswith(b'<!DOCTYPE')
assert 'Content-Encoding' not in newresponse.headers
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74840)
def test_process_response_plain(self):
response = Response('http://scrapytest.org', body=b'<!DOCTYPE...')
......@@ -135,6 +188,8 @@ class HttpCompressionTest(TestCase):
newresponse = self.mw.process_response(request, response, self.spider)
assert newresponse is response
assert newresponse.body.startswith(b'<!DOCTYPE')
self.assertStatsEqual('httpcompression/response_count', None)
self.assertStatsEqual('httpcompression/response_bytes', None)
def test_multipleencodings(self):
response = self._getresponse('gzip')
......@@ -162,6 +217,8 @@ class HttpCompressionTest(TestCase):
assert isinstance(newresponse, HtmlResponse)
self.assertEqual(newresponse.body, plainbody)
self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 104)
def test_process_response_force_recalculate_encoding(self):
headers = {
......@@ -181,6 +238,8 @@ class HttpCompressionTest(TestCase):
assert isinstance(newresponse, HtmlResponse)
self.assertEqual(newresponse.body, plainbody)
self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 104)
def test_process_response_no_content_type_header(self):
headers = {
......@@ -196,6 +255,8 @@ class HttpCompressionTest(TestCase):
assert isinstance(newresponse, respcls)
self.assertEqual(newresponse.body, plainbody)
self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 104)
def test_process_response_gzipped_contenttype(self):
response = self._getresponse('gzip')
......@@ -206,6 +267,8 @@ class HttpCompressionTest(TestCase):
self.assertIsNot(newresponse, response)
self.assertTrue(newresponse.body.startswith(b'<!DOCTYPE'))
self.assertNotIn('Content-Encoding', newresponse.headers)
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74837)
def test_process_response_gzip_app_octetstream_contenttype(self):
response = self._getresponse('gzip')
......@@ -216,6 +279,8 @@ class HttpCompressionTest(TestCase):
self.assertIsNot(newresponse, response)
self.assertTrue(newresponse.body.startswith(b'<!DOCTYPE'))
self.assertNotIn('Content-Encoding', newresponse.headers)
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74837)
def test_process_response_gzip_binary_octetstream_contenttype(self):
response = self._getresponse('x-gzip')
......@@ -226,6 +291,8 @@ class HttpCompressionTest(TestCase):
self.assertIsNot(newresponse, response)
self.assertTrue(newresponse.body.startswith(b'<!DOCTYPE'))
self.assertNotIn('Content-Encoding', newresponse.headers)
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74837)
def test_process_response_gzipped_gzip_file(self):
"""Test that a gzip Content-Encoded .gz file is gunzipped
......@@ -268,6 +335,8 @@ class HttpCompressionTest(TestCase):
newresponse = self.mw.process_response(request, response, self.spider)
self.assertEqual(gunzip(newresponse.body), plainbody)
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 230)
def test_process_response_head_request_no_decode_required(self):
response = self._getresponse('gzip')
......@@ -278,3 +347,32 @@ class HttpCompressionTest(TestCase):
newresponse = self.mw.process_response(request, response, self.spider)
self.assertIs(newresponse, response)
self.assertEqual(response.body, b'')
self.assertStatsEqual('httpcompression/response_count', None)
self.assertStatsEqual('httpcompression/response_bytes', None)
class HttpCompressionSubclassTest(TestCase):
def test_init_missing_stats(self):
class HttpCompressionMiddlewareSubclass(HttpCompressionMiddleware):
def __init__(self):
super().__init__()
crawler = get_crawler(Spider)
with catch_warnings(record=True) as caught_warnings:
HttpCompressionMiddlewareSubclass.from_crawler(crawler)
messages = tuple(
str(warning.message) for warning in caught_warnings
if warning.category is ScrapyDeprecationWarning
)
self.assertEqual(
messages,
(
(
"HttpCompressionMiddleware subclasses must either modify "
"their '__init__' method to support a 'stats' parameter "
"or reimplement the 'from_crawler' method."
),
)
)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册