Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
looyolo
scrapy
提交
752787e6
S
scrapy
项目概览
looyolo
/
scrapy
与 Fork 源项目一致
从无法访问的项目Fork
通知
2
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
S
scrapy
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
752787e6
编写于
3月 06, 2014
作者:
D
Daniel Graña
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add a LevelDB cache backend
上级
1c9effd7
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
71 additions
and
0 deletions
+71
-0
scrapy/contrib/httpcache.py
scrapy/contrib/httpcache.py
+63
-0
scrapy/tests/test_downloadermiddleware_httpcache.py
scrapy/tests/test_downloadermiddleware_httpcache.py
+7
-0
tox.ini
tox.ini
+1
-0
未找到文件。
scrapy/contrib/httpcache.py
浏览文件 @
752787e6
...
...
@@ -285,6 +285,69 @@ class FilesystemCacheStorage(object):
return
pickle
.
load
(
f
)
class LeveldbCacheStorage(object):
    """HTTP cache storage backed by one LevelDB database per spider.

    Each cached response is stored under two keys derived from the request
    fingerprint: ``<key>_data`` (the pickled response record) and
    ``<key>_time`` (the store timestamp, used for expiration checks).
    """

    def __init__(self, settings):
        # Imported lazily so the leveldb binding is only required when this
        # backend is actually selected.
        import leveldb
        self._leveldb = leveldb
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
        # Opened per spider in open_spider().
        self.db = None

    def open_spider(self, spider):
        """Open (creating if needed) the spider's LevelDB database."""
        location = os.path.join(self.cachedir, '%s.leveldb' % spider.name)
        self.db = self._leveldb.LevelDB(location)

    def close_spider(self, spider):
        # Drop the handle so the LevelDB file lock is released.
        del self.db

    def retrieve_response(self, spider, request):
        """Return the cached response for *request*, or None on a miss."""
        record = self._read_data(spider, request)
        if record is None:
            return  # not cached
        headers = Headers(record['headers'])
        cls = responsetypes.from_args(headers=headers, url=record['url'])
        return cls(url=record['url'], headers=headers,
                   status=record['status'], body=record['body'])

    def store_response(self, spider, request, response):
        """Persist *response* under the request's fingerprint key."""
        key = self._request_key(request)
        payload = pickle.dumps({
            'status': response.status,
            'url': response.url,
            'headers': dict(response.headers),
            'body': response.body,
        }, protocol=2)
        # Write data and timestamp atomically in a single batch.
        batch = self._leveldb.WriteBatch()
        batch.Put('%s_data' % key, payload)
        batch.Put('%s_time' % key, str(time()))
        self.db.Write(batch)

    def _read_data(self, spider, request):
        """Load the stored record for *request*; None if missing or stale."""
        key = self._request_key(request)
        try:
            ts = self.db.Get('%s_time' % key)
        except KeyError:
            return  # not found or invalid entry
        # expiration_secs == 0 means "never expire" (chained comparison
        # short-circuits before touching the timestamp).
        if 0 < self.expiration_secs < time() - float(ts):
            return  # expired
        try:
            raw = self.db.Get('%s_data' % key)
        except KeyError:
            return  # invalid entry
        return pickle.loads(raw)

    def _request_key(self, request):
        return request_fingerprint(request)
def
parse_cachecontrol
(
header
):
"""Parse Cache-Control header
...
...
scrapy/tests/test_downloadermiddleware_httpcache.py
浏览文件 @
752787e6
...
...
@@ -5,6 +5,7 @@ import shutil
import
unittest
import
email.utils
from
contextlib
import
contextmanager
import
pytest
from
scrapy.http
import
Response
,
HtmlResponse
,
Request
from
scrapy.spider
import
Spider
...
...
@@ -136,6 +137,12 @@ class FilesystemStorageTest(DefaultStorageTest):
storage_class
=
'scrapy.contrib.httpcache.FilesystemCacheStorage'
class LeveldbStorageTest(DefaultStorageTest):
    """Exercise the shared storage test suite against the LevelDB backend."""

    # Skip the whole class when the optional leveldb binding is not installed.
    pytest.importorskip('leveldb')

    storage_class = 'scrapy.contrib.httpcache.LeveldbCacheStorage'
class
DummyPolicyTest
(
_BaseTest
):
policy_class
=
'scrapy.contrib.httpcache.DummyPolicy'
...
...
tox.ini
浏览文件 @
752787e6
...
...
@@ -15,6 +15,7 @@ deps =
boto
Pillow
django
leveldb
-rtests-requirements.txt
commands
=
py.test
--twisted
{posargs:scrapy}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录