提交 b76c5c59 编写于 作者: P Pablo Hoffman

* Added support for project data storage (closes #276)

* Documented project file structure
* Moved default location of SQLite database to project data storage dir (closes #277)
上级 dfa6745e
......@@ -13,6 +13,44 @@ just call "commands", or "Scrapy commands".
The Scrapy tool provides several commands, for multiple purposes, and each one
accepts a different set of arguments and options.
.. _topics-project-structure:
Default structure of Scrapy projects
====================================
Before delving into the command-line tool and its sub-commands, let's first
understand the directory structure of a Scrapy project.
Even though it can be modified, all Scrapy projects have the same file
structure by default, similar to this::
scrapy.cfg
myproject/
__init__.py
items.py
pipelines.py
settings.py
spiders/
__init__.py
spider1.py
spider2.py
...
.scrapy/
scrapy.db
The directory where the ``scrapy.cfg`` file resides is known as the *project
root directory*. That file contains the name of the python module that defines
the project settings. Here is an example::
[settings]
default = myproject.settings
By default, Scrapy projects use a SQLite_ database to store persistent runtime
data of the project, such as the spider queue (the list of spiders that are
scheduled to run). This SQLite database is stored in the *project data
directory* which, unless configured otherwise, is the ``.scrapy`` directory
inside the project root directory mentioned above.
Using the ``scrapy`` tool
=========================
......@@ -444,3 +482,5 @@ commands for your Scrapy project.
Example::
COMMANDS_MODULE = 'mybot.commands'
.. _SQLite: http://en.wikipedia.org/wiki/SQLite
......@@ -878,6 +878,18 @@ Example::
SPIDER_MODULES = ['mybot.spiders_prod', 'mybot.spiders_dev']
.. setting:: SQLITE_DB
SQLITE_DB
---------
Default: ``'scrapy.db'``
The location of the project SQLite database, used for storing the spider queue
and other persistent data of the project. If a relative path is given, it is
taken relative to the project data dir. For more info see:
:ref:`topics-project-structure`.
.. setting:: STATS_CLASS
STATS_CLASS
......
import sqlite3
from zope.interface import Interface, implements
from scrapy.xlib.pydispatch import dispatcher
from scrapy.exceptions import NotConfigured
from scrapy.utils.misc import load_object
from scrapy.utils.sqlite import JsonSqliteDict
from scrapy import log, signals
from scrapy.utils.project import sqlite_db
from scrapy import signals
class ISpiderContextStorage(Interface):
......@@ -24,16 +23,11 @@ class SqliteSpiderContextStorage(object):
sqlite_dict_class = JsonSqliteDict
def __init__(self, database=None, table='contexts'):
try:
self.d = self.sqlite_dict_class(database, table)
except sqlite3.Error, e:
self.d = self.sqlite_dict_class(':memory:', table)
log.msg("Cannot open SQLite %r - using in-memory context storage " \
"instead. Error was: %r" % (database, str(e)), log.WARNING)
self.d = self.sqlite_dict_class(database, table)
@classmethod
def from_settings(cls, settings):
return cls(settings['SQLITE_DB'])
return cls(sqlite_db(settings['SQLITE_DB']))
def get(self, spider):
if spider.name in self.d:
......
import sqlite3
from zope.interface import implements
from scrapy import log
from scrapy.interfaces import ISpiderQueue
from scrapy.utils.sqlite import JsonSqlitePriorityQueue
from scrapy.utils.project import sqlite_db
class SqliteSpiderQueue(object):
......@@ -12,16 +10,11 @@ class SqliteSpiderQueue(object):
implements(ISpiderQueue)
def __init__(self, database=None, table='spider_queue'):
try:
self.q = JsonSqlitePriorityQueue(database, table)
except sqlite3.Error, e:
self.q = JsonSqlitePriorityQueue(':memory:', table)
log.msg("Cannot open SQLite %r - using in-memory spider queue " \
"instead. Error was: %r" % (database, str(e)), log.WARNING)
self.q = JsonSqlitePriorityQueue(database, table)
@classmethod
def from_settings(cls, settings):
return cls(settings['SQLITE_DB'])
return cls(sqlite_db(settings['SQLITE_DB']))
def add(self, name, **spider_args):
d = spider_args.copy()
......
from os.path import join, dirname, abspath, isabs, exists
from os import makedirs
import warnings
from scrapy.utils.conf import closest_scrapy_cfg, get_config
from scrapy.utils.python import is_writable
DATADIR_CFG_SECTION = 'datadir'
def inside_project():
    """Tell whether a project config file can be located.

    Returns True when ``closest_scrapy_cfg()`` yields a truthy value and
    False otherwise.
    """
    cfg_path = closest_scrapy_cfg()
    if cfg_path:
        return True
    return False
def project_data_dir(project='default'):
    """Return the data dir of the current project, creating it when missing.

    The directory is the ``.scrapy`` dir located next to the file returned by
    ``closest_scrapy_cfg()``, unless the ``datadir`` section of the config
    defines an option named after ``project``, in which case that value is
    used as is.

    Fails with an assertion error when called outside a project.
    """
    assert inside_project(), "Not inside project"
    fallback_dir = abspath(join(dirname(closest_scrapy_cfg()), '.scrapy'))
    config = get_config()
    overridden = config.has_option(DATADIR_CFG_SECTION, project)
    data_dir = config.get(DATADIR_CFG_SECTION, project) if overridden \
        else fallback_dir
    # Create the directory lazily, on first use.
    if not exists(data_dir):
        makedirs(data_dir)
    return data_dir
def expand_data_path(path):
    """Resolve ``path`` against the project data dir.

    Absolute paths are handed back untouched; relative ones are joined onto
    the directory returned by ``project_data_dir()``.
    """
    if not isabs(path):
        return join(project_data_dir(), path)
    return path
def sqlite_db(path, nonwritable_fallback=True):
    """Pick the SQLite database location to use for ``path``.

    Outside of a project, or when ``path`` is already ``:memory:``, the
    in-memory database ``:memory:`` is returned. Otherwise ``path`` is
    expanded with ``expand_data_path()`` (relative paths end up inside the
    project data dir, absolute paths are kept).

    When ``nonwritable_fallback`` is True and the expanded path is not
    writable, a warning is issued and ``:memory:`` is returned instead.
    """
    if not inside_project() or path == ':memory:':
        return ':memory:'

    db_path = expand_data_path(path)
    # Only paths on disk are probed for writability.
    if is_writable(db_path) or not nonwritable_fallback:
        return db_path

    warnings.warn("%r is not writable - using in-memory SQLite instead" % db_path)
    return ':memory:'
......@@ -5,6 +5,7 @@ It also contains functions (or functionality) which is in Python versions
higher than 2.5 which is the lowest version supported by Scrapy.
"""
import os
import re
import inspect
import weakref
......@@ -205,3 +206,12 @@ def stringify_dict(dct_or_tuples, encoding='utf-8', keys_only=True):
v = v.encode(encoding) if isinstance(v, unicode) else v
d[k] = v
return d
def is_writable(path):
    """Return True if the given path can be written (if it exists) or created
    (if it doesn't exist).

    For a path that does not exist yet, the write permission of its parent
    directory is checked instead. A bare file name (one without a directory
    component, e.g. ``'scrapy.db'``) is checked against the current
    directory.
    """
    if os.path.exists(path):
        return os.access(path, os.W_OK)
    # dirname() returns '' for bare file names and os.access('') is always
    # False, which would wrongly report a creatable file as not writable.
    parent = os.path.dirname(path) or os.curdir
    return os.access(parent, os.W_OK)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册