* Added support for project data storage (closes #276)

* Documented project file structure * Moved default location of SQLite database to project data storage dir (closes #277)

* Added support for project data storage (closes #276)
* Documented project file structure * Moved default location of SQLite database to project data storage dir (closes #277)
b76c5c59 · Pablo Hoffman · dfa6745e · b76c5c59 · b76c5c59 · b76c5c59
6 changed file
--- a/docs/topics/commands.rst
+++ b/docs/topics/commands.rst
@@ -13,6 +13,44 @@ just call "commands", or "Scrapy commands".
 The Scrapy tool provides several commands, for multiple purposes, and each one
 accepts a different set of arguments and options.

+.. _topics-project-structure:
+
+Default structure of Scrapy projects
+====================================
+
+Before delving into the command-line tool and its sub-commands, let's first
+understand the directory structure of a Scrapy project.
+
+Even though it can be modified, all Scrapy projects have the same file
+structure by default, similar to this::
+
+   scrapy.cfg
+   myproject/
+       __init__.py
+       items.py
+       pipelines.py
+       settings.py
+       spiders/
+           __init__.py
+           spider1.py
+           spider2.py
+           ...
+   .scrapy/
+       scrapy.db
+
+The directory where the ``scrapy.cfg`` file resides is known as the *project
+root directory*. That file contains the name of the python module that defines
+the project settings. Here is an example::
+
+    [settings]
+    default = myproject.settings
+
+By default, Scrapy projects use a SQLite_ database to store persistent runtime
+data of the project, such as the spider queue (the list of spiders that are
+scheduled to run). This SQLite database is stored in the *project data
+directory* which, unless configured otherwise, is the ``.scrapy`` directory
+inside the project root directory mentioned above.
+
 Using the ``scrapy`` tool
 =========================

@@ -444,3 +482,5 @@ commands for your Scrapy project.
 Example::

    COMMANDS_MODULE = 'mybot.commands'
+
+.. _SQLite: http://en.wikipedia.org/wiki/SQLite
--- a/docs/topics/settings.rst
+++ b/docs/topics/settings.rst
@@ -878,6 +878,18 @@ Example::

    SPIDER_MODULES = ['mybot.spiders_prod', 'mybot.spiders_dev']

+.. setting:: SQLITE_DB
+
+SQLITE_DB
+---------
+
+Default: ``'scrapy.db'``
+
+The location of the project SQLite database, used for storing the spider queue
+and other persistent data of the project. If a relative path is given, it is
+taken relative to the project data dir. For more info see:
+:ref:`topics-project-structure`.
+
 .. setting:: STATS_CLASS

 STATS_CLASS

--- a/scrapy/contrib/spidercontext.py
+++ b/scrapy/contrib/spidercontext.py
-import sqlite3
-
 from zope.interface import Interface, implements

 from scrapy.xlib.pydispatch import dispatcher
 from scrapy.exceptions import NotConfigured
 from scrapy.utils.misc import load_object
 from scrapy.utils.sqlite import JsonSqliteDict
-from scrapy import log, signals
+from scrapy.utils.project import sqlite_db
+from scrapy import signals

 class ISpiderContextStorage(Interface):

@@ -24,16 +23,11 @@ class SqliteSpiderContextStorage(object):
    sqlite_dict_class = JsonSqliteDict

    def __init__(self, database=None, table='contexts'):
-        try:
-            self.d = self.sqlite_dict_class(database, table)
-        except sqlite3.Error, e:
-            self.d = self.sqlite_dict_class(':memory:', table)
-            log.msg("Cannot open SQLite %r - using in-memory context storage " \
-                "instead. Error was: %r" % (database, str(e)), log.WARNING)
+        self.d = self.sqlite_dict_class(database, table)

    @classmethod
    def from_settings(cls, settings):
-        return cls(settings['SQLITE_DB'])
+        return cls(sqlite_db(settings['SQLITE_DB']))

    def get(self, spider):
        if spider.name in self.d:

--- a/scrapy/spiderqueue.py
+++ b/scrapy/spiderqueue.py
-import sqlite3
-
 from zope.interface import implements

-from scrapy import log
 from scrapy.interfaces import ISpiderQueue
 from scrapy.utils.sqlite import JsonSqlitePriorityQueue
+from scrapy.utils.project import sqlite_db


 class SqliteSpiderQueue(object):
@@ -12,16 +10,11 @@ class SqliteSpiderQueue(object):
    implements(ISpiderQueue)

    def __init__(self, database=None, table='spider_queue'):
-        try:
-            self.q = JsonSqlitePriorityQueue(database, table)
-        except sqlite3.Error, e:
-            self.q = JsonSqlitePriorityQueue(':memory:', table)
-            log.msg("Cannot open SQLite %r - using in-memory spider queue " \
-                "instead. Error was: %r" % (database, str(e)), log.WARNING)
+        self.q = JsonSqlitePriorityQueue(database, table)

    @classmethod
    def from_settings(cls, settings):
-        return cls(settings['SQLITE_DB'])
+        return cls(sqlite_db(settings['SQLITE_DB']))

    def add(self, name, **spider_args):
        d = spider_args.copy()

--- a/scrapy/utils/project.py
+++ b/scrapy/utils/project.py
+from os.path import join, dirname, abspath, isabs, exists
+from os import makedirs
+import warnings
+
+from scrapy.utils.conf import closest_scrapy_cfg, get_config
+from scrapy.utils.python import is_writable
+
+DATADIR_CFG_SECTION = 'datadir'
+
+def inside_project():
+    """Return True if closest_scrapy_cfg() returns a non-empty result, which
+    is how this module decides that it is running inside a Scrapy project
+    """
+    scrapy_cfg = closest_scrapy_cfg()
+    return bool(scrapy_cfg)
+
+def project_data_dir(project='default'):
+    """Return the current project data dir, creating it if it doesn't exist.
+
+    The data dir is the ``.scrapy`` directory located next to the closest
+    ``scrapy.cfg`` file, unless the ``[datadir]`` section of the config
+    returned by get_config() has an option named after the given project, in
+    which case the value of that option is used as is.
+
+    An AssertionError is raised when called outside a project.
+    """
+    assert inside_project(), "Not inside project"
+    cfg = get_config()
+    if cfg.has_option(DATADIR_CFG_SECTION, project):
+        d = cfg.get(DATADIR_CFG_SECTION, project)
+    else:
+        scrapy_cfg = closest_scrapy_cfg()
+        d = abspath(join(dirname(scrapy_cfg), '.scrapy'))
+    if not exists(d):
+        try:
+            makedirs(d)
+        except OSError:
+            # another process may have created the dir between the exists()
+            # check and makedirs(); only propagate the error if the dir is
+            # still missing
+            if not exists(d):
+                raise
+    return d
+
+def expand_data_path(path):
+    """Resolve the given path against the project data dir.
+
+    Absolute paths are returned unmodified, while relative paths are joined
+    to the dir returned by project_data_dir().
+    """
+    return path if isabs(path) else join(project_data_dir(), path)
+
+def sqlite_db(path, nonwritable_fallback=True):
+    """Return the location of the SQLite database to use.
+
+    When not inside a project, or when path is already ':memory:', the
+    in-memory database (':memory:') is returned. Otherwise the path is
+    expanded with expand_data_path(), so relative paths end up inside the
+    project data dir and absolute paths are left unmodified.
+
+    If the expanded path is not writable and nonwritable_fallback is True, a
+    warning is issued and ':memory:' is returned instead.
+    """
+    in_memory = ':memory:'
+    if not inside_project() or path == in_memory:
+        return in_memory
+    db = expand_data_path(path)
+    if is_writable(db) or not nonwritable_fallback:
+        return db
+    warnings.warn("%r is not writable - using in-memory SQLite instead" % db)
+    return in_memory
--- a/scrapy/utils/python.py
+++ b/scrapy/utils/python.py
@@ -5,6 +5,7 @@ It also contains functions (or functionality) which is in Python versions
 higher than 2.5 which is the lowest version supported by Scrapy.

 """
+import os
 import re
 import inspect
 import weakref
@@ -205,3 +206,12 @@ def stringify_dict(dct_or_tuples, encoding='utf-8', keys_only=True):
            v = v.encode(encoding) if isinstance(v, unicode) else v
        d[k] = v
    return d
+
+def is_writable(path):
+    """Return True if the given path can be written (if it exists) or created
+    (if it doesn't exist). A bare file name is checked against the current dir.
+    """
+    if os.path.exists(path):
+        return os.access(path, os.W_OK)
+    # dirname() is '' for bare file names and os.access('', ...) is always False
+    return os.access(os.path.dirname(path) or os.curdir, os.W_OK)