added File Export Pipeline, a wrapper to use Item Exporters as Item Pipelines

0b152c99 · Pablo Hoffman · 3c6deaef · 0b152c99 · 0b152c99
显示空白变更内容
内联并排

Showing with 178 addition and 4 deletion

docs/topics/item-pipeline.rst docs/topics/item-pipeline.rst +123 -4

scrapy/contrib/pipeline/fileexport.py scrapy/contrib/pipeline/fileexport.py +55 -0

未找到文件。
--- a/docs/topics/item-pipeline.rst
+++ b/docs/topics/item-pipeline.rst
@@ -4,15 +4,18 @@
 Item Pipeline
 =============

+.. module:: scrapy.contrib.pipeline
+   :synopsis: Item Pipeline manager and built-in pipelines
+
 After an item has been scraped by a spider it is sent to the Item Pipeline
 which process it through several components that are executed sequentially.

 Item pipeline are usually implemented on each project. Typical usage for item
 pipelines are:

- * HTML cleansing
- * validation
- * persistence (storing the scraped item)
+* HTML cleansing
+* validation
+* persistence (storing the scraped item)


 Writing your own item pipeline
@@ -88,3 +91,119 @@ spider returns multiples items with the same id::
            else:
                self.duplicates[domain].add(item.id)
                return item
+
+Built-in Item Pipelines reference
+=================================
+
+Here is a list of item pipelines bundled with Scrapy.
+
+File Export Pipeline
+--------------------
+
+.. module:: scrapy.contrib.pipeline.fileexport
+
+.. class:: FileExportPipeline
+
+This pipeline exports all scraped items into a file, using different formats.
+
+It is a simple but convenient wrapper to use :doc:`Item Exporters
+</experimental/exporters>` as :ref:`Item Pipelines <topics-item-pipeline>`. If
+you need more custom/advanced functionality you can write your own pipeline or
+subclass the :doc:`Item Exporters </experimental/exporters>`.
+
+It supports the following settings:
+
+* :setting:`EXPORT_FORMAT` (mandatory)
+* :setting:`EXPORT_FILE` (mandatory)
+* :setting:`EXPORT_FIELDS`
+* :setting:`EXPORT_EMPTY`
+* :setting:`EXPORT_ENCODING`
+
+If any mandatory setting is not set, this pipeline will be automatically
+disabled.
+
+File Export Pipeline examples
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Here are some usage examples of the File Export Pipeline.
+
+To export all scraped items into an XML file::
+
+    EXPORT_FORMAT = 'xml'
+    EXPORT_FILE = 'scraped_items.xml'
+
+To export all scraped items into a CSV file (without a headers line)::
+
+    EXPORT_FORMAT = 'csv'
+    EXPORT_FILE = 'scraped_items.csv'
+
+To export all scraped items into a CSV file (with a headers line)::
+
+    EXPORT_FORMAT = 'csv_headers'
+    EXPORT_FILE = 'scraped_items_with_headers.csv'
+    EXPORT_FIELDS = ['name', 'price', 'description']
+
+File Export Pipeline settings
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. currentmodule:: scrapy.contrib.exporter
+
+.. setting:: EXPORT_FORMAT
+
+EXPORT_FORMAT
+^^^^^^^^^^^^^
+
+The format to use for exporting. Here is a list of all available formats. Click
+on the respective Item Exporter to get more info.
+
+* ``xml``: uses a :class:`XmlItemExporter`
+
+* ``csv``: uses a :class:`CsvItemExporter`
+
+* ``csv_headers``: uses a :class:`CsvItemExporter` with the column headers on
+  the first line. This format requires you to specify the fields to export
+  using the :setting:`EXPORT_FIELDS` setting.
+
+* ``jsonlines``: uses a :class:`jsonlines.JsonLinesItemExporter`
+
+* ``pickle``: uses a :class:`PickleItemExporter`
+
+* ``pprint``: uses a :class:`PprintItemExporter`
+
+This setting is mandatory in order to use the File Export Pipeline.
+
+.. setting:: EXPORT_FILE
+
+EXPORT_FILE
+^^^^^^^^^^^
+
+The name of the file where the items will be exported. This setting is
+mandatory in order to use the File Export Pipeline.
+
+.. setting:: EXPORT_FIELDS
+
+EXPORT_FIELDS
+^^^^^^^^^^^^^
+
+Default: ``None``
+
+The names of the item fields that will be exported. This will be used for the
+:attr:`~BaseItemExporter.fields_to_export` Item Exporter attribute. If
+``None``, all fields will be exported.
+
+.. setting:: EXPORT_EMPTY
+
+EXPORT_EMPTY
+^^^^^^^^^^^^
+
+Whether to export empty (non populated) fields. This will be used for the
+:attr:`~BaseItemExporter.export_empty_fields` Item Exporter attribute.
+
+.. setting:: EXPORT_ENCODING
+
+EXPORT_ENCODING
+^^^^^^^^^^^^^^^
+
+The encoding to use for exporting. This will be used for the
+:attr:`~BaseItemExporter.encoding` Item Exporter attribute.
+
--- a/scrapy/contrib/pipeline/fileexport.py
+++ b/scrapy/contrib/pipeline/fileexport.py
+"""
+File Export Pipeline
+
+See documentation in docs/topics/item-pipeline.rst
+"""
+
+from scrapy.xlib.pydispatch import dispatcher
+from scrapy.core import signals
+from scrapy.core.exceptions import NotConfigured
+from scrapy.contrib import exporter
+from scrapy.conf import settings
+
+class FileExportPipeline(object):
+
+    def __init__(self):
+        self.exporter, self.file = self.get_exporter_and_file()
+        self.exporter.start_exporting()
+        dispatcher.connect(self.engine_stopped, signals.engine_stopped)
+
+    def process_item(self, domain, item):
+        self.exporter.export_item(item)
+        return item
+
+    def engine_stopped(self):
+        self.exporter.finish_exporting()
+        self.file.close()
+
+    def get_exporter_and_file(self):
+        format = settings['EXPORT_FORMAT']
+        filename = settings['EXPORT_FILE']
+        if not format or not filename:
+            raise NotConfigured
+        exp_kwargs = {
+            'fields_to_export': settings.getlist('EXPORT_FIELDS') or None,
+            'export_empty_fields': settings.getbool('EXPORT_EMPTY', False),
+            'encoding': settings.get('EXPORT_ENCODING', 'utf-8'),
+        }
+        file = open(filename, 'wb')
+        if format == 'xml':
+            exp = exporter.XmlItemExporter(file, **exp_kwargs)
+        elif format == 'csv':
+            exp = exporter.CsvItemExporter(file, **exp_kwargs)
+        elif format == 'csv_headers':
+            exp = exporter.CsvItemExporter(file, include_headers_line=True, \
+                **exp_kwargs)
+        elif format == 'pprint':
+            exp = exporter.PprintItemExporter(file, **exp_kwargs)
+        elif format == 'pickle':
+            exp = exporter.PickleItemExporter(file, **exp_kwargs)
+        elif format == 'json':
+            from scrapy.contrib.exporter import jsonlines
+            exp = jsonlines.JsonLinesItemExporter(file, **exp_kwargs)
+        else:
+            raise NotConfigured("Unsupported export format: %s" % format)
+        return exp, file