From 0b152c99b5b8b02d48509347de902d4dc5e406a2 Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Mon, 31 Aug 2009 20:40:41 -0300 Subject: [PATCH] added File Export Pipeline, a wrapper to use Item Exporters as Item Pipelines --- docs/topics/item-pipeline.rst | 127 +++++++++++++++++++++++++- scrapy/contrib/pipeline/fileexport.py | 55 +++++++++++ 2 files changed, 178 insertions(+), 4 deletions(-) create mode 100644 scrapy/contrib/pipeline/fileexport.py diff --git a/docs/topics/item-pipeline.rst b/docs/topics/item-pipeline.rst index 0a57bff8d..cd0ce085f 100644 --- a/docs/topics/item-pipeline.rst +++ b/docs/topics/item-pipeline.rst @@ -4,15 +4,18 @@ Item Pipeline ============= +.. module:: scrapy.contrib.pipeline + :synopsis: Item Pipeline manager and built-in pipelines + After an item has been scraped by a spider it is sent to the Item Pipeline which process it through several components that are executed sequentially. Item pipeline are usually implemented on each project. Typical usage for item -pipelines are: +pipelines are: - * HTML cleansing - * validation - * persistence (storing the scraped item) +* HTML cleansing +* validation +* persistence (storing the scraped item) Writing your own item pipeline @@ -88,3 +91,119 @@ spider returns multiples items with the same id:: else: self.duplicates[domain].add(item.id) return item + +Built-in Item Pipelines reference +================================= + +Here is a list of item pipelines bundled with Scrapy. + +File Export Pipeline +-------------------- + +.. module:: scrapy.contrib.pipeline.fileexport + +.. class:: FileExportPipeline + +This pipeline exports all scraped items into a file, using different formats. + +It is a simple but convenient wrapper to use :doc:`Item Exporters +` as :ref:`Item Pipelines `. If +you need more custom/advanced functionality you can write your own pipeline or +subclass the :doc:`Item Exporters ` . 
+ +It supports the following settings: + +* :setting:`EXPORT_FORMAT` (mandatory) +* :setting:`EXPORT_FILE` (mandatory) +* :setting:`EXPORT_FIELDS` +* :setting:`EXPORT_EMPTY` +* :setting:`EXPORT_ENCODING` + +If any mandatory setting is not set, this pipeline will be automatically +disabled. + +File Export Pipeline examples +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Here are some usage examples of the File Export Pipeline. + +To export all scraped items into an XML file:: + + EXPORT_FORMAT = 'xml' + EXPORT_FILE = 'scraped_items.xml' + +To export all scraped items into a CSV file (without a headers line):: + + EXPORT_FORMAT = 'csv' + EXPORT_FILE = 'scraped_items.csv' + +To export all scraped items into a CSV file (with a headers line):: + + EXPORT_FORMAT = 'csv_headers' + EXPORT_FILE = 'scraped_items_with_headers.csv' + EXPORT_FIELDS = ['name', 'price', 'description'] + +File Export Pipeline settings +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. currentmodule:: scrapy.contrib.exporter + +.. setting:: EXPORT_FORMAT + +EXPORT_FORMAT +^^^^^^^^^^^^^ + +The format to use for exporting. Here is a list of all available formats. Click +on the respective Item Exporter to get more info. + +* ``xml``: uses a :class:`XmlItemExporter` + +* ``csv``: uses a :class:`CsvItemExporter` + +* ``csv_headers``: uses a :class:`CsvItemExporter` with the column headers on + the first line. This format requires you to specify the fields to export + using the :setting:`EXPORT_FIELDS` setting. + +* ``json``: uses a :class:`jsonlines.JsonLinesItemExporter` + +* ``pickle``: uses a :class:`PickleItemExporter` + +* ``pprint``: uses a :class:`PprintItemExporter` + +This setting is mandatory in order to use the File Export Pipeline. + +.. setting:: EXPORT_FILE + +EXPORT_FILE +^^^^^^^^^^^ + +The name of the file where the items will be exported. This setting is +mandatory in order to use the File Export Pipeline. + +.. 
setting:: EXPORT_FIELDS + +EXPORT_FIELDS +^^^^^^^^^^^^^ + +Default: ``None`` + +The names of the item fields that will be exported. This will be used for the +:attr:`~BaseItemExporter.fields_to_export` Item Exporter attribute. If +``None``, all fields will be exported. + +.. setting:: EXPORT_EMPTY + +EXPORT_EMPTY +^^^^^^^^^^^^ + +Whether to export empty (non populated) fields. This will be used for the +:attr:`~BaseItemExporter.export_empty_fields` Item Exporter attribute. + +.. setting:: EXPORT_ENCODING + +EXPORT_ENCODING +^^^^^^^^^^^^^^^ + +The encoding to use for exporting. This will be used for the +:attr:`~BaseItemExporter.encoding` Item Exporter attribute. + diff --git a/scrapy/contrib/pipeline/fileexport.py b/scrapy/contrib/pipeline/fileexport.py new file mode 100644 index 000000000..461d6e492 --- /dev/null +++ b/scrapy/contrib/pipeline/fileexport.py @@ -0,0 +1,55 @@ +""" +File Export Pipeline + +See documentation in docs/topics/item-pipeline.rst +""" + +from scrapy.xlib.pydispatch import dispatcher +from scrapy.core import signals +from scrapy.core.exceptions import NotConfigured +from scrapy.contrib import exporter +from scrapy.conf import settings + +class FileExportPipeline(object): + + def __init__(self): + self.exporter, self.file = self.get_exporter_and_file() + self.exporter.start_exporting() + dispatcher.connect(self.engine_stopped, signals.engine_stopped) + + def process_item(self, domain, item): + self.exporter.export_item(item) + return item + + def engine_stopped(self): + self.exporter.finish_exporting() + self.file.close() + + def get_exporter_and_file(self): + format = settings['EXPORT_FORMAT'] + filename = settings['EXPORT_FILE'] + if not format or not filename: + raise NotConfigured + exp_kwargs = { + 'fields_to_export': settings.getlist('EXPORT_FIELDS') or None, + 'export_empty_fields': settings.getbool('EXPORT_EMPTY', False), + 'encoding': settings.get('EXPORT_ENCODING', 'utf-8'), + } + file = open(filename, 'wb') + if format == 'xml': + exp 
= exporter.XmlItemExporter(file, **exp_kwargs) + elif format == 'csv': + exp = exporter.CsvItemExporter(file, **exp_kwargs) + elif format == 'csv_headers': + exp = exporter.CsvItemExporter(file, include_headers_line=True, \ + **exp_kwargs) + elif format == 'pprint': + exp = exporter.PprintItemExporter(file, **exp_kwargs) + elif format == 'pickle': + exp = exporter.PickleItemExporter(file, **exp_kwargs) + elif format == 'json': + from scrapy.contrib.exporter import jsonlines + exp = jsonlines.JsonLinesItemExporter(file, **exp_kwargs) + else: + raise NotConfigured("Unsupported export format: %s" % format) + return exp, file -- GitLab