提交 dfe6d3d5 编写于 作者: P Paul Tremberth 提交者: GitHub

Merge pull request #2456 from elacuesta/feed_export_beautify

[MRG+1] Feed exports: beautify JSON and XML
......@@ -140,7 +140,7 @@ output examples, which assume you're exporting these two items::
BaseItemExporter
----------------
.. class:: BaseItemExporter(fields_to_export=None, export_empty_fields=False, encoding='utf-8')
.. class:: BaseItemExporter(fields_to_export=None, export_empty_fields=False, encoding='utf-8', indent=0)
This is the (abstract) base class for all Item Exporters. It provides
support for common features used by all (concrete) Item Exporters, such as
......@@ -149,7 +149,7 @@ BaseItemExporter
These features can be configured through the constructor arguments which
populate their respective instance attributes: :attr:`fields_to_export`,
:attr:`export_empty_fields`, :attr:`encoding`.
:attr:`export_empty_fields`, :attr:`encoding`, :attr:`indent`.
.. method:: export_item(item)
......@@ -216,6 +216,15 @@ BaseItemExporter
encoding). Other value types are passed unchanged to the specific
serialization library.
.. attribute:: indent
Number of spaces used to indent the output on each level. Defaults to ``0``.
* ``indent=None`` selects the most compact representation:
all items on the same line, with no indentation
* ``indent<=0`` puts each item on its own line, with no indentation
* ``indent>0`` puts each item on its own line, indented by the given number of spaces
.. highlight:: none
XmlItemExporter
......
......@@ -209,6 +209,7 @@ These are the settings used for configuring the feed exports:
* :setting:`FEED_STORE_EMPTY`
* :setting:`FEED_EXPORT_ENCODING`
* :setting:`FEED_EXPORT_FIELDS`
* :setting:`FEED_EXPORT_INDENT`
.. currentmodule:: scrapy.extensions.feedexport
......@@ -266,6 +267,22 @@ If an exporter requires a fixed set of fields (this is the case for
is empty or None, then Scrapy tries to infer field names from the
exported data - currently it uses field names from the first item.
.. setting:: FEED_EXPORT_INDENT
FEED_EXPORT_INDENT
------------------
Default: ``0``
Number of spaces used to indent the output on each level. If ``FEED_EXPORT_INDENT``
is a positive integer, then array elements and object members will be pretty-printed
with that indent level. An indent level of ``0`` (the default), or a negative value,
will put each item on a new line without indentation. ``None`` selects the most compact representation.
Currently implemented only by :class:`~scrapy.exporters.JsonItemExporter`
and :class:`~scrapy.exporters.XmlItemExporter`, i.e. when you are exporting
to ``.json`` or ``.xml``.
.. setting:: FEED_STORE_EMPTY
FEED_STORE_EMPTY
......
......@@ -36,6 +36,7 @@ class BaseItemExporter(object):
self.encoding = options.pop('encoding', None)
self.fields_to_export = options.pop('fields_to_export', None)
self.export_empty_fields = options.pop('export_empty_fields', False)
self.indent = options.pop('indent', None)
if not dont_fail and options:
raise TypeError("Unexpected options: %s" % ', '.join(options.keys()))
......@@ -98,21 +99,33 @@ class JsonItemExporter(BaseItemExporter):
# Set up the JSON exporter: apply the exporter options, remember the output
# file and build the encoder used to serialize each item.
def __init__(self, file, **kwargs):
self._configure(kwargs, dont_fail=True)
self.file = file
# There is a small difference between the behaviour of JsonItemExporter.indent
# and ScrapyJSONEncoder.indent: ScrapyJSONEncoder.indent=None is needed to prevent
# the addition of newlines everywhere.
# Only a positive indent is forwarded to the encoder; None, zero and negative
# values all become None here.
json_indent = self.indent if self.indent is not None and self.indent > 0 else None
kwargs.setdefault('indent', json_indent)
# Escape non-ASCII characters only when no explicit encoding was configured.
kwargs.setdefault('ensure_ascii', not self.encoding)
self.encoder = ScrapyJSONEncoder(**kwargs)
# Tracks whether a separator must be written before the next item.
self.first_item = True
# Write a newline to the output file unless indent is None (compact mode).
def _beautify_newline(self):
if self.indent is not None:
self.file.write(b'\n')
# Write the opening bracket of the JSON array.
# NOTE(review): this span looks like a rendered diff with the +/- markers
# stripped; the write of b"[\n" is presumably the removed line and the two
# lines after it its replacement. Confirm against the repository.
def start_exporting(self):
self.file.write(b"[\n")
self.file.write(b"[")
self._beautify_newline()
# Write the closing bracket of the JSON array.
# NOTE(review): this span looks like a rendered diff with the +/- markers
# stripped; the write of b"\n]" is presumably the removed line and the two
# lines after it its replacement. Confirm against the repository.
def finish_exporting(self):
self.file.write(b"\n]")
self._beautify_newline()
self.file.write(b"]")
# Serialize one item and append it to the JSON array, writing a separator
# before every item except the first.
# NOTE(review): this span looks like a rendered diff with the +/- markers
# stripped; the write of b',\n' is presumably the removed line and the two
# lines after it its replacement. Confirm against the repository.
def export_item(self, item):
if self.first_item:
self.first_item = False
else:
self.file.write(b',\n')
self.file.write(b',')
self._beautify_newline()
itemdict = dict(self._get_serialized_fields(item))
data = self.encoder.encode(itemdict)
# The encoder output is converted to bytes using the configured encoding.
self.file.write(to_bytes(data, self.encoding))
......@@ -128,33 +141,52 @@ class XmlItemExporter(BaseItemExporter):
self.encoding = 'utf-8'
self.xg = XMLGenerator(file, encoding=self.encoding)
# Emit a newline into the XML stream.
# Nothing is written when indent is None. With indent <= 0 a newline is
# written only when new_item is true; with indent > 0 it is always written.
def _beautify_newline(self, new_item=False):
if self.indent is not None and (self.indent > 0 or new_item):
self._xg_characters('\n')
# Emit indent * depth spaces into the XML stream.
# Skipped entirely when indent is None or 0. A negative indent produces an
# empty string, because multiplying a str by a negative number yields ''.
def _beautify_indent(self, depth=1):
if self.indent:
self._xg_characters(' ' * self.indent * depth)
# Start the XML document and open the root element.
def start_exporting(self):
self.xg.startDocument()
self.xg.startElement(self.root_element, {})
# new_item=True requests the newline even when indent is 0 or negative.
self._beautify_newline(new_item=True)
# Write one item as an item element whose children are the serialized fields.
# NOTE(review): this span looks like a rendered diff with the +/- markers
# stripped; the _export_xml_field call without depth is presumably the removed
# line and the call with depth=2 its replacement. Confirm against the repository.
def export_item(self, item):
self._beautify_indent(depth=1)
self.xg.startElement(self.item_element, {})
self._beautify_newline()
for name, value in self._get_serialized_fields(item, default_value=''):
self._export_xml_field(name, value)
self._export_xml_field(name, value, depth=2)
self._beautify_indent(depth=1)
self.xg.endElement(self.item_element)
# new_item=True requests the newline even when indent is 0 or negative.
self._beautify_newline(new_item=True)
# Close the root element and end the XML document.
def finish_exporting(self):
self.xg.endElement(self.root_element)
self.xg.endDocument()
# Write one field as an XML element, recursing into mapping-like values
# (anything with an items attribute) and list-like values; list entries are
# written as elements named 'value'. Other values are written as character data.
# NOTE(review): this span looks like a rendered diff with the +/- markers
# stripped; the def line and recursive calls without depth are presumably the
# removed lines and the ones taking depth their replacements. Confirm against
# the repository.
def _export_xml_field(self, name, serialized_value):
def _export_xml_field(self, name, serialized_value, depth):
self._beautify_indent(depth=depth)
self.xg.startElement(name, {})
if hasattr(serialized_value, 'items'):
self._beautify_newline()
for subname, value in serialized_value.items():
self._export_xml_field(subname, value)
self._export_xml_field(subname, value, depth=depth+1)
# Re-indent so the closing tag lines up with the opening tag.
self._beautify_indent(depth=depth)
elif is_listlike(serialized_value):
self._beautify_newline()
for value in serialized_value:
self._export_xml_field('value', value)
self._export_xml_field('value', value, depth=depth+1)
# Re-indent so the closing tag lines up with the opening tag.
self._beautify_indent(depth=depth)
elif isinstance(serialized_value, six.text_type):
self._xg_characters(serialized_value)
else:
self._xg_characters(str(serialized_value))
self.xg.endElement(name)
self._beautify_newline()
# Workaround for http://bugs.python.org/issue17606
# Before Python 2.7.4 xml.sax.saxutils required bytes;
......
......@@ -172,6 +172,9 @@ class FeedExporter(object):
self.store_empty = settings.getbool('FEED_STORE_EMPTY')
self._exporting = False
self.export_fields = settings.getlist('FEED_EXPORT_FIELDS') or None
self.indent = None
if settings.get('FEED_EXPORT_INDENT') is not None:
self.indent = settings.getint('FEED_EXPORT_INDENT')
uripar = settings['FEED_URI_PARAMS']
self._uripar = load_object(uripar) if uripar else lambda x, y: None
......@@ -188,7 +191,7 @@ class FeedExporter(object):
storage = self._get_storage(uri)
file = storage.open(spider)
exporter = self._get_exporter(file, fields_to_export=self.export_fields,
encoding=self.export_encoding)
encoding=self.export_encoding, indent=self.indent)
if self.store_empty:
exporter.start_exporting()
self._exporting = True
......
......@@ -161,6 +161,7 @@ FEED_EXPORTERS_BASE = {
'marshal': 'scrapy.exporters.MarshalItemExporter',
'pickle': 'scrapy.exporters.PickleItemExporter',
}
FEED_EXPORT_INDENT = 0
FILES_STORE_S3_ACL = 'private'
......
......@@ -319,14 +319,14 @@ class FeedExportTest(unittest.TestCase):
# Check what each feed format writes when no items are exported and
# FEED_STORE_EMPTY is enabled.
# NOTE(review): this span looks like a rendered diff with the +/- markers
# stripped; the first 'json' tuple and the first settings assignment are
# presumably the removed lines, each followed by its replacement. Confirm
# against the repository.
@defer.inlineCallbacks
def test_export_no_items_store_empty(self):
formats = (
('json', b'[\n\n]'),
('json', b'[]'),
('jsonlines', b''),
('xml', b'<?xml version="1.0" encoding="utf-8"?>\n<items></items>'),
('csv', b''),
)
for fmt, expctd in formats:
settings = {'FEED_FORMAT': fmt, 'FEED_STORE_EMPTY': True}
settings = {'FEED_FORMAT': fmt, 'FEED_STORE_EMPTY': True, 'FEED_EXPORT_INDENT': None}
data = yield self.exported_no_data(settings)
self.assertEqual(data, expctd)
......@@ -425,25 +425,177 @@ class FeedExportTest(unittest.TestCase):
header = ['foo']
formats = {
'json': u'[\n{"foo": "Test\\u00d6"}\n]'.encode('utf-8'),
'json': u'[{"foo": "Test\\u00d6"}]'.encode('utf-8'),
'jsonlines': u'{"foo": "Test\\u00d6"}\n'.encode('utf-8'),
'xml': u'<?xml version="1.0" encoding="utf-8"?>\n<items><item><foo>Test\xd6</foo></item></items>'.encode('utf-8'),
'csv': u'foo\r\nTest\xd6\r\n'.encode('utf-8'),
}
for format in formats:
settings = {'FEED_FORMAT': format}
for format, expected in formats.items():
settings = {'FEED_FORMAT': format, 'FEED_EXPORT_INDENT': None}
data = yield self.exported_data(items, settings)
self.assertEqual(formats[format], data)
self.assertEqual(expected, data)
formats = {
'json': u'[\n{"foo": "Test\xd6"}\n]'.encode('latin-1'),
'json': u'[{"foo": "Test\xd6"}]'.encode('latin-1'),
'jsonlines': u'{"foo": "Test\xd6"}\n'.encode('latin-1'),
'xml': u'<?xml version="1.0" encoding="latin-1"?>\n<items><item><foo>Test\xd6</foo></item></items>'.encode('latin-1'),
'csv': u'foo\r\nTest\xd6\r\n'.encode('latin-1'),
}
for format in formats:
settings = {'FEED_FORMAT': format, 'FEED_EXPORT_ENCODING': 'latin-1'}
settings = {'FEED_EXPORT_INDENT': None, 'FEED_EXPORT_ENCODING': 'latin-1'}
for format, expected in formats.items():
settings['FEED_FORMAT'] = format
data = yield self.exported_data(items, settings)
self.assertEqual(formats[format], data)
self.assertEqual(expected, data)
# Export the same two items as JSON and XML with FEED_EXPORT_INDENT set to
# None, -1, 0, 2, 4 and 5, and compare the output with the expected bytes.
# NOTE(review): the expected literals for indent 2, 4 and 5 are identical here
# and carry no leading spaces, which suggests leading whitespace was lost when
# this text was extracted. Confirm the literals against the repository.
@defer.inlineCallbacks
def test_export_indentation(self):
items = [
{'foo': ['bar']},
{'key': 'value'},
]
test_cases = [
# JSON
{
'format': 'json',
'indent': None,
'expected': b'[{"foo": ["bar"]},{"key": "value"}]',
},
{
'format': 'json',
'indent': -1,
'expected': b"""[
{"foo": ["bar"]},
{"key": "value"}
]""",
},
{
'format': 'json',
'indent': 0,
'expected': b"""[
{"foo": ["bar"]},
{"key": "value"}
]""",
},
{
'format': 'json',
'indent': 2,
'expected': b"""[
{
"foo": [
"bar"
]
},
{
"key": "value"
}
]""",
},
{
'format': 'json',
'indent': 4,
'expected': b"""[
{
"foo": [
"bar"
]
},
{
"key": "value"
}
]""",
},
{
'format': 'json',
'indent': 5,
'expected': b"""[
{
"foo": [
"bar"
]
},
{
"key": "value"
}
]""",
},
# XML
{
'format': 'xml',
'indent': None,
'expected': b"""<?xml version="1.0" encoding="utf-8"?>
<items><item><foo><value>bar</value></foo></item><item><key>value</key></item></items>""",
},
{
'format': 'xml',
'indent': -1,
'expected': b"""<?xml version="1.0" encoding="utf-8"?>
<items>
<item><foo><value>bar</value></foo></item>
<item><key>value</key></item>
</items>""",
},
{
'format': 'xml',
'indent': 0,
'expected': b"""<?xml version="1.0" encoding="utf-8"?>
<items>
<item><foo><value>bar</value></foo></item>
<item><key>value</key></item>
</items>""",
},
{
'format': 'xml',
'indent': 2,
'expected': b"""<?xml version="1.0" encoding="utf-8"?>
<items>
<item>
<foo>
<value>bar</value>
</foo>
</item>
<item>
<key>value</key>
</item>
</items>""",
},
{
'format': 'xml',
'indent': 4,
'expected': b"""<?xml version="1.0" encoding="utf-8"?>
<items>
<item>
<foo>
<value>bar</value>
</foo>
</item>
<item>
<key>value</key>
</item>
</items>""",
},
{
'format': 'xml',
'indent': 5,
'expected': b"""<?xml version="1.0" encoding="utf-8"?>
<items>
<item>
<foo>
<value>bar</value>
</foo>
</item>
<item>
<key>value</key>
</item>
</items>""",
},
]
for row in test_cases:
settings = {'FEED_FORMAT': row['format'], 'FEED_EXPORT_INDENT': row['indent']}
data = yield self.exported_data(items, settings)
# NOTE(review): leftover debug output; passing a msg argument to assertEqual
# would report the failing format/indent without printing on every run.
print(row['format'], row['indent'])
self.assertEqual(row['expected'], data)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册