Merge pull request #4200 from DarkWanderer/feature/row-binary-with-headers

Added RowBinaryWithNamesAndTypes format

Merge pull request #4200 from DarkWanderer/feature/row-binary-with-headers
Added RowBinaryWithNamesAndTypes format
f58e0c7c · alexey-milovidov · GitHub · ae533808 · 6e921838 · f58e0c7c
7 changed files
--- a/dbms/src/Formats/BinaryRowInputStream.cpp
+++ b/dbms/src/Formats/BinaryRowInputStream.cpp
-#include <Core/Block.h>
 #include <IO/ReadBuffer.h>
+#include <IO/ReadHelpers.h>
 #include <Formats/BinaryRowInputStream.h>
 #include <Formats/FormatFactory.h>
 #include <Formats/BlockInputStreamFromRowInputStream.h>
@@ -8,8 +8,8 @@
 namespace DB
 {

-BinaryRowInputStream::BinaryRowInputStream(ReadBuffer & istr_, const Block & header_)
-    : istr(istr_), header(header_)
+BinaryRowInputStream::BinaryRowInputStream(ReadBuffer & istr_, const Block & header_, bool with_names_, bool with_types_)  // Binds the input buffer, copies the header block and stores both flags.
+    : istr(istr_), header(header_), with_names(with_names_), with_types(with_types_)  // The flags tell readPrefix() which header sections to consume.
 {
 }

@@ -27,6 +27,34 @@ bool BinaryRowInputStream::read(MutableColumns & columns, RowReadExtension &)
 }


+void BinaryRowInputStream::readPrefix()  // Consumes the optional header (column count, names, types) that precedes the rows.
+{
+    UInt64 columns = 0;  // Column count announced by the stream; stays 0 when no header section is expected.
+    String tmp;  // Scratch buffer: each header string is read into it and then overwritten.
+
+    if (with_names || with_types)  // The count appears once and is shared by the names list and the types list.
+    {
+        readVarUInt(columns, istr);
+    }
+
+    if (with_names)
+    {
+        for (size_t i = 0; i < columns; ++i)
+        {
+            readStringBinary(tmp, istr);  // Column name: read only to advance the buffer, the value is not used.
+        }
+    }
+
+    if (with_types)
+    {
+        for (size_t i = 0; i < columns; ++i)
+        {
+            readStringBinary(tmp, istr);  // Type name: likewise read and dropped.
+        }
+    }
+}  // NOTE(review): neither the count nor the names/types are compared with `header` here - confirm this is intended.
+
+
 void registerInputFormatRowBinary(FormatFactory & factory)
 {
    factory.registerInputFormat("RowBinary", [](
@@ -37,7 +65,19 @@ void registerInputFormatRowBinary(FormatFactory & factory)
        const FormatSettings & settings)
    {
        return std::make_shared<BlockInputStreamFromRowInputStream>(
-            std::make_shared<BinaryRowInputStream>(buf, sample),
+            std::make_shared<BinaryRowInputStream>(buf, sample, false, false),
+            sample, max_block_size, settings);
+    });
+
+    factory.registerInputFormat("RowBinaryWithNamesAndTypes", [](
+        ReadBuffer & buf,
+        const Block & sample,
+        const Context &,
+        size_t max_block_size,
+        const FormatSettings & settings)
+    {
+        return std::make_shared<BlockInputStreamFromRowInputStream>(
+            std::make_shared<BinaryRowInputStream>(buf, sample, true, true),
            sample, max_block_size, settings);
    });
 }

--- a/dbms/src/Formats/BinaryRowInputStream.h
+++ b/dbms/src/Formats/BinaryRowInputStream.h
@@ -15,13 +15,16 @@ class ReadBuffer;
 class BinaryRowInputStream : public IRowInputStream
 {
 public:
-    BinaryRowInputStream(ReadBuffer & istr_, const Block & header_);
+    BinaryRowInputStream(ReadBuffer & istr_, const Block & header_, bool with_names_, bool with_types_);  // Parameter is named header_ to match the definition and the `header` member.

    bool read(MutableColumns & columns, RowReadExtension &) override;
+    void readPrefix() override;  // Consumes the column count / names / types header selected by the flags below.

 private:
    ReadBuffer & istr;
    Block header;
+    bool with_names;  // When set, readPrefix() expects a list of column names in the stream.
+    bool with_types;  // When set, readPrefix() expects a list of column type names in the stream.
 };

 }
--- a/dbms/src/Formats/BinaryRowOutputStream.cpp
+++ b/dbms/src/Formats/BinaryRowOutputStream.cpp
 #include <IO/WriteBuffer.h>
+#include <IO/WriteHelpers.h>
 #include <Columns/IColumn.h>
 #include <DataTypes/IDataType.h>
 #include <Formats/BinaryRowOutputStream.h>
@@ -9,11 +10,37 @@
 namespace DB
 {

-BinaryRowOutputStream::BinaryRowOutputStream(WriteBuffer & ostr_)
-    : ostr(ostr_)
+BinaryRowOutputStream::BinaryRowOutputStream(WriteBuffer & ostr_, const Block & sample_, bool with_names_, bool with_types_)  // Binds the output buffer and stores the flags plus the sample block.
+    : ostr(ostr_), with_names(with_names_), with_types(with_types_), sample(sample_)  // `sample` is kept so writePrefix() can list column names and types.
 {
 }

+void BinaryRowOutputStream::writePrefix()  // Emits the optional header (column count, names, types) ahead of the rows.
+{
+    size_t columns = sample.columns();  // The header describes the columns of the sample block given at construction.
+
+    if (with_names || with_types)  // The count is written once, whichever of the two lists follows.
+    {
+        writeVarUInt(columns, ostr);
+    }
+
+    if (with_names)
+    {
+        for (size_t i = 0; i < columns; ++i)
+        {
+            writeStringBinary(sample.safeGetByPosition(i).name, ostr);  // All column names are written first...
+        }
+    }
+
+    if (with_types)
+    {
+        for (size_t i = 0; i < columns; ++i)
+        {
+            writeStringBinary(sample.safeGetByPosition(i).type->getName(), ostr);  // ...then all type names, in the same column order.
+        }
+    }
+}
+
 void BinaryRowOutputStream::flush()
 {
    ostr.next();
@@ -33,7 +60,17 @@ void registerOutputFormatRowBinary(FormatFactory & factory)
        const FormatSettings &)
    {
        return std::make_shared<BlockOutputStreamFromRowOutputStream>(
-            std::make_shared<BinaryRowOutputStream>(buf), sample);
+            std::make_shared<BinaryRowOutputStream>(buf, sample, false, false), sample);
+    });
+
+    factory.registerOutputFormat("RowBinaryWithNamesAndTypes", [](
+        WriteBuffer & buf,
+        const Block & sample,
+        const Context &,
+        const FormatSettings &)
+    {
+        return std::make_shared<BlockOutputStreamFromRowOutputStream>(
+            std::make_shared<BinaryRowOutputStream>(buf, sample, true, true), sample);
    });
 }


--- a/dbms/src/Formats/BinaryRowOutputStream.h
+++ b/dbms/src/Formats/BinaryRowOutputStream.h
 #pragma once

 #include <Formats/IRowOutputStream.h>
+#include <Core/Block.h>


 namespace DB
@@ -16,9 +17,10 @@ class WriteBuffer;
 class BinaryRowOutputStream : public IRowOutputStream
 {
 public:
-    BinaryRowOutputStream(WriteBuffer & ostr_);
+    BinaryRowOutputStream(WriteBuffer & ostr_, const Block & sample_, bool with_names_, bool with_types_);

    void writeField(const IColumn & column, const IDataType & type, size_t row_num) override;
+    void writePrefix() override;

    void flush() override;

@@ -26,6 +28,9 @@ public:

 protected:
    WriteBuffer & ostr;
+    bool with_names;
+    bool with_types;
+    const Block sample;
 };

 }

--- a/dbms/tests/queries/0_stateless/00309_formats.reference
+++ b/dbms/tests/queries/0_stateless/00309_formats.reference
--- a/dbms/tests/queries/0_stateless/00309_formats.sql
+++ b/dbms/tests/queries/0_stateless/00309_formats.sql
 SET output_format_write_statistics = 0;
 SELECT number * 246 + 10 AS n, toDate('2000-01-01') + n AS d, range(n) AS arr, arrayStringConcat(arrayMap(x -> reinterpretAsString(x), arr)) AS s, (n, d) AS tuple FROM system.numbers LIMIT 2 FORMAT RowBinary;
+SELECT number * 246 + 10 AS n, toDate('2000-01-01') + n AS d, range(n) AS arr, arrayStringConcat(arrayMap(x -> reinterpretAsString(x), arr)) AS s, (n, d) AS tuple FROM system.numbers LIMIT 2 FORMAT RowBinaryWithNamesAndTypes;
 SELECT number * 246 + 10 AS n, toDate('2000-01-01') + n AS d, range(n) AS arr, arrayStringConcat(arrayMap(x -> reinterpretAsString(x), arr)) AS s, (n, d) AS tuple FROM system.numbers LIMIT 2 FORMAT TabSeparatedWithNamesAndTypes;
 SELECT number * 246 + 10 AS n, toDate('2000-01-01') + n AS d, range(n) AS arr, arrayStringConcat(arrayMap(x -> reinterpretAsString(x), arr)) AS s, (n, d) AS tuple FROM system.numbers LIMIT 2 FORMAT TabSeparatedRaw;
 SELECT number * 246 + 10 AS n, toDate('2000-01-01') + n AS d, range(n) AS arr, arrayStringConcat(arrayMap(x -> reinterpretAsString(x), arr)) AS s, (n, d) AS tuple FROM system.numbers LIMIT 2 FORMAT CSV;

--- a/docs/en/interfaces/formats.md
+++ b/docs/en/interfaces/formats.md
@@ -448,6 +448,13 @@ Array is represented as a varint length (unsigned [LEB128](https://en.wikipedia.

 For [NULL](../query_language/syntax.md#null-literal) support, an additional byte containing 1 or 0 is added before each [Nullable](../data_types/nullable.md) value. If 1, then the value is `NULL` and this byte is interpreted as a separate value. If 0, the value after the byte is not `NULL`.

+## RowBinaryWithNamesAndTypes {#rowbinarywithnamesandtypes}
+
+Similar to [RowBinary](#rowbinary), but with an added header:
+* [LEB128](https://en.wikipedia.org/wiki/LEB128)-encoded number of columns (N)
+* N `String`s specifying column names
+* N `String`s specifying column types
+
 ## Values

 Prints every row in brackets. Rows are separated by commas. There is no comma after the last row. The values inside the brackets are also comma-separated. Numbers are output in decimal format without quotes. Arrays are output in square brackets. Strings, dates, and dates with times are output in quotes. Escaping rules and parsing are similar to the [TabSeparated](#tabseparated) format. During formatting, extra spaces aren't inserted, but during parsing, they are allowed and skipped (except for spaces inside array values, which are not allowed). [NULL](../query_language/syntax.md) is represented as `NULL`.