Merge pull request #9759 from Vxider/distributed_insert_select

Distributed INSERT SELECT

Merge pull request #9759 from Vxider/distributed_insert_select
Distributed INSERT SELECT
25cf8273 · alexey-milovidov · GitHub · 9a2760d9 · 9f72c280 · 25cf8273
4 changed files
--- a/dbms/src/Core/Settings.h
+++ b/dbms/src/Core/Settings.h
@@ -112,6 +112,7 @@ struct Settings : public SettingsCollection<Settings>
    M(SettingBool, skip_unavailable_shards, false, "If 1, ClickHouse silently skips unavailable shards and nodes unresolvable through DNS. Shard is marked as unavailable when none of the replicas can be reached.", 0) \
    \
    M(SettingBool, distributed_group_by_no_merge, false, "Do not merge aggregation states from different servers for distributed query processing - in case it is for certain that there are different keys on different shards.", 0) \
+    M(SettingBool, parallel_distributed_insert_select, false, "If true, distributed insert select query in the same cluster will be processed on local tables on every shard", 0) \
    M(SettingBool, optimize_skip_unused_shards, false, "Assumes that data is distributed by sharding_key. Optimization to skip unused shards if SELECT query filters by sharding_key.", 0) \
    M(SettingUInt64, force_optimize_skip_unused_shards, 0, "Throw an exception if unused shards cannot be skipped (1 - throw only if the table has the sharding key, 2 - always throw.", 0) \
    M(SettingBool, force_optimize_skip_unused_shards_no_nested, false, "Do not apply force_optimize_skip_unused_shards for nested Distributed tables.", 0) \

--- a/dbms/src/Interpreters/InterpreterInsertQuery.cpp
+++ b/dbms/src/Interpreters/InterpreterInsertQuery.cpp
@@ -7,18 +7,24 @@
 #include <DataStreams/CountingBlockOutputStream.h>
 #include <DataStreams/InputStreamFromASTInsertQuery.h>
 #include <DataStreams/NullAndDoCopyBlockInputStream.h>
+#include <DataStreams/NullBlockOutputStream.h>
 #include <DataStreams/OwningBlockInputStream.h>
 #include <DataStreams/PushingToViewsBlockOutputStream.h>
+#include <DataStreams/RemoteBlockInputStream.h>
 #include <DataStreams/SquashingBlockOutputStream.h>
 #include <DataStreams/copyData.h>
 #include <IO/ConcatReadBuffer.h>
 #include <IO/ReadBufferFromMemory.h>
 #include <Interpreters/InterpreterSelectWithUnionQuery.h>
 #include <Access/AccessFlags.h>
+#include <Interpreters/JoinedTables.h>
 #include <Parsers/ASTFunction.h>
 #include <Parsers/ASTInsertQuery.h>
+#include <Parsers/ASTSelectQuery.h>
 #include <Parsers/ASTSelectWithUnionQuery.h>
+#include <Parsers/queryToString.h>
 #include <Storages/Kafka/StorageKafka.h>
+#include <Storages/StorageDistributed.h>
 #include <TableFunctions/TableFunctionFactory.h>
 #include <Common/checkStackSize.h>

@@ -31,6 +37,7 @@ namespace ErrorCodes
    extern const int NO_SUCH_COLUMN_IN_TABLE;
    extern const int ILLEGAL_COLUMN;
    extern const int DUPLICATE_COLUMN;
+    extern const int LOGICAL_ERROR;
 }


@@ -109,61 +116,133 @@ BlockIO InterpreterInsertQuery::execute()
        context.checkAccess(AccessType::INSERT, query.table_id, query_sample_block.getNames());

    BlockInputStreams in_streams;
-    size_t out_streams_size = 1;
-    if (query.select)
-    {
-        /// Passing 1 as subquery_depth will disable limiting size of intermediate result.
-        InterpreterSelectWithUnionQuery interpreter_select{query.select, context, SelectQueryOptions(QueryProcessingStage::Complete, 1)};
+    BlockOutputStreams out_streams;
+    bool is_distributed_insert_select = false;

-        if (table->supportsParallelInsert() && settings.max_insert_threads > 1)
+    if (query.select && table->isRemote() && settings.parallel_distributed_insert_select)
+    {
+        // Distributed INSERT SELECT
+        std::shared_ptr<StorageDistributed> storage_src;
+        auto & select = query.select->as<ASTSelectWithUnionQuery &>();
+        auto new_query = std::dynamic_pointer_cast<ASTInsertQuery>(query.clone());
+        if (select.list_of_selects->children.size() == 1)
        {
-            in_streams = interpreter_select.executeWithMultipleStreams(res.pipeline);
-            out_streams_size = std::min(size_t(settings.max_insert_threads), in_streams.size());
+            auto & select_query = select.list_of_selects->children.at(0)->as<ASTSelectQuery &>();
+            JoinedTables joined_tables(Context(context), select_query);
+
+            if (joined_tables.tablesCount() == 1)
+            {
+                storage_src = std::dynamic_pointer_cast<StorageDistributed>(joined_tables.getLeftTableStorage());
+                if (storage_src)
+                {
+                    const auto select_with_union_query = std::make_shared<ASTSelectWithUnionQuery>();
+                    select_with_union_query->list_of_selects = std::make_shared<ASTExpressionList>();
+
+                    auto new_select_query = std::dynamic_pointer_cast<ASTSelectQuery>(select_query.clone());
+                    select_with_union_query->list_of_selects->children.push_back(new_select_query);
+
+                    new_select_query->replaceDatabaseAndTable(storage_src->getRemoteDatabaseName(), storage_src->getRemoteTableName());
+
+                    new_query->select = select_with_union_query;
+                }
+            }
        }
-        else
+
+        auto storage_dst = std::dynamic_pointer_cast<StorageDistributed>(table);
+
+        if (storage_src && storage_dst && storage_src->cluster_name == storage_dst->cluster_name)
        {
-            res = interpreter_select.execute();
-            in_streams.emplace_back(res.in);
-            res.in = nullptr;
-            res.out = nullptr;
+            is_distributed_insert_select = true;
+
+            const auto & cluster = storage_src->getCluster();
+            const auto & shards_info = cluster->getShardsInfo();
+
+            String new_query_str = queryToString(new_query);
+            for (size_t shard_index : ext::range(0, shards_info.size()))
+            {
+                const auto & shard_info = shards_info[shard_index];
+                if (shard_info.isLocal())
+                {
+                    InterpreterInsertQuery interpreter(new_query, context);
+                    auto block_io = interpreter.execute();
+                    in_streams.push_back(block_io.in);
+                }
+                else
+                {
+                    auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(settings);
+                    auto connections = shard_info.pool->getMany(timeouts, &settings, PoolMode::GET_ONE);
+                    if (connections.empty() || connections.front().isNull())
+                        throw Exception(
+                            "Expected exactly one connection for shard " + toString(shard_info.shard_num), ErrorCodes::LOGICAL_ERROR);
+
+                    ///  INSERT SELECT query returns empty block
+                    auto in_stream = std::make_shared<RemoteBlockInputStream>(*connections.front(), new_query_str, Block{}, context);
+                    in_streams.push_back(in_stream);
+                }
+                out_streams.push_back(std::make_shared<NullBlockOutputStream>(Block()));
+            }
        }
    }

-    BlockOutputStreams out_streams;
-
-    for (size_t i = 0; i < out_streams_size; i++)
+    if (!is_distributed_insert_select)
    {
-        /// We create a pipeline of several streams, into which we will write data.
-        BlockOutputStreamPtr out;
-
-        /// NOTE: we explicitly ignore bound materialized views when inserting into Kafka Storage.
-        ///       Otherwise we'll get duplicates when MV reads same rows again from Kafka.
-        if (table->noPushingToViews() && !no_destination)
-            out = table->write(query_ptr, context);
-        else
-            out = std::make_shared<PushingToViewsBlockOutputStream>(table, context, query_ptr, no_destination);
-
-        /// Do not squash blocks if it is a sync INSERT into Distributed, since it lead to double bufferization on client and server side.
-        /// Client-side bufferization might cause excessive timeouts (especially in case of big blocks).
-        if (!(context.getSettingsRef().insert_distributed_sync && table->isRemote()) && !no_squash)
+        size_t out_streams_size = 1;
+        if (query.select)
        {
-            out = std::make_shared<SquashingBlockOutputStream>(
-                out, out->getHeader(), context.getSettingsRef().min_insert_block_size_rows, context.getSettingsRef().min_insert_block_size_bytes);
+            /// Passing 1 as subquery_depth will disable limiting size of intermediate result.
+            InterpreterSelectWithUnionQuery interpreter_select{ query.select, context, SelectQueryOptions(QueryProcessingStage::Complete, 1)};
+
+            if (table->supportsParallelInsert() && settings.max_insert_threads > 1)
+            {
+                in_streams = interpreter_select.executeWithMultipleStreams(res.pipeline);
+                out_streams_size = std::min(size_t(settings.max_insert_threads), in_streams.size());
+            }
+            else
+            {
+                res = interpreter_select.execute();
+                in_streams.emplace_back(res.in);
+                res.in = nullptr;
+                res.out = nullptr;
+            }
        }

-        /// Actually we don't know structure of input blocks from query/table,
-        /// because some clients break insertion protocol (columns != header)
-        out = std::make_shared<AddingDefaultBlockOutputStream>(
-            out, query_sample_block, out->getHeader(), table->getColumns().getDefaults(), context);
-
-        if (const auto & constraints = table->getConstraints(); !constraints.empty())
-            out = std::make_shared<CheckConstraintsBlockOutputStream>(query.table_id,
-             out, query_sample_block, table->getConstraints(), context);
-
-        auto out_wrapper = std::make_shared<CountingBlockOutputStream>(out);
-        out_wrapper->setProcessListElement(context.getProcessListElement());
-        out = std::move(out_wrapper);
-        out_streams.emplace_back(std::move(out));
+        for (size_t i = 0; i < out_streams_size; i++)
+        {
+            /// We create a pipeline of several streams, into which we will write data.
+            BlockOutputStreamPtr out;
+
+            /// NOTE: we explicitly ignore bound materialized views when inserting into Kafka Storage.
+            ///       Otherwise we'll get duplicates when MV reads same rows again from Kafka.
+            if (table->noPushingToViews() && !no_destination)
+                out = table->write(query_ptr, context);
+            else
+                out = std::make_shared<PushingToViewsBlockOutputStream>(table, context, query_ptr, no_destination);
+
+            /// Do not squash blocks if it is a sync INSERT into Distributed, since it lead to double bufferization on client and server side.
+            /// Client-side bufferization might cause excessive timeouts (especially in case of big blocks).
+            if (!(context.getSettingsRef().insert_distributed_sync && table->isRemote()) && !no_squash)
+            {
+                out = std::make_shared<SquashingBlockOutputStream>(
+                    out,
+                    out->getHeader(),
+                    context.getSettingsRef().min_insert_block_size_rows,
+                    context.getSettingsRef().min_insert_block_size_bytes);
+            }
+
+            /// Actually we don't know structure of input blocks from query/table,
+            /// because some clients break insertion protocol (columns != header)
+            out = std::make_shared<AddingDefaultBlockOutputStream>(
+                out, query_sample_block, out->getHeader(), table->getColumns().getDefaults(), context);
+
+            if (const auto & constraints = table->getConstraints(); !constraints.empty())
+                out = std::make_shared<CheckConstraintsBlockOutputStream>(
+                    query.table_id, out, query_sample_block, table->getConstraints(), context);
+
+            auto out_wrapper = std::make_shared<CountingBlockOutputStream>(out);
+            out_wrapper->setProcessListElement(context.getProcessListElement());
+            out = std::move(out_wrapper);
+            out_streams.emplace_back(std::move(out));
+        }
    }

    /// What type of query: INSERT or INSERT SELECT?

--- a/dbms/tests/queries/0_stateless/01099_parallel_distributed_insert_select.reference
+++ b/dbms/tests/queries/0_stateless/01099_parallel_distributed_insert_select.reference
+test_shard_localhost
+0
+1
+2
+test_cluster_two_shards_localhost
+0	2
+1	2
+2	2
+test_cluster_two_shards
+0	2
+1	2
+2	2
--- a/dbms/tests/queries/0_stateless/01099_parallel_distributed_insert_select.sql
+++ b/dbms/tests/queries/0_stateless/01099_parallel_distributed_insert_select.sql
+DROP TABLE IF EXISTS local_01099_a;
+DROP TABLE IF EXISTS local_01099_b;
+DROP TABLE IF EXISTS distributed_01099_a;
+DROP TABLE IF EXISTS distributed_01099_b;

+SET parallel_distributed_insert_select=1;

+-- Every section below creates two Log tables plus a Distributed table over each, fills local_01099_a with 0..2, and runs INSERT ... SELECT between the two Distributed tables with parallel_distributed_insert_select=1.
+-- test_shard_localhost: the result is read back through distributed_01099_b; the reference file expects 0, 1, 2 once each.
+-- NOTE(review): presumably a single-shard cluster pointing at the local server - confirm against the test cluster config.

+SELECT 'test_shard_localhost';

+CREATE TABLE local_01099_a (number UInt64) ENGINE = Log;
+CREATE TABLE local_01099_b (number UInt64) ENGINE = Log;
+CREATE TABLE distributed_01099_a AS local_01099_a ENGINE = Distributed('test_shard_localhost', currentDatabase(), local_01099_a, rand());
+CREATE TABLE distributed_01099_b AS local_01099_b ENGINE = Distributed('test_shard_localhost', currentDatabase(), local_01099_b, rand());

+INSERT INTO local_01099_a SELECT number from system.numbers limit 3;
+INSERT INTO distributed_01099_b SELECT * from distributed_01099_a;

+SELECT * FROM distributed_01099_b;

+DROP TABLE local_01099_a;
+DROP TABLE local_01099_b;
+DROP TABLE distributed_01099_a;
+DROP TABLE distributed_01099_b;

+-- test_cluster_two_shards_localhost: same scenario, but the result is counted directly in local_01099_b instead of being read through the Distributed table.
+-- The reference file expects every number 0..2 to appear twice.
+-- NOTE(review): presumably because both shards resolve to this server and each one runs the INSERT ... SELECT on the local tables - confirm.

+SELECT 'test_cluster_two_shards_localhost';

+CREATE TABLE local_01099_a (number UInt64) ENGINE = Log;
+CREATE TABLE local_01099_b (number UInt64) ENGINE = Log;
+CREATE TABLE distributed_01099_a AS local_01099_a ENGINE = Distributed('test_cluster_two_shards_localhost', currentDatabase(), local_01099_a, rand());
+CREATE TABLE distributed_01099_b AS local_01099_b ENGINE = Distributed('test_cluster_two_shards_localhost', currentDatabase(), local_01099_b, rand());

+INSERT INTO local_01099_a SELECT number from system.numbers limit 3;
+INSERT INTO distributed_01099_b SELECT * from distributed_01099_a;

+SELECT number, count(number) FROM local_01099_b group by number order by number;

+DROP TABLE local_01099_a;
+DROP TABLE local_01099_b;
+DROP TABLE distributed_01099_a;
+DROP TABLE distributed_01099_b;

+-- test_cluster_two_shards: same scenario, with SYSTEM FLUSH DISTRIBUTED issued on distributed_01099_b before local_01099_b is checked.
+-- The reference file again expects every number 0..2 to appear twice.
+-- NOTE(review): the flush presumably covers data queued asynchronously by the Distributed engine for shards not treated as local - confirm.

+SELECT 'test_cluster_two_shards';

+CREATE TABLE local_01099_a (number UInt64) ENGINE = Log;
+CREATE TABLE local_01099_b (number UInt64) ENGINE = Log;
+CREATE TABLE distributed_01099_a AS local_01099_a ENGINE = Distributed('test_cluster_two_shards', currentDatabase(), local_01099_a, rand());
+CREATE TABLE distributed_01099_b AS local_01099_b ENGINE = Distributed('test_cluster_two_shards', currentDatabase(), local_01099_b, rand());

+INSERT INTO local_01099_a SELECT number from system.numbers limit 3;
+INSERT INTO distributed_01099_b SELECT * from distributed_01099_a;
+SYSTEM FLUSH DISTRIBUTED distributed_01099_b;

+SELECT number, count(number) FROM local_01099_b group by number order by number;

+DROP TABLE local_01099_a;
+DROP TABLE local_01099_b;
+DROP TABLE distributed_01099_a;
+DROP TABLE distributed_01099_b;