ExpressionAnalyzer.cpp 41.6 KB
Newer Older
1
#include <Poco/Util/Application.h>
A
Alexey Milovidov 已提交
2
#include <Poco/String.h>
3

4 5
#include <Core/Block.h>

6 7 8 9
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTAsterisk.h>
10
#include <Parsers/ASTQualifiedAsterisk.h>
11 12
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTSelectQuery.h>
13
#include <Parsers/ASTSelectWithUnionQuery.h>
14 15
#include <Parsers/ASTSubquery.h>
#include <Parsers/ASTOrderByElement.h>
16
#include <Parsers/formatAST.h>
17
#include <Parsers/DumpASTNode.h>
18

19
#include <DataTypes/DataTypeNullable.h>
20
#include <DataTypes/NestedUtils.h>
21
#include <DataTypes/DataTypesNumber.h>
V
Vadim 已提交
22

23
#include <Columns/IColumn.h>
24

25
#include <Interpreters/InterpreterSelectWithUnionQuery.h>
26 27 28 29
#include <Interpreters/ExpressionAnalyzer.h>
#include <Interpreters/ExpressionActions.h>
#include <Interpreters/InJoinSubqueriesPreprocessor.h>
#include <Interpreters/LogicalExpressionsOptimizer.h>
30
#include <Interpreters/PredicateExpressionsOptimizer.h>
31 32 33
#include <Interpreters/ExternalDictionaries.h>
#include <Interpreters/Set.h>
#include <Interpreters/Join.h>
34

35
#include <AggregateFunctions/AggregateFunctionFactory.h>
36
#include <AggregateFunctions/parseAggregateFunctionParameters.h>
37

38 39 40
#include <Storages/StorageDistributed.h>
#include <Storages/StorageMemory.h>
#include <Storages/StorageJoin.h>
41

42 43
#include <DataStreams/LazyBlockInputStream.h>
#include <DataStreams/copyData.h>
44

45
#include <Dictionaries/IDictionary.h>
46

47
#include <Common/typeid_cast.h>
48
#include <Common/StringUtils/StringUtils.h>
49

50
#include <Parsers/formatAST.h>
51

52
#include <ext/range.h>
53
#include <DataTypes/DataTypeFactory.h>
54
#include <Functions/FunctionsMiscellaneous.h>
55
#include <Parsers/queryToString.h>
56 57
#include <Parsers/ExpressionListParsers.h>
#include <Parsers/parseQuery.h>
N
Nikolai Kochetov 已提交
58
#include <Parsers/queryToString.h>
59
#include <Interpreters/interpretSubquery.h>
60
#include <Interpreters/DatabaseAndTableWithAlias.h>
61
#include <Interpreters/QueryNormalizer.h>
62

63
#include <Interpreters/ActionsVisitor.h>
64 65 66
#include <Interpreters/ExternalTablesVisitor.h>
#include <Interpreters/GlobalSubqueriesVisitor.h>
#include <Interpreters/RequiredSourceColumnsVisitor.h>
67 68 69 70

namespace DB
{

C
chertus 已提交
71
using LogAST = DebugASTLog<false>; /// set to true to enable logs
72 73


74 75
/// Error codes referenced by this translation unit. The values are defined elsewhere.
namespace ErrorCodes
{
    extern const int UNKNOWN_IDENTIFIER;
    extern const int ILLEGAL_AGGREGATION;
    extern const int EXPECTED_ALL_OR_ANY;
    /// Used by assertSelect() and assertAggregation(); declared here explicitly
    /// instead of relying on a declaration leaking in from an included header.
    extern const int LOGICAL_ERROR;
}

81
/// Builds the analyzer for `query_` and immediately runs the analysis stages:
/// collecting used columns, processing global subqueries / external tables, and analyzing aggregation.
ExpressionAnalyzer::ExpressionAnalyzer(
    const ASTPtr & query_,
    const SyntaxAnalyzerResultPtr & syntax_analyzer_result_,
    const Context & context_,
    const NamesAndTypesList & additional_source_columns,
    const NameSet & required_result_columns_,
    size_t subquery_depth_,
    bool do_global_,
    const SubqueriesForSets & subqueries_for_sets_)
    : ExpressionAnalyzerData(syntax_analyzer_result_->source_columns, required_result_columns_, subqueries_for_sets_)
    , query(query_)
    , context(context_)
    , settings(context.getSettings())
    , subquery_depth(subquery_depth_)
    , do_global(do_global_)
    , syntax(syntax_analyzer_result_)
{
    /// Null if the query is not a SELECT.
    select_query = typeid_cast<ASTSelectQuery *>(query.get());

    /// Take over what the syntax analysis stage has already found.
    storage = syntax->storage;
    rewrite_subqueries = syntax->rewrite_subqueries;

    /// Extra columns supplied by the caller are appended to the source columns and then deduplicated.
    const bool has_additional_columns = !additional_source_columns.empty();
    if (has_additional_columns)
    {
        source_columns.insert(source_columns.end(), additional_source_columns.begin(), additional_source_columns.end());
        removeDuplicateColumns(source_columns);
    }

    /// Drop what is not needed from `source_columns`. Form `columns_added_by_join`.
    collectUsedColumns();

    /// Fills external_tables and subqueries_for_sets for global subqueries.
    /// Global subqueries are replaced with generated names of temporary tables that will be sent to remote servers.
    initGlobalSubqueriesAndExternalTables();

    /// Fills has_aggregation, aggregation_keys, aggregate_descriptions, aggregated_columns.
    /// This must run after global subqueries have been processed. Otherwise, if an aggregate function
    /// contains a global subquery, `analyzeAggregation` would store in `aggregate_descriptions` the parameters
    /// of that function including the global subquery; a later `initGlobalSubqueriesAndExternalTables` call
    /// would replace the subquery with a temporary table, leaving `aggregate_descriptions` out of date
    /// and causing an error when the query is executed.
    analyzeAggregation();
}

C
chertus 已提交
123
bool ExpressionAnalyzer::isRemoteStorage() const
124
{
C
chertus 已提交
125
    return storage && storage->isRemote();
126 127 128
}


129 130
void ExpressionAnalyzer::analyzeAggregation()
{
F
f1yegor 已提交
131 132 133
    /** Find aggregation keys (aggregation_keys), information about aggregate functions (aggregate_descriptions),
     *  as well as a set of columns obtained after the aggregation, if any,
     *  or after all the actions that are usually performed before aggregation (aggregated_columns).
134
     *
F
f1yegor 已提交
135
     * Everything below (compiling temporary ExpressionActions) - only for the purpose of query analysis (type output).
136 137 138 139 140
     */

    if (select_query && (select_query->group_expression_list || select_query->having_expression))
        has_aggregation = true;

141
    ExpressionActionsPtr temp_actions = std::make_shared<ExpressionActions>(source_columns, context);
142 143 144

    if (select_query)
    {
145 146 147 148 149 150 151 152 153
        bool is_array_join_left;
        ASTPtr array_join_expression_list = select_query->array_join_expression_list(is_array_join_left);
        if (array_join_expression_list)
        {
            getRootActions(array_join_expression_list, true, temp_actions);
            addMultipleArrayJoinAction(temp_actions, is_array_join_left);
            array_join_columns = temp_actions->getSampleBlock().getNamesAndTypesList();
        }

154 155 156
        const ASTTablesInSelectQueryElement * join = select_query->join();
        if (join)
        {
157 158
            const auto table_join = static_cast<const ASTTableJoin &>(*join->table_join);
            if (table_join.using_expression_list)
159
                getRootActions(table_join.using_expression_list, true, temp_actions);
160
            if (table_join.on_expression)
N
Nikolai Kochetov 已提交
161
                for (const auto & key_ast : analyzedJoin().key_asts_left)
162
                    getRootActions(key_ast, true, temp_actions);
163 164 165 166 167

            addJoinAction(temp_actions, true);
        }
    }

168
    getAggregates(query, temp_actions);
169 170 171 172 173 174 175 176 177 178 179 180 181

    if (has_aggregation)
    {
        assertSelect();

        /// Find out aggregation keys.
        if (select_query->group_expression_list)
        {
            NameSet unique_keys;
            ASTs & group_asts = select_query->group_expression_list->children;
            for (ssize_t i = 0; i < ssize_t(group_asts.size()); ++i)
            {
                ssize_t size = group_asts.size();
182
                getRootActions(group_asts[i], true, temp_actions);
183 184 185 186 187 188 189 190 191 192

                const auto & column_name = group_asts[i]->getColumnName();
                const auto & block = temp_actions->getSampleBlock();

                if (!block.has(column_name))
                    throw Exception("Unknown identifier (in GROUP BY): " + column_name, ErrorCodes::UNKNOWN_IDENTIFIER);

                const auto & col = block.getByName(column_name);

                /// Constant expressions have non-null column pointer at this stage.
193
                if (col.column && col.column->isColumnConst())
194 195 196 197 198 199 200 201 202 203 204 205 206 207
                {
                    /// But don't remove last key column if no aggregate functions, otherwise aggregation will not work.
                    if (!aggregate_descriptions.empty() || size > 1)
                    {
                        if (i + 1 < static_cast<ssize_t>(size))
                            group_asts[i] = std::move(group_asts.back());

                        group_asts.pop_back();

                        --i;
                        continue;
                    }
                }

208
                NameAndTypePair key{column_name, col.type};
209 210 211 212 213

                /// Aggregation keys are uniqued.
                if (!unique_keys.count(key.name))
                {
                    unique_keys.insert(key.name);
214
                    aggregation_keys.push_back(key);
215 216

                    /// Key is no longer needed, therefore we can save a little by moving it.
217
                    aggregated_columns.push_back(std::move(key));
218 219 220 221 222 223
                }
            }

            if (group_asts.empty())
            {
                select_query->group_expression_list = nullptr;
224
                has_aggregation = select_query->having_expression || aggregate_descriptions.size();
225 226 227 228 229 230
            }
        }

        for (size_t i = 0; i < aggregate_descriptions.size(); ++i)
        {
            AggregateDescription & desc = aggregate_descriptions[i];
231
            aggregated_columns.emplace_back(desc.column_name, desc.function->getReturnType());
232 233
        }
    }
234 235 236 237
    else
    {
        aggregated_columns = temp_actions->getSampleBlock().getNamesAndTypesList();
    }
238 239 240
}


241 242
void ExpressionAnalyzer::initGlobalSubqueriesAndExternalTables()
{
F
f1yegor 已提交
243
    /// Adds existing external tables (not subqueries) to the external_tables dictionary.
244
    ExternalTablesVisitor::Data tables_data{context, external_tables};
245
    ExternalTablesVisitor(tables_data).visit(query);
246

C
chertus 已提交
247 248
    if (do_global)
    {
249
        GlobalSubqueriesVisitor::Data subqueries_data(context, subquery_depth, isRemoteStorage(),
C
chertus 已提交
250
                                                   external_tables, subqueries_for_sets, has_global_subqueries);
251
        GlobalSubqueriesVisitor(subqueries_data).visit(query);
C
chertus 已提交
252
    }
253 254 255
}


256
void ExpressionAnalyzer::makeSetsForIndex()
P
Pavel Kartavyy 已提交
257
{
258 259 260
    if (storage && select_query && storage->supportsIndexForIn())
    {
        if (select_query->where_expression)
261
            makeSetsForIndexImpl(select_query->where_expression);
262
        if (select_query->prewhere_expression)
263
            makeSetsForIndexImpl(select_query->prewhere_expression);
264
    }
P
Pavel Kartavyy 已提交
265 266
}

267

268
void ExpressionAnalyzer::tryMakeSetForIndexFromSubquery(const ASTPtr & subquery_or_table_name)
269
{
270 271 272
    auto set_key = PreparedSetKey::forSubquery(*subquery_or_table_name);
    if (prepared_sets.count(set_key))
        return; /// Already prepared.
273

274 275
    auto interpreter_subquery = interpretSubquery(subquery_or_table_name, context, subquery_depth + 1, {});
    BlockIO res = interpreter_subquery->execute();
276

277
    SetPtr set = std::make_shared<Set>(settings.size_limits_for_set, true);
278
    set->setHeader(res.in->getHeader());
279

280 281 282
    while (Block block = res.in->read())
    {
        /// If the limits have been exceeded, give up and let the default subquery processing actions take place.
A
Alexey Milovidov 已提交
283
        if (!set->insertFromBlock(block))
284 285 286
            return;
    }

287
    prepared_sets[set_key] = std::move(set);
288 289 290
}


291
void ExpressionAnalyzer::makeSetsForIndexImpl(const ASTPtr & node)
P
Pavel Kartavyy 已提交
292
{
293
    for (auto & child : node->children)
294
    {
295
        /// Don't descend into subqueries.
296 297 298
        if (typeid_cast<ASTSubquery *>(child.get()))
            continue;

299
        /// Don't descend into lambda functions
300 301 302 303
        const ASTFunction * func = typeid_cast<const ASTFunction *>(child.get());
        if (func && func->name == "lambda")
            continue;

304
        makeSetsForIndexImpl(child);
305
    }
306

307
    const ASTFunction * func = typeid_cast<const ASTFunction *>(node.get());
308
    if (func && functionIsInOperator(func->name))
309
    {
310
        const IAST & args = *func->arguments;
311

312
        if (storage && storage->mayBenefitFromIndexForIn(args.children.at(0)))
313
        {
314
            const ASTPtr & arg = args.children.at(1);
315
            if (typeid_cast<ASTSubquery *>(arg.get()) || isIdentifier(arg))
316
            {
317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332
                if (settings.use_index_for_in_with_subqueries)
                    tryMakeSetForIndexFromSubquery(arg);
            }
            else
            {
                NamesAndTypesList temp_columns = source_columns;
                temp_columns.insert(temp_columns.end(), array_join_columns.begin(), array_join_columns.end());
                for (const auto & joined_column : columns_added_by_join)
                    temp_columns.push_back(joined_column.name_and_type);
                ExpressionActionsPtr temp_actions = std::make_shared<ExpressionActions>(temp_columns, context);
                getRootActions(func->arguments->children.at(0), true, temp_actions);

                Block sample_block_with_calculated_columns = temp_actions->getSampleBlock();
                if (sample_block_with_calculated_columns.has(args.children.at(0)->getColumnName()))
                    makeExplicitSet(func, sample_block_with_calculated_columns, true, context,
                        settings.size_limits_for_set, prepared_sets);
333 334 335
            }
        }
    }
336 337
}

338 339

void ExpressionAnalyzer::getRootActions(const ASTPtr & ast, bool no_subqueries, ExpressionActionsPtr & actions, bool only_consts)
340
{
341
    LogAST log;
342
    ActionsVisitor actions_visitor(context, settings.size_limits_for_set, subquery_depth,
343
                                   source_columns, actions, prepared_sets, subqueries_for_sets,
C
chertus 已提交
344
                                   no_subqueries, only_consts, !isRemoteStorage(), log.stream());
345 346 347 348 349 350 351 352
    actions_visitor.visit(ast);
    actions = actions_visitor.popActionsLevel();
}


void ExpressionAnalyzer::getActionsFromJoinKeys(const ASTTableJoin & table_join, bool no_subqueries, ExpressionActionsPtr & actions)
{
    bool only_consts = false;
353

354
    LogAST log;
355
    ActionsVisitor actions_visitor(context, settings.size_limits_for_set, subquery_depth,
356
                                   source_columns, actions, prepared_sets, subqueries_for_sets,
C
chertus 已提交
357
                                   no_subqueries, only_consts, !isRemoteStorage(), log.stream());
358

359
    if (table_join.using_expression_list)
360
        actions_visitor.visit(table_join.using_expression_list);
361 362
    else if (table_join.on_expression)
    {
N
Nikolai Kochetov 已提交
363
        for (const auto & ast : analyzedJoin().key_asts_left)
364
            actions_visitor.visit(ast);
365 366
    }

367
    actions = actions_visitor.popActionsLevel();
368 369
}

370

371
void ExpressionAnalyzer::getAggregates(const ASTPtr & ast, ExpressionActionsPtr & actions)
372
{
F
f1yegor 已提交
373
    /// There can not be aggregate functions inside the WHERE and PREWHERE.
374 375 376 377 378 379
    if (select_query && (ast.get() == select_query->where_expression.get() || ast.get() == select_query->prewhere_expression.get()))
    {
        assertNoAggregates(ast, "in WHERE or PREWHERE");
        return;
    }

F
f1yegor 已提交
380
    /// If we are not analyzing a SELECT query, but a separate expression, then there can not be aggregate functions in it.
381 382 383 384 385 386 387
    if (!select_query)
    {
        assertNoAggregates(ast, "in wrong place");
        return;
    }

    const ASTFunction * node = typeid_cast<const ASTFunction *>(ast.get());
388
    if (node && AggregateFunctionFactory::instance().isAggregateFunctionName(node->name))
389 390 391 392 393
    {
        has_aggregation = true;
        AggregateDescription aggregate;
        aggregate.column_name = node->getColumnName();

F
f1yegor 已提交
394
        /// Make unique aggregate functions.
395 396 397 398 399 400 401 402 403 404
        for (size_t i = 0; i < aggregate_descriptions.size(); ++i)
            if (aggregate_descriptions[i].column_name == aggregate.column_name)
                return;

        const ASTs & arguments = node->arguments->children;
        aggregate.argument_names.resize(arguments.size());
        DataTypes types(arguments.size());

        for (size_t i = 0; i < arguments.size(); ++i)
        {
F
f1yegor 已提交
405
            /// There can not be other aggregate functions within the aggregate functions.
406 407
            assertNoAggregates(arguments[i], "inside another aggregate function");

408
            getRootActions(arguments[i], true, actions);
409 410 411 412 413
            const std::string & name = arguments[i]->getColumnName();
            types[i] = actions->getSampleBlock().getByName(name).type;
            aggregate.argument_names[i] = name;
        }

414 415
        aggregate.parameters = (node->parameters) ? getAggregateFunctionParametersArray(node->parameters) : Array();
        aggregate.function = AggregateFunctionFactory::instance().get(node->name, types, aggregate.parameters);
416 417 418 419 420 421 422 423 424 425

        aggregate_descriptions.push_back(aggregate);
    }
    else
    {
        for (const auto & child : ast->children)
            if (!typeid_cast<const ASTSubquery *>(child.get())
                && !typeid_cast<const ASTSelectQuery *>(child.get()))
                getAggregates(child, actions);
    }
426 427
}

428 429 430

void ExpressionAnalyzer::assertNoAggregates(const ASTPtr & ast, const char * description)
{
431
    const ASTFunction * node = typeid_cast<const ASTFunction *>(ast.get());
432

433
    if (node && AggregateFunctionFactory::instance().isAggregateFunctionName(node->name))
434 435
        throw Exception("Aggregate function " + node->getColumnName()
            + " is found " + String(description) + " in query", ErrorCodes::ILLEGAL_AGGREGATION);
436

437 438 439 440
    for (const auto & child : ast->children)
        if (!typeid_cast<const ASTSubquery *>(child.get())
            && !typeid_cast<const ASTSelectQuery *>(child.get()))
            assertNoAggregates(child, description);
441 442 443
}


444
void ExpressionAnalyzer::assertSelect() const
445
{
446 447
    if (!select_query)
        throw Exception("Not a select query", ErrorCodes::LOGICAL_ERROR);
448
}
449

450
void ExpressionAnalyzer::assertAggregation() const
451
{
452 453
    if (!has_aggregation)
        throw Exception("No aggregation", ErrorCodes::LOGICAL_ERROR);
454
}
455

456
/// Makes sure the chain has at least one step: an empty chain gets a first step built from `columns`.
/// A non-empty chain is left untouched.
void ExpressionAnalyzer::initChain(ExpressionActionsChain & chain, const NamesAndTypesList & columns) const
{
    if (!chain.steps.empty())
        return;

    chain.steps.emplace_back(std::make_shared<ExpressionActions>(columns, context));
}
463

464
/// "Big" ARRAY JOIN.
465
/// Appends to `actions` the column copies needed for renamed ARRAY JOIN results,
/// followed by a single arrayJoin action over all result columns.
void ExpressionAnalyzer::addMultipleArrayJoinAction(ExpressionActionsPtr & actions, bool array_join_is_left) const
{
    NameSet joined_names;

    for (const auto & mapping : syntax->array_join_result_to_source)
    {
        const auto & result_name = mapping.first;
        const auto & source_name = mapping.second;

        /// Give the column its new name, if it differs from the source one.
        if (result_name != source_name)
            actions->add(ExpressionAction::copyColumn(source_name, result_name));

        /// ARRAY JOIN (replacing arrays with their elements) is done on the columns under the new names.
        joined_names.insert(result_name);
    }

    actions->add(ExpressionAction::arrayJoin(joined_names, array_join_is_left, context));
}

481
/// Adds the ARRAY JOIN actions to the last step of the chain.
/// Returns false, leaving the chain untouched, if the query has no ARRAY JOIN section.
bool ExpressionAnalyzer::appendArrayJoin(ExpressionActionsChain & chain, bool only_types)
{
    assertSelect();

    bool is_left;
    ASTPtr expression_list = select_query->array_join_expression_list(is_left);
    if (expression_list)
    {
        initChain(chain, source_columns);
        ExpressionActionsChain::Step & last_step = chain.steps.back();

        getRootActions(expression_list, only_types, last_step.actions);
        addMultipleArrayJoinAction(last_step.actions, is_left);

        return true;
    }

    return false;
}

500
/// Appends ordinaryJoin action(s) to `actions`.
/// With `only_types`, a single action with a null join is added; otherwise one action
/// is added for every entry of `subqueries_for_sets` that has a join.
void ExpressionAnalyzer::addJoinAction(ExpressionActionsPtr & actions, bool only_types) const
{
    NamesAndTypesList added_columns;
    for (const auto & joined_column : columns_added_by_join)
        added_columns.push_back(joined_column.name_and_type);

    if (only_types)
    {
        actions->add(ExpressionAction::ordinaryJoin(nullptr, analyzedJoin().key_names_left, added_columns));
        return;
    }

    for (auto & entry : subqueries_for_sets)
    {
        if (!entry.second.join)
            continue;

        actions->add(ExpressionAction::ordinaryJoin(entry.second.join, analyzedJoin().key_names_left,
                                                    added_columns));
    }
}

/// Adds the JOIN actions to the last step of the chain and prepares the corresponding
/// entry of `subqueries_for_sets`. Returns false if the query has no JOIN.
/// Throws EXPECTED_ALL_OR_ANY when strictness is not specified for a non-CROSS join
/// and the `join_default_strictness` setting is neither "ANY" nor "ALL".
bool ExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, bool only_types)
{
    assertSelect();

    if (!select_query->join())
        return false;

    initChain(chain, source_columns);
    ExpressionActionsChain::Step & step = chain.steps.back();

    const auto & join_element = static_cast<const ASTTablesInSelectQueryElement &>(*select_query->join());
    auto & join_params = static_cast<ASTTableJoin &>(*join_element.table_join);

    /// Fill in the strictness from the settings when the query does not specify it.
    const bool strictness_is_missing = join_params.strictness == ASTTableJoin::Strictness::Unspecified
        && join_params.kind != ASTTableJoin::Kind::Cross;
    if (strictness_is_missing)
    {
        if (settings.join_default_strictness == "ANY")
            join_params.strictness = ASTTableJoin::Strictness::Any;
        else if (settings.join_default_strictness == "ALL")
            join_params.strictness = ASTTableJoin::Strictness::All;
        else
            throw Exception("Expected ANY or ALL in JOIN section, because setting (join_default_strictness) is empty", DB::ErrorCodes::EXPECTED_ALL_OR_ANY);
    }

    const auto & table_to_join = static_cast<const ASTTableExpression &>(*join_element.table_expression);

    getActionsFromJoinKeys(join_params, only_types, step.actions);

    /// Two JOINs with the same subquery but different USINGs are not supported.
    auto tree_hash = join_element.getTreeHash();
    SubqueryForSet & subquery_for_set = subqueries_for_sets[toString(tree_hash.first) + "_" + toString(tree_hash.second)];

    /// Special case - a table name on the right of JOIN whose table has the type Join (a previously prepared mapping).
    /// TODO This syntax does not support specifying a database name.
    if (table_to_join.database_and_table_name)
    {
        DatabaseAndTableWithAlias database_table(table_to_join.database_and_table_name);
        StoragePtr table = context.tryGetTable(database_table.database, database_table.table);

        if (table)
        {
            if (StorageJoin * storage_join = dynamic_cast<StorageJoin *>(table.get()))
            {
                storage_join->assertCompatible(join_params.kind, join_params.strictness);
                /// TODO Check the set of keys.

                subquery_for_set.join = storage_join->getJoin();
            }
        }
    }

    if (!subquery_for_set.join)
    {
        JoinPtr join = std::make_shared<Join>(analyzedJoin().key_names_right, settings.join_use_nulls,
            settings.size_limits_for_join, join_params.kind, join_params.strictness);

        /** For GLOBAL JOINs (for example, with the push method of executing GLOBAL subqueries):
          * - in the addExternalStorage function, the JOIN (SELECT ...) subquery is replaced with JOIN _data1,
          *   and in the subquery_for_set object this subquery is exposed as source and the temporary table _data1 as the `table`.
          * - this function then sees the expression JOIN _data1.
          */
        if (!subquery_for_set.source)
        {
            ASTPtr table;

            if (table_to_join.subquery)
                table = table_to_join.subquery;
            else if (table_to_join.table_function)
                table = table_to_join.table_function;
            else if (table_to_join.database_and_table_name)
                table = table_to_join.database_and_table_name;

            /// Only the required columns of the joined table are requested, under their original names.
            Names original_columns;
            for (const auto & column : analyzedJoin().columns_from_joined_table)
            {
                if (!required_columns_from_joined_table.count(column.name_and_type.name))
                    continue;
                original_columns.emplace_back(column.original_name);
            }

            auto interpreter = interpretSubquery(table, context, subquery_depth, original_columns);
            subquery_for_set.source = std::make_shared<LazyBlockInputStream>(
                interpreter->getSampleBlock(),
                [interpreter]() mutable { return interpreter->execute().in; });
        }

        /// Alias duplicating columns as qualified.
        for (const auto & column : analyzedJoin().columns_from_joined_table)
        {
            if (!required_columns_from_joined_table.count(column.name_and_type.name))
                continue;
            subquery_for_set.joined_block_aliases.emplace_back(column.original_name, column.name_and_type.name);
        }

        /// Apply the same aliases to the sample block: each aliased column is re-inserted under its new name.
        auto sample_block = subquery_for_set.source->getHeader();
        for (const auto & name_with_alias : subquery_for_set.joined_block_aliases)
        {
            if (!sample_block.has(name_with_alias.first))
                continue;

            auto pos = sample_block.getPositionByName(name_with_alias.first);
            auto column = sample_block.getByPosition(pos);
            sample_block.erase(pos);
            column.name = name_with_alias.second;
            sample_block.insert(std::move(column));
        }

        joined_block_actions->execute(sample_block);

        /// TODO You do not need to set this up when JOIN is only needed on remote servers.
        subquery_for_set.join = join;
        subquery_for_set.join->setSampleBlock(sample_block);
        subquery_for_set.joined_block_actions = joined_block_actions;
    }

    addJoinAction(step.actions, false);

    return true;
}

632 633
/// Adds the PREWHERE actions to the last step of the chain and then appends one more step
/// whose input is the PREWHERE output plus the source columns PREWHERE did not use.
/// Returns false if the query has no PREWHERE.
bool ExpressionAnalyzer::appendPrewhere(
    ExpressionActionsChain & chain, bool only_types, const Names & additional_required_columns)
{
    assertSelect();

    if (!select_query->prewhere_expression)
        return false;

    initChain(chain, source_columns);
    auto & step = chain.getLastStep();
    getRootActions(select_query->prewhere_expression, only_types, step.actions);

    String prewhere_column_name = select_query->prewhere_expression->getColumnName();
    step.required_output.push_back(prewhere_column_name);
    step.can_remove_required_output.push_back(true);

    {
        /// Remove unused source_columns from prewhere actions.
        /// A temporary copy of the actions is finalized to find out which source columns PREWHERE reads.
        auto tmp_actions = std::make_shared<ExpressionActions>(source_columns, context);
        getRootActions(select_query->prewhere_expression, only_types, tmp_actions);
        tmp_actions->finalize({prewhere_column_name});

        auto required_columns = tmp_actions->getRequiredColumns();
        NameSet required_source_columns(required_columns.begin(), required_columns.end());

        /// Add required columns to required output in order not to remove them after prewhere execution.
        /// TODO: add sampling and final execution to common chain.
        for (const auto & column : additional_required_columns)
        {
            if (!required_source_columns.count(column))
                continue;

            step.required_output.push_back(column);
            step.can_remove_required_output.push_back(true);
        }

        /// Required output = everything in the step's sample block except the source columns PREWHERE does not read.
        auto names = step.actions->getSampleBlock().getNames();
        NameSet name_set(names.begin(), names.end());

        for (const auto & column : source_columns)
        {
            if (!required_source_columns.count(column.name))
                name_set.erase(column.name);
        }

        Names required_output(name_set.begin(), name_set.end());
        step.actions->finalize(required_output);
    }

    {
        /// Add empty action with input = {prewhere actions output} + {unused source columns}
        /// Reasons:
        /// 1. Remove source columns which are used only in prewhere actions during prewhere actions execution.
        ///    Example: select A prewhere B > 0. B can be removed at prewhere step.
        /// 2. Store side columns which were calculated during prewhere actions execution if they are used.
        ///    Example: select F(A) prewhere F(A) > 0. F(A) can be saved from prewhere step.
        /// 3. Check if we can remove filter column at prewhere step. If we can, action will store single REMOVE_COLUMN.
        ColumnsWithTypeAndName columns = step.actions->getSampleBlock().getColumnsWithTypeAndName();
        auto required_columns = step.actions->getRequiredColumns();
        NameSet prewhere_input_names(required_columns.begin(), required_columns.end());
        NameSet unused_source_columns;

        for (const auto & column : source_columns)
        {
            if (prewhere_input_names.count(column.name))
                continue;

            columns.emplace_back(column.type, column.name);
            unused_source_columns.emplace(column.name);
        }

        chain.steps.emplace_back(std::make_shared<ExpressionActions>(std::move(columns), context));
        chain.steps.back().additional_input = std::move(unused_source_columns);
    }

    return true;
}
705

706
/// Adds the actions computing the WHERE expression to the last step of the chain.
/// Returns false without touching the chain when the query has no WHERE clause, true otherwise.
bool ExpressionAnalyzer::appendWhere(ExpressionActionsChain & chain, bool only_types)
{
    assertSelect();

    const auto & where_ast = select_query->where_expression;
    if (!where_ast)
        return false;

    initChain(chain, source_columns);
    auto & last_step = chain.steps.back();

    /// The filter column is requested as an output of the step and marked in can_remove_required_output.
    last_step.required_output.push_back(where_ast->getColumnName());
    last_step.can_remove_required_output = {true};

    getRootActions(where_ast, only_types, last_step.actions);
    return true;
}

724
/// Adds the actions computing every GROUP BY key to the last step of the chain
/// and requests each key column as an output of that step.
/// Returns false without touching the chain when the query has no GROUP BY list, true otherwise.
bool ExpressionAnalyzer::appendGroupBy(ExpressionActionsChain & chain, bool only_types)
{
    assertAggregation();

    if (!select_query->group_expression_list)
        return false;

    initChain(chain, source_columns);
    auto & last_step = chain.steps.back();

    /// Iterate over a copy of the children vector, key by key.
    const ASTs group_keys = select_query->group_expression_list->children;
    for (const auto & key : group_keys)
    {
        last_step.required_output.push_back(key->getColumnName());
        getRootActions(key, only_types, last_step.actions);
    }

    return true;
}

744
/// Requests the argument columns of all aggregate functions as outputs of the last step of the chain
/// and adds the actions needed before aggregation for the SELECT list and, when present,
/// for the HAVING and ORDER BY expressions.
void ExpressionAnalyzer::appendAggregateFunctionsArguments(ExpressionActionsChain & chain, bool only_types)
{
    assertAggregation();

    initChain(chain, source_columns);
    auto & last_step = chain.steps.back();

    for (const auto & description : aggregate_descriptions)
        for (const auto & argument_name : description.argument_names)
            last_step.required_output.push_back(argument_name);

    getActionsBeforeAggregation(select_query->select_expression_list, last_step.actions, only_types);

    const auto & having_ast = select_query->having_expression;
    if (having_ast)
        getActionsBeforeAggregation(having_ast, last_step.actions, only_types);

    const auto & order_by_ast = select_query->order_expression_list;
    if (order_by_ast)
        getActionsBeforeAggregation(order_by_ast, last_step.actions, only_types);
}

768
/// Adds the actions computing the HAVING expression to the last step of the chain
/// (the chain is initialized from aggregated_columns).
/// Returns false without touching the chain when the query has no HAVING clause, true otherwise.
bool ExpressionAnalyzer::appendHaving(ExpressionActionsChain & chain, bool only_types)
{
    assertAggregation();

    const auto & having_ast = select_query->having_expression;
    if (!having_ast)
        return false;

    initChain(chain, aggregated_columns);
    auto & last_step = chain.steps.back();

    last_step.required_output.push_back(having_ast->getColumnName());
    getRootActions(having_ast, only_types, last_step.actions);

    return true;
}

784
/// Adds the actions computing the SELECT list to the last step of the chain
/// and requests the column of every SELECT element as an output of that step.
void ExpressionAnalyzer::appendSelect(ExpressionActionsChain & chain, bool only_types)
{
    assertSelect();

    initChain(chain, aggregated_columns);
    auto & last_step = chain.steps.back();

    const auto & select_list = select_query->select_expression_list;
    getRootActions(select_list, only_types, last_step.actions);

    for (const auto & element : select_list->children)
        last_step.required_output.push_back(element->getColumnName());
}
796

797
/// Adds the actions computing the ORDER BY expressions to the last step of the chain
/// and requests the column of every sort expression as an output of that step.
/// Returns false without touching the chain when the query has no ORDER BY list, true otherwise.
/// Throws UNKNOWN_TYPE_OF_AST_NODE if a child of the list is not an ASTOrderByElement with at least one child.
bool ExpressionAnalyzer::appendOrderBy(ExpressionActionsChain & chain, bool only_types)
{
    assertSelect();

    const auto & order_list = select_query->order_expression_list;
    if (!order_list)
        return false;

    initChain(chain, aggregated_columns);
    auto & last_step = chain.steps.back();

    getRootActions(order_list, only_types, last_step.actions);

    for (const auto & element : order_list->children)
    {
        ASTOrderByElement * order_element = typeid_cast<ASTOrderByElement *>(element.get());
        if (!order_element || order_element->children.empty())
            throw Exception("Bad order expression AST", ErrorCodes::UNKNOWN_TYPE_OF_AST_NODE);

        /// The first child of the element is the expression to sort by.
        const ASTPtr & sort_expression = order_element->children.at(0);
        last_step.required_output.push_back(sort_expression->getColumnName());
    }

    return true;
}

822 823 824 825 826 827 828 829 830 831
/// Adds the actions computing the LIMIT BY expressions to the last step of the chain
/// and requests the column of every LIMIT BY element as an output of that step.
/// Returns false without touching the chain when the query has no LIMIT BY list, true otherwise.
bool ExpressionAnalyzer::appendLimitBy(ExpressionActionsChain & chain, bool only_types)
{
    assertSelect();

    const auto & limit_by_list = select_query->limit_by_expression_list;
    if (!limit_by_list)
        return false;

    initChain(chain, aggregated_columns);
    auto & last_step = chain.steps.back();

    getRootActions(limit_by_list, only_types, last_step.actions);

    for (const auto & element : limit_by_list->children)
        last_step.required_output.push_back(element->getColumnName());

    return true;
}

A
Alexey Milovidov 已提交
840
/// Adds a projection action to the last step of the chain that maps every SELECT element
/// from its column name to its alias (or column name when there is no alias).
/// When required_result_columns is not empty, only elements whose result name is in it are kept.
void ExpressionAnalyzer::appendProjectResult(ExpressionActionsChain & chain) const
{
    assertSelect();

    initChain(chain, aggregated_columns);
    auto & last_step = chain.steps.back();

    NamesWithAliases projection;

    for (const auto & element : select_query->select_expression_list->children)
    {
        const String result_name = element->getAliasOrColumnName();

        const bool is_wanted = required_result_columns.empty() || required_result_columns.count(result_name) != 0;
        if (!is_wanted)
            continue;

        projection.emplace_back(element->getColumnName(), result_name);
        last_step.required_output.push_back(result_name);
    }

    last_step.actions->add(ExpressionAction::project(projection));
}


864
/// Adds the actions computing an arbitrary expression to the last step of the chain
/// (initialized from source_columns) and requests its column as an output of that step.
void ExpressionAnalyzer::appendExpression(ExpressionActionsChain & chain, const ASTPtr & expr, bool only_types)
{
    initChain(chain, source_columns);
    auto & last_step = chain.steps.back();

    getRootActions(expr, only_types, last_step.actions);

    last_step.required_output.push_back(expr->getColumnName());
}


873
void ExpressionAnalyzer::getActionsBeforeAggregation(const ASTPtr & ast, ExpressionActionsPtr & actions, bool no_subqueries)
874
{
875
    ASTFunction * node = typeid_cast<ASTFunction *>(ast.get());
876

877
    if (node && AggregateFunctionFactory::instance().isAggregateFunctionName(node->name))
878
        for (auto & argument : node->arguments->children)
879
            getRootActions(argument, no_subqueries, actions);
880 881 882
    else
        for (auto & child : ast->children)
            getActionsBeforeAggregation(child, actions, no_subqueries);
883 884 885
}


886
/// Builds the actions computing the analyzed expression (or every element of an expression list).
///  - add_aliases: results are named by alias (or column name when there is no alias);
///    otherwise they keep their column names and no aliasing/projection action is added.
///  - project_result (only meaningful together with add_aliases): a projection action is added
///    instead of an add-aliases action.
/// Unless both flags are set, the names of all source columns are also kept in the result.
ExpressionActionsPtr ExpressionAnalyzer::getActions(bool add_aliases, bool project_result)
{
    ExpressionActionsPtr actions = std::make_shared<ExpressionActions>(source_columns, context);

    /// A list query contributes each of its children; any other query is treated as a single expression.
    const auto * expression_list = typeid_cast<const ASTExpressionList *>(query.get());
    const ASTs expressions = expression_list ? expression_list->children : ASTs(1, query);

    NamesWithAliases result_columns;
    Names result_names;

    for (const auto & expression : expressions)
    {
        const std::string column_name = expression->getColumnName();
        const std::string output_name = add_aliases ? expression->getAliasOrColumnName() : column_name;

        result_columns.emplace_back(column_name, output_name);
        result_names.push_back(output_name);

        getRootActions(expression, false, actions);
    }

    const bool projecting = add_aliases && project_result;

    if (projecting)
        actions->add(ExpressionAction::project(result_columns));
    else if (add_aliases)
        actions->add(ExpressionAction::addAliases(result_columns));

    if (!projecting)
    {
        /// We will not delete the original columns.
        for (const auto & source_column : source_columns)
            result_names.push_back(source_column.name);
    }

    actions->finalize(result_names);
    return actions;
}


/// Builds actions for the analyzed query starting from an empty list of input columns.
/// NOTE(review): the two `true` flags passed to getRootActions presumably mean
/// "no subqueries" and "only constants" - confirm against its declaration.
ExpressionActionsPtr ExpressionAnalyzer::getConstActions()
{
    ExpressionActionsPtr const_actions = std::make_shared<ExpressionActions>(NamesAndTypesList(), context);
    getRootActions(query, true, const_actions, true);

    return const_actions;
}

941
/// Reports the aggregation description of the query:
///  - key_names receives the names of the aggregation keys (appended; existing content is kept);
///  - aggregates is overwritten with a copy of the aggregate function descriptions.
void ExpressionAnalyzer::getAggregateInfo(Names & key_names, AggregateDescriptions & aggregates) const
{
    for (const auto & key : aggregation_keys)
        key_names.push_back(key.name);

    aggregates = aggregate_descriptions;
}

949 950 951 952 953 954 955 956 957
/// Strips the leading qualifier, i.e. everything up to and including the first dot:
/// db.table.column -> table.column / table.column -> column.
/// A name without a dot is returned unchanged.
static String cropDatabaseOrTableName(const String & name)
{
    const size_t first_dot = name.find('.');
    if (first_dot == std::string::npos)
        return name;

    return name.substr(first_dot + 1);
}

958
void ExpressionAnalyzer::collectUsedColumns()
959
{
F
f1yegor 已提交
960 961 962
    /** Calculate which columns are required to execute the expression.
      * Then, delete all other columns from the list of available columns.
      * After execution, columns will only contain the list of columns needed to read from the table.
963 964
      */

965 966
    RequiredSourceColumnsVisitor::Data columns_context;
    RequiredSourceColumnsVisitor(columns_context).visit(query);
967

968
    NameSet required = columns_context.requiredColumns();
969

970 971 972 973 974 975 976 977 978 979 980 981 982
#if 0
    std::cerr << "Query: " << query << std::endl;
    std::cerr << "CTX: " << columns_context << std::endl;
    std::cerr << "source_columns: ";
    for (const auto & name : source_columns)
        std::cerr << "'" << name.name << "' ";
    std::cerr << "required: ";
    for (const auto & name : required)
        std::cerr << "'" << name << "' ";
    std::cerr << std::endl;
#endif

    if (columns_context.has_table_join)
983
    {
984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008
        const AnalyzedJoin & analyzed_join = analyzedJoin();
#if 0
        std::cerr << "key_names_left: ";
        for (const auto & name : analyzed_join.key_names_left)
            std::cerr << "'" << name << "' ";
        std::cerr << "key_names_right: ";
        for (const auto & name : analyzed_join.key_names_right)
            std::cerr << "'" << name << "' ";
        std::cerr << "columns_from_joined_table: ";
        for (const auto & column : analyzed_join.columns_from_joined_table)
            std::cerr << "'" << column.name_and_type.name << '/' << column.original_name << "' ";
        std::cerr << "available_joined_columns: ";
        for (const auto & column : analyzed_join.available_joined_columns)
            std::cerr << "'" << column.name_and_type.name << '/' << column.original_name << "' ";
        std::cerr << std::endl;
#endif
        NameSet avaliable_columns;
        for (const auto & name : source_columns)
            avaliable_columns.insert(name.name);

        /** You also need to ignore the identifiers of the columns that are obtained by JOIN.
        * (Do not assume that they are required for reading from the "left" table).
        */
        columns_added_by_join.clear();
        for (const auto & joined_column : analyzed_join.available_joined_columns)
1009
        {
1010 1011
            auto & name = joined_column.name_and_type.name;
            if (required.count(name) && !avaliable_columns.count(name))
1012
            {
1013 1014 1015
                columns_added_by_join.push_back(joined_column);
                required.erase(name);
            }
1016 1017
        }

1018 1019 1020 1021 1022 1023 1024
        /// @fix filter required columns according to misqualified names in JOIN ON
        if (columns_context.has_table_join &&
            columns_context.tables.size() >= 2 &&
            columns_context.tables[1].join &&
            columns_context.tables[1].join->on_expression)
        {
            NameSet fixed_required;
1025

1026 1027 1028 1029 1030
            for (const auto & req_name : required)
            {
                bool collated = false;
                String cropped_name = req_name;
                static const constexpr size_t max_column_prefix = 2;
1031

1032 1033 1034 1035 1036 1037 1038 1039 1040 1041
                for (size_t i = 0; i < max_column_prefix && !collated; ++i)
                {
                    cropped_name = cropDatabaseOrTableName(cropped_name);

                    if (avaliable_columns.count(cropped_name))
                    {
                        fixed_required.insert(cropped_name);
                        collated = true;
                        break;
                    }
1042

1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054
                    for (const auto & joined_column : analyzed_join.available_joined_columns)
                    {
                        auto & name = joined_column.name_and_type.name;

                        if (cropped_name == name)
                        {
                            columns_added_by_join.push_back(joined_column);
                            collated = true;
                            break;
                        }
                    }
                }
1055

1056 1057 1058
                if (!collated)
                    fixed_required.insert(req_name);
            }
1059

1060 1061
            required.swap(fixed_required);
        }
N
Nikolai Kochetov 已提交
1062

1063 1064
        joined_block_actions = analyzed_join.createJoinedBlockActions(columns_added_by_join, select_query, context);
        required_columns_from_joined_table = analyzed_join.getRequiredColumnsFromJoinedTable(columns_added_by_join, joined_block_actions);
1065
    }
N
Nikolai Kochetov 已提交
1066

1067 1068 1069 1070 1071 1072
    if (columns_context.has_array_join)
    {
        /// Insert the columns required for the ARRAY JOIN calculation into the required columns list.
        NameSet array_join_sources;
        for (const auto & result_source : syntax->array_join_result_to_source)
            array_join_sources.insert(result_source.second);
1073

1074 1075 1076 1077
        for (const auto & column_name_type : source_columns)
            if (array_join_sources.count(column_name_type.name))
                required.insert(column_name_type.name);
    }
1078

F
f1yegor 已提交
1079
    /// You need to read at least one column to find the number of rows.
Z
zhang2014 已提交
1080
    if (select_query && required.empty())
1081
        required.insert(ExpressionActions::getSmallestColumn(source_columns));
1082

1083
    NameSet unknown_required_source_columns = required;
1084

1085
    for (NamesAndTypesList::iterator it = source_columns.begin(); it != source_columns.end();)
1086
    {
1087
        unknown_required_source_columns.erase(it->name);
1088 1089

        if (!required.count(it->name))
1090
            source_columns.erase(it++);
1091 1092
        else
            ++it;
1093 1094
    }

1095 1096
    /// If there are virtual columns among the unknown columns. Remove them from the list of unknown and add
    /// in columns list, so that when further processing they are also considered.
1097 1098
    if (storage)
    {
1099
        for (auto it = unknown_required_source_columns.begin(); it != unknown_required_source_columns.end();)
1100 1101 1102
        {
            if (storage->hasColumn(*it))
            {
1103 1104
                source_columns.push_back(storage->getColumn(*it));
                unknown_required_source_columns.erase(it++);
1105 1106 1107 1108 1109
            }
            else
                ++it;
        }
    }
1110 1111

    if (!unknown_required_source_columns.empty())
1112 1113 1114 1115 1116 1117 1118
    {
        std::stringstream ss;
        ss << columns_context;
        ss << "source_columns: ";
        for (const auto & name : source_columns)
            ss << "'" << name.name << "' ";

1119
        throw Exception("Unknown identifier: " + *unknown_required_source_columns.begin()
1120 1121 1122
            + (select_query && !select_query->tables ? ". Note that there is no tables (FROM clause) in your query" : "")
            + ", context: " + ss.str(), ErrorCodes::UNKNOWN_IDENTIFIER);
    }
1123 1124
}

1125

1126
}