ExpressionAnalyzer.cpp 128.0 KB
Newer Older
1
#include <Poco/Util/Application.h>
A
Alexey Milovidov 已提交
2
#include <Poco/String.h>
3

4
#include <DataTypes/FieldToDataType.h>
5

6 7 8 9
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTAsterisk.h>
10
#include <Parsers/ASTQualifiedAsterisk.h>
11 12
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTSelectQuery.h>
13
#include <Parsers/ASTSelectWithUnionQuery.h>
14 15
#include <Parsers/ASTSubquery.h>
#include <Parsers/ASTOrderByElement.h>
16
#include <Parsers/formatAST.h>
17

18
#include <DataTypes/DataTypeSet.h>
19
#include <DataTypes/DataTypeNullable.h>
20
#include <DataTypes/NestedUtils.h>
21
#include <DataTypes/DataTypesNumber.h>
V
Vadim 已提交
22

23
#include <Columns/ColumnSet.h>
24
#include <Columns/ColumnConst.h>
25

26
#include <Interpreters/InterpreterSelectWithUnionQuery.h>
27 28 29 30
#include <Interpreters/ExpressionAnalyzer.h>
#include <Interpreters/ExpressionActions.h>
#include <Interpreters/InJoinSubqueriesPreprocessor.h>
#include <Interpreters/LogicalExpressionsOptimizer.h>
31
#include <Interpreters/PredicateExpressionsOptimizer.h>
32
#include <Interpreters/ExternalDictionaries.h>
A
Alexey Milovidov 已提交
33
#include <Interpreters/convertFieldToType.h>
34 35
#include <Interpreters/Set.h>
#include <Interpreters/Join.h>
36 37
#include <Interpreters/ProjectionManipulation.h>
#include <Interpreters/evaluateConstantExpression.h>
38

39
#include <AggregateFunctions/AggregateFunctionFactory.h>
40
#include <AggregateFunctions/parseAggregateFunctionParameters.h>
41

42 43 44 45
#include <Storages/StorageDistributed.h>
#include <Storages/StorageMemory.h>
#include <Storages/StorageSet.h>
#include <Storages/StorageJoin.h>
46

47 48
#include <DataStreams/LazyBlockInputStream.h>
#include <DataStreams/copyData.h>
49

50
#include <Dictionaries/IDictionary.h>
51

52
#include <Common/typeid_cast.h>
53
#include <Common/StringUtils/StringUtils.h>
54

55
#include <Parsers/formatAST.h>
56

57 58
#include <Functions/FunctionFactory.h>
#include <Functions/IFunction.h>
A
Andrey Mironov 已提交
59

60
#include <ext/range.h>
61
#include <DataTypes/DataTypeFactory.h>
62 63
#include <DataTypes/DataTypeFunction.h>
#include <Functions/FunctionsMiscellaneous.h>
64
#include <DataTypes/DataTypeTuple.h>
65
#include <Parsers/queryToString.h>
66 67
#include <Parsers/ExpressionListParsers.h>
#include <Parsers/parseQuery.h>
N
Nikolai Kochetov 已提交
68
#include <Parsers/queryToString.h>
69
#include <Interpreters/evaluateQualified.h>
70 71
#include <Interpreters/QueryNormalizer.h>
#include <Interpreters/getQueryAliases.h>
72
#include <DataTypes/DataTypeWithDictionary.h>
73

74 75 76 77

namespace DB
{

78 79
/// Error codes referenced in this translation unit.
/// These are declarations only (`extern`); the constants themselves are defined elsewhere.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int MULTIPLE_EXPRESSIONS_FOR_ALIAS;
    extern const int UNKNOWN_IDENTIFIER;
    extern const int CYCLIC_ALIASES;
    extern const int INCORRECT_RESULT_OF_SCALAR_SUBQUERY;
    extern const int TOO_MANY_ROWS;
    extern const int NOT_FOUND_COLUMN_IN_BLOCK;
    extern const int INCORRECT_ELEMENT_OF_SET;
    extern const int ALIAS_REQUIRED;
    extern const int EMPTY_NESTED_TABLE;
    extern const int NOT_AN_AGGREGATE;
    extern const int UNEXPECTED_EXPRESSION;
    extern const int DUPLICATE_COLUMN;
    extern const int FUNCTION_CANNOT_HAVE_PARAMETERS;
    extern const int ILLEGAL_AGGREGATION;
    extern const int SUPPORT_IS_DISABLED;
    extern const int TOO_DEEP_AST;
    extern const int TOO_BIG_AST;
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
    extern const int CONDITIONAL_TREE_PARENT_NOT_FOUND;
    extern const int TYPE_MISMATCH;
    extern const int INVALID_JOIN_ON_EXPRESSION;
    extern const int EXPECTED_ALL_OR_ANY;

    /// Both codes below are used by functions in this file; declare them explicitly
    /// instead of relying on declarations that happen to come from included headers.
    /// Repeating an `extern` declaration is harmless.
    extern const int LOGICAL_ERROR;
    extern const int UNKNOWN_ELEMENT_IN_AST;
}

105

106 107
/** Calls to these functions in the GROUP BY statement would be
  * replaced by their immediate argument.
108
  */
A
Alexey Milovidov 已提交
109 110
const std::unordered_set<String> injective_function_names
{
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
    "negate",
    "bitNot",
    "reverse",
    "reverseUTF8",
    "toString",
    "toFixedString",
    "IPv4NumToString",
    "IPv4StringToNum",
    "hex",
    "unhex",
    "bitmaskToList",
    "bitmaskToArray",
    "tuple",
    "regionToName",
    "concatAssumeInjective",
126 127
};

128 129
const std::unordered_set<String> possibly_injective_function_names
{
130 131 132 133 134 135 136 137 138 139 140 141 142
    "dictGetString",
    "dictGetUInt8",
    "dictGetUInt16",
    "dictGetUInt32",
    "dictGetUInt64",
    "dictGetInt8",
    "dictGetInt16",
    "dictGetInt32",
    "dictGetInt64",
    "dictGetFloat32",
    "dictGetFloat64",
    "dictGetDate",
    "dictGetDateTime"
143 144
};

A
Merge  
Alexey Arno 已提交
145 146 147 148
namespace
{

bool functionIsInOperator(const String & name)
149
{
150
    return name == "in" || name == "notIn";
151 152
}

A
Merge  
Alexey Arno 已提交
153
bool functionIsInOrGlobalInOperator(const String & name)
154
{
155
    return name == "in" || name == "notIn" || name == "globalIn" || name == "globalNotIn";
156 157
}

158
void removeDuplicateColumns(NamesAndTypesList & columns)
159
{
160
    std::set<String> names;
161 162 163 164 165 166 167
    for (auto it = columns.begin(); it != columns.end();)
    {
        if (names.emplace(it->name).second)
            ++it;
        else
            columns.erase(it++);
    }
168 169
}

A
Merge  
Alexey Arno 已提交
170
}
171

172
/** Stores the arguments and immediately runs the whole chain of query rewrites and analyses.
  *
  * If no storage was passed and the query is a SELECT from a plain table (not a subquery and
  * not a table function), the storage is looked up in the context by database and table name.
  * If a storage is known and no source columns were passed, the physical columns of the storage
  * are used as source columns; otherwise the passed source columns are deduplicated by name.
  *
  * The order of the steps below matters; see the comments at the individual calls.
  */
ExpressionAnalyzer::ExpressionAnalyzer(
    const ASTPtr & query_,
    const Context & context_,
    const StoragePtr & storage_,
    const NamesAndTypesList & source_columns_,
    const Names & required_result_columns_,
    size_t subquery_depth_,
    bool do_global_,
    const SubqueriesForSets & subqueries_for_set_)
    : query(query_), context(context_), settings(context.getSettings()),
    subquery_depth(subquery_depth_),
    source_columns(source_columns_), required_result_columns(required_result_columns_),
    storage(storage_),
    do_global(do_global_), subqueries_for_sets(subqueries_for_set_)
{
    /// Null if the query is not a SELECT.
    select_query = typeid_cast<ASTSelectQuery *>(query.get());

    if (!storage && select_query)
    {
        auto select_database = select_query->database();
        auto select_table = select_query->table();

        /// Only a plain table name can be resolved to a storage here.
        if (select_table
            && !typeid_cast<const ASTSelectWithUnionQuery *>(select_table.get())
            && !typeid_cast<const ASTFunction *>(select_table.get()))
        {
            String database = select_database
                ? typeid_cast<const ASTIdentifier &>(*select_database).name
                : "";
            const String & table = typeid_cast<const ASTIdentifier &>(*select_table).name;
            storage = context.tryGetTable(database, table);
        }
    }

    if (storage && source_columns.empty())
    {
        /// `source_columns` is known to be empty here, so taking over the physical columns is all
        /// that is needed. (The former "append and deduplicate" branch was unreachable under this condition.)
        auto physical_columns = storage->getColumns().getAllPhysical();
        source_columns.swap(physical_columns);
    }
    else
        removeDuplicateColumns(source_columns);

    addAliasColumns();

    translateQualifiedNames();

    /// Depending on the user's profile, check for the execution rights of
    /// distributed subqueries inside the IN or JOIN sections and process these subqueries.
    InJoinSubqueriesPreprocessor(context).process(select_query);

    /// Optimizes logical expressions.
    LogicalExpressionsOptimizer(select_query, settings).perform();

    /// Creates a dictionary `aliases`: alias -> ASTPtr
    getQueryAliases(query, aliases);

    /// Common subexpression elimination. Rewrite rules.
    normalizeTree();

    /// Remove unneeded columns according to 'required_result_columns'.
    /// Leave all selected columns in case of DISTINCT; columns that contain arrayJoin function inside.
    /// Must be after 'normalizeTree' (after expanding aliases, for aliases not get lost)
    ///  and before 'executeScalarSubqueries', 'analyzeAggregation', etc. to avoid excessive calculations.
    removeUnneededColumnsFromSelectClause();

    /// Executing scalar subqueries - replacing them with constant values.
    executeScalarSubqueries();

    /// Optimize `if` with a constant condition after constants were substituted instead of scalar subqueries.
    optimizeIfWithConstantCondition();

    /// GROUP BY injective function elimination.
    optimizeGroupBy();

    /// Remove duplicate items from ORDER BY.
    optimizeOrderBy();

    /// Remove duplicated elements from LIMIT BY clause.
    optimizeLimitBy();

    /// Remove duplicated columns from USING(...).
    optimizeUsing();

    /// array_join_alias_to_name, array_join_result_to_source.
    getArrayJoinedColumns();

    /// Push the predicate expression down to the subqueries.
    rewrite_subqueries = PredicateExpressionsOptimizer(select_query, settings, context).optimize();

    /// Delete the unnecessary from `source_columns` list. Create `unknown_required_source_columns`. Form `columns_added_by_join`.
    collectUsedColumns();

    /// external_tables, subqueries_for_sets for global subqueries.
    /// Replaces global subqueries with the generated names of temporary tables that will be sent to remote servers.
    initGlobalSubqueriesAndExternalTables();

    /// has_aggregation, aggregation_keys, aggregate_descriptions, aggregated_columns.
    /// This analysis should be performed after processing global subqueries, because otherwise,
    /// if an aggregate function contains a global subquery, `analyzeAggregation` would save
    /// in `aggregate_descriptions` the parameters of this aggregate function, including the
    /// global subquery. A later call to `initGlobalSubqueriesAndExternalTables` would then replace
    /// the global subquery with a temporary table, leaving `aggregate_descriptions` with
    /// out-of-date information, which leads to an error when the query is executed.
    analyzeAggregation();
}

284
static std::vector<ASTTableExpression> getTableExpressions(const ASTPtr & query)
285
{
286
    ASTSelectQuery * select_query = typeid_cast<ASTSelectQuery *>(query.get());
287

288
    std::vector<ASTTableExpression> tables_expression;
289

290 291 292 293 294
    if (select_query && select_query->tables)
    {
        for (const auto & element : select_query->tables->children)
        {
            ASTTablesInSelectQueryElement & select_element = static_cast<ASTTablesInSelectQueryElement &>(*element);
295

296 297 298 299
            if (select_element.table_expression)
                tables_expression.emplace_back(static_cast<ASTTableExpression &>(*select_element.table_expression));
        }
    }
300

301 302
    return tables_expression;
}
303

304 305 306 307 308 309 310 311 312 313
void ExpressionAnalyzer::translateQualifiedNames()
{
    if (!select_query || !select_query->tables || select_query->tables->children.empty())
        return;

    std::vector<DatabaseAndTableWithAlias> tables;
    std::vector<ASTTableExpression> tables_expression = getTableExpressions(query);

    for (const auto & table_expression : tables_expression)
        tables.emplace_back(getTableNameWithAliasFromTableExpression(table_expression, context));
314

315
    translateQualifiedNamesImpl(query, tables);
316 317
}

318
/** Recursive worker of translateQualifiedNames. Dispatches on the type of `ast`:
  *  - identifier: strips the qualifiers that refer to one of `tables`;
  *  - qualified asterisk: only validates that the qualifier matches one of `tables`;
  *  - JOIN: descends only into the USING list;
  *  - anything else: descends into children, except table expressions and subqueries.
  */
void ExpressionAnalyzer::translateQualifiedNamesImpl(ASTPtr & ast, const std::vector<DatabaseAndTableWithAlias> & tables)
{
    if (auto * identifier = typeid_cast<ASTIdentifier *>(ast.get()))
    {
        if (!identifier->general())
            return;

        /// Select first table name with max number of qualifiers which can be stripped.
        size_t best_num_qualifiers = 0;
        size_t best_table_index = 0;

        for (size_t index = 0; index < tables.size(); ++index)
        {
            auto candidate = getNumComponentsToStripInOrderToTranslateQualifiedName(*identifier, tables[index]);

            if (candidate > best_num_qualifiers)
            {
                best_num_qualifiers = candidate;
                best_table_index = index;
            }
        }

        stripIdentifier(ast, best_num_qualifiers);

        /// If the column belongs to a joined table (any table but the first one)
        /// and is present in source columns, change its name to the qualified one.
        if (best_table_index != 0 && source_columns.contains(ast->getColumnName()))
            tables[best_table_index].makeQualifiedName(ast);

        return;
    }

    if (typeid_cast<ASTQualifiedAsterisk *>(ast.get()))
    {
        if (ast->children.size() != 1)
            throw Exception("Logical error: qualified asterisk must have exactly one child", ErrorCodes::LOGICAL_ERROR);

        ASTIdentifier * ident = typeid_cast<ASTIdentifier *>(ast->children[0].get());
        if (!ident)
            throw Exception("Logical error: qualified asterisk must have identifier as its child", ErrorCodes::LOGICAL_ERROR);

        size_t num_components = ident->children.size();
        if (num_components > 2)
            throw Exception("Qualified asterisk cannot have more than two qualifiers", ErrorCodes::UNKNOWN_ELEMENT_IN_AST);

        for (const auto & table_names : tables)
        {
            /// database.table.*
            const bool matches_database_and_table = num_components == 2
                && !table_names.database.empty()
                && static_cast<const ASTIdentifier &>(*ident->children[0]).name == table_names.database
                && static_cast<const ASTIdentifier &>(*ident->children[1]).name == table_names.table;

            /// table.* or alias.*
            const bool matches_table_or_alias = num_components == 0
                && ((!table_names.table.empty() && ident->name == table_names.table)
                    || (!table_names.alias.empty() && ident->name == table_names.alias));

            if (matches_database_and_table || matches_table_or_alias)
                return;
        }

        throw Exception("Unknown qualified identifier: " + ident->getAliasOrColumnName(), ErrorCodes::UNKNOWN_IDENTIFIER);
    }

    if (auto * join = typeid_cast<ASTTableJoin *>(ast.get()))
    {
        /// Don't translate on_expression here in order to resolve equation parts later.
        if (join->using_expression_list)
            translateQualifiedNamesImpl(join->using_expression_list, tables);

        return;
    }

    /// If the WHERE, PREWHERE or HAVING clause consists of a single qualified column, the reference
    /// must be translated not only in children, but also in the corresponding member of the SELECT query.
    if (ASTSelectQuery * select = typeid_cast<ASTSelectQuery *>(ast.get()))
    {
        if (select->prewhere_expression)
            translateQualifiedNamesImpl(select->prewhere_expression, tables);
        if (select->where_expression)
            translateQualifiedNamesImpl(select->where_expression, tables);
        if (select->having_expression)
            translateQualifiedNamesImpl(select->having_expression, tables);
    }

    for (auto & child : ast->children)
    {
        /// Do not go to FROM, JOIN, subqueries.
        if (typeid_cast<const ASTTableExpression *>(child.get())
            || typeid_cast<const ASTSelectWithUnionQuery *>(child.get()))
            continue;

        translateQualifiedNamesImpl(child, tables);
    }
}

408 409
void ExpressionAnalyzer::optimizeIfWithConstantCondition()
{
410
    optimizeIfWithConstantConditionImpl(query);
411 412 413 414
}

bool ExpressionAnalyzer::tryExtractConstValueFromCondition(const ASTPtr & condition, bool & value) const
{
415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444
    /// numeric constant in condition
    if (const ASTLiteral * literal = typeid_cast<ASTLiteral *>(condition.get()))
    {
        if (literal->value.getType() == Field::Types::Int64 ||
            literal->value.getType() == Field::Types::UInt64)
        {
            value = literal->value.get<Int64>();
            return true;
        }
    }

    /// cast of numeric constant in condition to UInt8
    if (const ASTFunction * function = typeid_cast<ASTFunction * >(condition.get()))
    {
        if (function->name == "CAST")
        {
            if (ASTExpressionList * expr_list = typeid_cast<ASTExpressionList *>(function->arguments.get()))
            {
                const ASTPtr & type_ast = expr_list->children.at(1);
                if (const ASTLiteral * type_literal = typeid_cast<ASTLiteral *>(type_ast.get()))
                {
                    if (type_literal->value.getType() == Field::Types::String &&
                        type_literal->value.get<std::string>() == "UInt8")
                        return tryExtractConstValueFromCondition(expr_list->children.at(0), value);
                }
            }
        }
    }

    return false;
445 446
}

447
/** Recursively walks the children of `current_ast` and replaces every `if(cond, then, else)` call
  * whose condition is recognized as a constant (see tryExtractConstValueFromCondition)
  * with the selected branch. The alias of the replaced `if` is carried over to the replacement,
  * and the entry in `aliases` is re-pointed if it referred to the replaced node.
  *
  * Only children are examined: `current_ast` itself is never replaced.
  * Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH if `if` does not have exactly three arguments,
  * and LOGICAL_ERROR if its arguments are not an expression list.
  */
void ExpressionAnalyzer::optimizeIfWithConstantConditionImpl(ASTPtr & current_ast)
{
    if (!current_ast)
        return;

    for (ASTPtr & child : current_ast->children)
    {
        ASTFunction * function_node = typeid_cast<ASTFunction *>(child.get());
        if (!function_node || function_node->name != "if")
        {
            optimizeIfWithConstantConditionImpl(child);
            continue;
        }

        /// Process nested `if`s inside the arguments first.
        optimizeIfWithConstantConditionImpl(function_node->arguments);
        ASTExpressionList * args = typeid_cast<ASTExpressionList *>(function_node->arguments.get());

        /// typeid_cast returns nullptr on type mismatch; report it instead of dereferencing a null pointer.
        if (!args)
            throw Exception("Logical error: arguments of function 'if' must be an expression list", ErrorCodes::LOGICAL_ERROR);

        if (args->children.size() != 3)
            throw Exception("Wrong number of arguments for function 'if' (" + toString(args->children.size()) + " instead of 3)",
                ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);

        /// Copies of the pointers: they keep the branches alive after `child` is reassigned below.
        ASTPtr condition_expr = args->children[0];
        ASTPtr then_expr = args->children[1];
        ASTPtr else_expr = args->children[2];

        bool condition = false;
        if (!tryExtractConstValueFromCondition(condition_expr, condition))
            continue;

        ASTPtr replace_ast = condition ? then_expr : else_expr;
        ASTPtr child_copy = child;
        String replace_alias = replace_ast->tryGetAlias();
        String if_alias = child->tryGetAlias();

        if (replace_alias.empty())
        {
            replace_ast->setAlias(if_alias);
            child = replace_ast;
        }
        else
        {
            /// Only copy of one node is required here.
            /// But IAST has only method for deep copy of subtree.
            /// This can be a reason of performance degradation in case of deep queries.
            ASTPtr replace_ast_deep_copy = replace_ast->clone();
            replace_ast_deep_copy->setAlias(if_alias);
            child = replace_ast_deep_copy;
        }

        if (!if_alias.empty())
        {
            /// Keep `aliases` consistent: re-point the alias only if it referred to the replaced `if` node.
            auto alias_it = aliases.find(if_alias);
            if (alias_it != aliases.end() && alias_it->second.get() == child_copy.get())
                alias_it->second = child;
        }
    }
}
504 505 506

void ExpressionAnalyzer::analyzeAggregation()
{
F
f1yegor 已提交
507 508 509
    /** Find aggregation keys (aggregation_keys), information about aggregate functions (aggregate_descriptions),
     *  as well as a set of columns obtained after the aggregation, if any,
     *  or after all the actions that are usually performed before aggregation (aggregated_columns).
510
     *
F
f1yegor 已提交
511
     * Everything below (compiling temporary ExpressionActions) - only for the purpose of query analysis (type output).
512 513 514 515 516
     */

    if (select_query && (select_query->group_expression_list || select_query->having_expression))
        has_aggregation = true;

517
    ExpressionActionsPtr temp_actions = std::make_shared<ExpressionActions>(source_columns, context);
518 519 520 521 522

    if (select_query && select_query->array_join_expression_list())
    {
        getRootActions(select_query->array_join_expression_list(), true, false, temp_actions);
        addMultipleArrayJoinAction(temp_actions);
523
        array_join_columns = temp_actions->getSampleBlock().getNamesAndTypesList();
524 525 526 527 528 529 530
    }

    if (select_query)
    {
        const ASTTablesInSelectQueryElement * join = select_query->join();
        if (join)
        {
531 532 533 534 535 536
            const auto table_join = static_cast<const ASTTableJoin &>(*join->table_join);
            if (table_join.using_expression_list)
                getRootActions(table_join.using_expression_list, true, false, temp_actions);
            if (table_join.on_expression)
                for (const auto & key_ast : analyzed_join.key_asts_left)
                    getRootActions(key_ast, true, false, temp_actions);
537 538 539 540 541

            addJoinAction(temp_actions, true);
        }
    }

542
    getAggregates(query, temp_actions);
543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566

    if (has_aggregation)
    {
        assertSelect();

        /// Find out aggregation keys.
        if (select_query->group_expression_list)
        {
            NameSet unique_keys;
            ASTs & group_asts = select_query->group_expression_list->children;
            for (ssize_t i = 0; i < ssize_t(group_asts.size()); ++i)
            {
                ssize_t size = group_asts.size();
                getRootActions(group_asts[i], true, false, temp_actions);

                const auto & column_name = group_asts[i]->getColumnName();
                const auto & block = temp_actions->getSampleBlock();

                if (!block.has(column_name))
                    throw Exception("Unknown identifier (in GROUP BY): " + column_name, ErrorCodes::UNKNOWN_IDENTIFIER);

                const auto & col = block.getByName(column_name);

                /// Constant expressions have non-null column pointer at this stage.
567
                if (col.column && col.column->isColumnConst())
568 569 570 571 572 573 574 575 576 577 578 579 580 581
                {
                    /// But don't remove last key column if no aggregate functions, otherwise aggregation will not work.
                    if (!aggregate_descriptions.empty() || size > 1)
                    {
                        if (i + 1 < static_cast<ssize_t>(size))
                            group_asts[i] = std::move(group_asts.back());

                        group_asts.pop_back();

                        --i;
                        continue;
                    }
                }

582
                NameAndTypePair key{column_name, col.type};
583 584 585 586 587

                /// Aggregation keys are uniqued.
                if (!unique_keys.count(key.name))
                {
                    unique_keys.insert(key.name);
588
                    aggregation_keys.push_back(key);
589 590

                    /// Key is no longer needed, therefore we can save a little by moving it.
591
                    aggregated_columns.push_back(std::move(key));
592 593 594 595 596 597
                }
            }

            if (group_asts.empty())
            {
                select_query->group_expression_list = nullptr;
598
                has_aggregation = select_query->having_expression || aggregate_descriptions.size();
599 600 601 602 603 604
            }
        }

        for (size_t i = 0; i < aggregate_descriptions.size(); ++i)
        {
            AggregateDescription & desc = aggregate_descriptions[i];
605
            aggregated_columns.emplace_back(desc.column_name, desc.function->getReturnType());
606 607
        }
    }
608 609 610 611
    else
    {
        aggregated_columns = temp_actions->getSampleBlock().getNamesAndTypesList();
    }
612 613 614
}


615 616
void ExpressionAnalyzer::initGlobalSubqueriesAndExternalTables()
{
F
f1yegor 已提交
617
    /// Adds existing external tables (not subqueries) to the external_tables dictionary.
618
    findExternalTables(query);
619

F
f1yegor 已提交
620
    /// Converts GLOBAL subqueries to external tables; Puts them into the external_tables dictionary: name -> StoragePtr.
621
    initGlobalSubqueries(query);
622 623 624 625 626
}


/** Bottom-up traversal that hands the right-hand side of GLOBAL IN / GLOBAL NOT IN
  * and the table expression of GLOBAL JOIN to addExternalStorage.
  * Children of type ASTSelectQuery are not entered. Nothing is registered unless `do_global` is set.
  */
void ExpressionAnalyzer::initGlobalSubqueries(ASTPtr & ast)
{
    /// Recursive calls. We do not go into subqueries.
    for (auto & child : ast->children)
    {
        if (typeid_cast<ASTSelectQuery *>(child.get()))
            continue;

        initGlobalSubqueries(child);
    }

    /// Bottom-up actions; all of them require `do_global`.
    if (!do_global)
        return;

    if (ASTFunction * func = typeid_cast<ASTFunction *>(ast.get()))
    {
        /// For GLOBAL IN.
        const bool is_global_in = func->name == "globalIn" || func->name == "globalNotIn";
        if (is_global_in)
            addExternalStorage(func->arguments->children.at(1));

        return;
    }

    if (ASTTablesInSelectQueryElement * table_elem = typeid_cast<ASTTablesInSelectQueryElement *>(ast.get()))
    {
        /// For GLOBAL JOIN.
        if (!table_elem->table_join)
            return;

        const auto & table_join = static_cast<const ASTTableJoin &>(*table_elem->table_join);
        if (table_join.locality == ASTTableJoin::Locality::Global)
            addExternalStorage(table_elem->table_expression);
    }
}


/// Registers in `external_tables` every external table of the context that is referenced
/// in the query by a `special()` identifier. Subqueries are traversed as well.
void ExpressionAnalyzer::findExternalTables(ASTPtr & ast)
{
    /// Traverse from the bottom. Intentionally go into subqueries.
    for (auto & child : ast->children)
        findExternalTables(child);

    /// Only identifiers marked as special are looked up as external tables.
    ASTIdentifier * identifier = typeid_cast<ASTIdentifier *>(ast.get());
    if (!identifier || !identifier->special())
        return;

    StoragePtr external_storage = context.tryGetExternalTable(identifier->name);
    if (external_storage)
        external_tables[identifier->name] = external_storage;
}

666
static std::shared_ptr<InterpreterSelectWithUnionQuery> interpretSubquery(
667
    const ASTPtr & table_expression, const Context & context, size_t subquery_depth, const Names & required_source_columns)
668 669
{
    /// Subquery or table name. The name of the table is similar to the subquery `SELECT * FROM t`.
670 671 672
    const ASTSubquery * subquery = typeid_cast<const ASTSubquery *>(table_expression.get());
    const ASTFunction * function = typeid_cast<const ASTFunction *>(table_expression.get());
    const ASTIdentifier * table = typeid_cast<const ASTIdentifier *>(table_expression.get());
673

674 675
    if (!subquery && !table && !function)
        throw Exception("Table expression is undefined, Method: ExpressionAnalyzer::interpretSubquery." , ErrorCodes::LOGICAL_ERROR);
676 677 678 679 680 681 682 683 684 685

    /** The subquery in the IN / JOIN section does not have any restrictions on the maximum size of the result.
      * Because the result of this query is not the result of the entire query.
      * Constraints work instead
      *  max_rows_in_set, max_bytes_in_set, set_overflow_mode,
      *  max_rows_in_join, max_bytes_in_join, join_overflow_mode,
      *  which are checked separately (in the Set, Join objects).
      */
    Context subquery_context = context;
    Settings subquery_settings = context.getSettings();
686 687
    subquery_settings.max_result_rows = 0;
    subquery_settings.max_result_bytes = 0;
688 689 690 691 692
    /// The calculation of `extremes` does not make sense and is not necessary (if you do it, then the `extremes` of the subquery can be taken instead of the whole query).
    subquery_settings.extremes = 0;
    subquery_context.setSettings(subquery_settings);

    ASTPtr query;
693
    if (table || function)
694 695
    {
        /// create ASTSelectQuery for "SELECT * FROM table" as if written by hand
696 697 698 699 700
        const auto select_with_union_query = std::make_shared<ASTSelectWithUnionQuery>();
        query = select_with_union_query;

        select_with_union_query->list_of_selects = std::make_shared<ASTExpressionList>();

701
        const auto select_query = std::make_shared<ASTSelectQuery>();
702
        select_with_union_query->list_of_selects->children.push_back(select_query);
703 704 705 706 707

        const auto select_expression_list = std::make_shared<ASTExpressionList>();
        select_query->select_expression_list = select_expression_list;
        select_query->children.emplace_back(select_query->select_expression_list);

708 709
        NamesAndTypesList columns;

710
        /// get columns list for target table
711 712
        if (function)
        {
713 714
            auto query_context = const_cast<Context *>(&context.getQueryContext());
            const auto & storage = query_context->executeTableFunction(table_expression);
715 716 717 718 719 720 721 722 723 724
            columns = storage->getColumns().ordinary;
            select_query->addTableFunction(*const_cast<ASTPtr *>(&table_expression));
        }
        else
        {
            auto database_table = getDatabaseAndTableNameFromIdentifier(*table);
            const auto & storage = context.getTable(database_table.first, database_table.second);
            columns = storage->getColumns().ordinary;
            select_query->replaceDatabaseAndTable(database_table.first, database_table.second);
        }
725

726
        select_expression_list->children.reserve(columns.size());
727 728
        /// manually substitute column names in place of asterisk
        for (const auto & column : columns)
A
Alexey Milovidov 已提交
729
            select_expression_list->children.emplace_back(std::make_shared<ASTIdentifier>(column.name));
730 731 732 733 734 735 736 737 738 739 740 741 742 743 744
    }
    else
    {
        query = subquery->children.at(0);

        /** Columns with the same name can be specified in a subquery. For example, SELECT x, x FROM t
          * This is bad, because the result of such a query can not be saved to the table, because the table can not have the same name columns.
          * Saving to the table is required for GLOBAL subqueries.
          *
          * To avoid this situation, we will rename the same columns.
          */

        std::set<std::string> all_column_names;
        std::set<std::string> assigned_column_names;

745
        if (ASTSelectWithUnionQuery * select_with_union = typeid_cast<ASTSelectWithUnionQuery *>(query.get()))
746
        {
747
            if (ASTSelectQuery * select = typeid_cast<ASTSelectQuery *>(select_with_union->list_of_selects->children.at(0).get()))
748
            {
749 750
                for (auto & expr : select->select_expression_list->children)
                    all_column_names.insert(expr->getAliasOrColumnName());
751

752
                for (auto & expr : select->select_expression_list->children)
753
                {
754 755 756 757 758 759 760
                    auto name = expr->getAliasOrColumnName();

                    if (!assigned_column_names.insert(name).second)
                    {
                        size_t i = 1;
                        while (all_column_names.end() != all_column_names.find(name + "_" + toString(i)))
                            ++i;
761

762 763 764
                        name = name + "_" + toString(i);
                        expr = expr->clone();   /// Cancels fuse of the same expressions in the tree.
                        expr->setAlias(name);
765

766 767 768
                        all_column_names.insert(name);
                        assigned_column_names.insert(name);
                    }
769 770 771 772 773
                }
            }
        }
    }

774 775
    return std::make_shared<InterpreterSelectWithUnionQuery>(
        query, subquery_context, required_source_columns, QueryProcessingStage::Complete, subquery_depth + 1);
776
}
777 778


779
void ExpressionAnalyzer::addExternalStorage(ASTPtr & subquery_or_table_name_or_table_expression)
{
    /// Temporary tables are only created when the query targets a remote (distributed) storage.
    const bool is_remote_storage = storage && storage->isRemote();
    if (!is_remote_storage)
        return;

    const ASTPtr & node = subquery_or_table_name_or_table_expression;

    /// Classify the incoming node: at most one of the two pointers below gets set.
    ASTPtr subquery;
    ASTPtr table_name;

    if (typeid_cast<const ASTIdentifier *>(node.get()))
    {
        table_name = node;
    }
    else if (auto table_expr = typeid_cast<const ASTTableExpression *>(node.get()))
    {
        /// A table name takes precedence over a subquery inside a table expression.
        if (table_expr->database_and_table_name)
            table_name = table_expr->database_and_table_name;
        else
            subquery = table_expr->subquery;
    }
    else if (typeid_cast<const ASTSubquery *>(node.get()))
    {
        subquery = node;
    }

    ASTPtr subquery_or_table_name = table_name ? table_name : subquery;

    if (!subquery_or_table_name)
        throw Exception("Logical error: unknown AST element passed to ExpressionAnalyzer::addExternalStorage method", ErrorCodes::LOGICAL_ERROR);

    /// A name that already refers to a registered external table needs no further work.
    if (table_name && external_tables.count(static_cast<const ASTIdentifier &>(*table_name).name))
        return;

    /// Pick the first free name of the form _data<N>, advancing the shared counter as needed.
    String external_table_name;
    for (;;)
    {
        external_table_name = "_data" + toString(external_table_id);
        if (!external_tables.count(external_table_name))
            break;
        ++external_table_id;
    }

    auto interpreter = interpretSubquery(subquery_or_table_name, context, subquery_depth, {});

    /// The in-memory table gets the same structure as the result of the subquery.
    Block sample = interpreter->getSampleBlock();
    NamesAndTypesList columns = sample.getNamesAndTypesList();

    StoragePtr external_storage = StorageMemory::create(external_table_name, ColumnsDescription{columns});
    external_storage->startup();

    /** The subquery is replaced in the AST by the name of the temporary table.
      * The query is sent to the remote server in this form, together with the temporary table,
      *  so the remote side only has to read the table instead of running the subquery.
      */
    auto database_and_table_name = ASTIdentifier::createSpecial(external_table_name);

    if (auto mutable_table_expr = typeid_cast<ASTTableExpression *>(subquery_or_table_name_or_table_expression.get()))
    {
        mutable_table_expr->subquery.reset();
        mutable_table_expr->database_and_table_name = database_and_table_name;

        mutable_table_expr->children.clear();
        mutable_table_expr->children.emplace_back(database_and_table_name);
    }
    else
    {
        subquery_or_table_name_or_table_expression = database_and_table_name;
    }

    external_tables[external_table_name] = external_storage;

    /// The stream is obtained before the map entry is touched, as in a plain assignment expression.
    auto source_stream = interpreter->execute().in;
    subqueries_for_sets[external_table_name].source = source_stream;
    subqueries_for_sets[external_table_name].table = external_storage;

    /** NOTE If the query said IN tmp_table, where tmp_table is an existing temporary (but not external) table,
      *  a new temporary table (for example, _data1) is still created and the data is copied into it.
      * Maybe this can be avoided.
      */
}


870
/// Returns an iterator to the first column of `cols` whose name equals `name`, or cols.end() if there is none.
static NamesAndTypesList::iterator findColumn(const String & name, NamesAndTypesList & cols)
{
    auto it = cols.begin();
    while (it != cols.end() && !(it->name == name))
        ++it;
    return it;
}

876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900
/** Returns the columns (names and types) produced by a table expression.
  * The expression is inspected in this order: subquery, table function, database-and-table name.
  * If none of them is set, an empty list is returned.
  */
static NamesAndTypesList getNamesAndTypeListFromTableExpression(const ASTTableExpression & table_expression, const Context & context)
{
    if (table_expression.subquery)
    {
        /// The first child of the subquery node is the SELECT to take the header from.
        const auto & select_ast = table_expression.subquery->children.at(0);
        return InterpreterSelectWithUnionQuery::getSampleBlock(select_ast, context).getNamesAndTypesList();
    }

    if (table_expression.table_function)
    {
        const auto table_function = table_expression.table_function;
        /// executeTableFunction is called through a non-const pointer to the query context.
        auto mutable_query_context = const_cast<Context *>(&context.getQueryContext());
        const auto & function_storage = mutable_query_context->executeTableFunction(table_function);
        return function_storage->getSampleBlockNonMaterialized().getNamesAndTypesList();
    }

    if (table_expression.database_and_table_name)
    {
        const auto & identifier = static_cast<const ASTIdentifier &>(*table_expression.database_and_table_name);
        auto database_and_table = getDatabaseAndTableNameFromIdentifier(identifier);
        const auto & table = context.getTable(database_and_table.first, database_and_table.second);
        return table->getSampleBlockNonMaterialized().getNamesAndTypesList();
    }

    return {};
}
901 902 903

void ExpressionAnalyzer::normalizeTree()
{
904
    Names all_columns_name;
905

906 907
    auto columns_name = storage ? storage->getColumns().ordinary.getNames() : source_columns.getNames();
    all_columns_name.insert(all_columns_name.begin(), columns_name.begin(), columns_name.end());
908

909
    if (!settings.asterisk_left_columns_only)
910
    {
911 912
        auto columns_from_joined_table = analyzed_join.getColumnsFromJoinedTable(context, select_query).getNames();
        all_columns_name.insert(all_columns_name.end(), columns_from_joined_table.begin(), columns_from_joined_table.end());
913 914
    }

915 916
    if (all_columns_name.empty())
        throw Exception("Logical error: an asterisk cannot be replaced with empty columns.", ErrorCodes::LOGICAL_ERROR);
917

918 919 920
    TableNamesAndColumnsName table_names_nad_columns_name;
    if (select_query && select_query->tables && !select_query->tables->children.empty())
    {
921
        std::vector<ASTTableExpression> tables_expression = getTableExpressions(query);
922 923 924 925 926 927 928 929 930 931

        for (const auto & table_expression : tables_expression)
        {
            const auto table_name = getTableNameWithAliasFromTableExpression(table_expression, context);
            NamesAndTypesList names_and_types = getNamesAndTypeListFromTableExpression(table_expression, context);
            table_names_nad_columns_name.emplace_back(std::pair(table_name, names_and_types.getNames()));
        }
    }

    QueryNormalizer(query, aliases, settings, all_columns_name, table_names_nad_columns_name).perform();
932 933
}

934

935
void ExpressionAnalyzer::addAliasColumns()
936
{
937 938
    if (!select_query)
        return;
939

940 941
    if (!storage)
        return;
942

943 944
    const auto & storage_aliases = storage->getColumns().aliases;
    source_columns.insert(std::end(source_columns), std::begin(storage_aliases), std::end(storage_aliases));
945 946 947
}


948 949
void ExpressionAnalyzer::executeScalarSubqueries()
{
950
    if (!select_query)
951
        executeScalarSubqueriesImpl(query);
952 953
    else
    {
954
        for (auto & child : query->children)
955
        {
F
f1yegor 已提交
956
            /// Do not go to FROM, JOIN, UNION.
957
            if (!typeid_cast<const ASTTableExpression *>(child.get())
A
Alexey Milovidov 已提交
958
                && !typeid_cast<const ASTSelectQuery *>(child.get()))
959 960 961 962 963
            {
                executeScalarSubqueriesImpl(child);
            }
        }
    }
964 965
}

966

967
/** Wraps a literal into the function call CAST(literal, 'type_name') and returns the new node.
  * The alias and the prefer_alias_to_column_name flag are moved from the literal to the CAST node,
  *  so the resulting expression is addressed by the same name as the literal was.
  * Ownership of `ast` is taken over by the argument list of the created function.
  */
static ASTPtr addTypeConversion(std::unique_ptr<ASTLiteral> && ast, const String & type_name)
{
    auto func = std::make_shared<ASTFunction>();
    ASTPtr res = func;

    /// The alias now belongs to the outer CAST; the inner literal must not keep it.
    func->alias = ast->alias;
    func->prefer_alias_to_column_name = ast->prefer_alias_to_column_name;
    ast->alias.clear();

    func->name = "CAST";

    auto exp_list = std::make_shared<ASTExpressionList>();
    func->arguments = exp_list;
    func->children.push_back(func->arguments);

    /// Hand the literal over by moving the unique_ptr instead of release():
    ///  if growing the vector throws, the literal is still owned and is not leaked.
    exp_list->children.emplace_back(std::move(ast));
    exp_list->children.emplace_back(std::make_shared<ASTLiteral>(type_name));

    return res;
}


984 985
void ExpressionAnalyzer::executeScalarSubqueriesImpl(ASTPtr & ast)
{
    /** Replaces subqueries that return exactly one row ("scalar" subqueries) with the corresponding constants.
      *
      * If the subquery returns more than one column, it is replaced by a tuple of constants.
      *
      * Features
      *
      * The replacement happens during query analysis, not during the main runtime.
      * This means that the progress indicator does not work while these subqueries are executed,
      *  and such queries cannot be aborted.
      *
      * But the query result can be used for the index in the table.
      *
      * Scalar subqueries are executed on the request-initiator server.
      * The request is sent to remote servers with the constants already substituted.
      */

    ASTSubquery * subquery = typeid_cast<ASTSubquery *>(ast.get());

    if (!subquery)
    {
        /// Don't descend into subqueries in the FROM section.
        if (typeid_cast<ASTTableExpression *>(ast.get()))
            return;

        /** Don't descend into subqueries in arguments of the IN operator.
          * But if an argument is not a subquery, scalar subqueries may be deeper inside it and must be visited.
          */
        ASTFunction * func = typeid_cast<ASTFunction *>(ast.get());
        const bool is_in_operator = func && functionIsInOrGlobalInOperator(func->name);

        for (auto & child : ast->children)
        {
            if (!is_in_operator || child != func->arguments)
            {
                executeScalarSubqueriesImpl(child);
                continue;
            }

            /// Argument list of IN: skip only the second argument, and only when it is a subquery.
            auto & in_arguments = func->arguments->children;
            for (size_t i = 0, size = in_arguments.size(); i < size; ++i)
                if (i != 1 || !typeid_cast<ASTSubquery *>(in_arguments[i].get()))
                    executeScalarSubqueriesImpl(in_arguments[i]);
        }

        return;
    }

    /// Run the subquery in a copy of the context with the result limited to one row and extremes disabled.
    Context subquery_context = context;
    Settings subquery_settings = context.getSettings();
    subquery_settings.max_result_rows = 1;
    subquery_settings.extremes = 0;
    subquery_context.setSettings(subquery_settings);

    ASTPtr subquery_select = subquery->children.at(0);
    BlockIO res = InterpreterSelectWithUnionQuery(subquery_select, subquery_context, {}, QueryProcessingStage::Complete, subquery_depth + 1).execute();

    Block block;
    try
    {
        block = res.in->read();

        if (!block)
        {
            /// Interpret subquery with empty result as Null literal
            auto null_literal = std::make_unique<ASTLiteral>(Null());
            null_literal->setAlias(ast->tryGetAlias());
            ast = std::move(null_literal);
            return;
        }

        if (block.rows() != 1 || res.in->read())
            throw Exception("Scalar subquery returned more than one row", ErrorCodes::INCORRECT_RESULT_OF_SCALAR_SUBQUERY);
    }
    catch (const Exception & e)
    {
        /// Only the TOO_MANY_ROWS error is translated; everything else is rethrown unchanged.
        if (e.code() != ErrorCodes::TOO_MANY_ROWS)
            throw;

        throw Exception("Scalar subquery returned more than one row", ErrorCodes::INCORRECT_RESULT_OF_SCALAR_SUBQUERY);
    }

    const size_t columns = block.columns();

    if (columns == 1)
    {
        /// Single column: a literal wrapped into CAST to the column type, keeping the alias of the subquery.
        const auto & result_column = block.safeGetByPosition(0);

        auto lit = std::make_unique<ASTLiteral>((*result_column.column)[0]);
        lit->alias = subquery->alias;
        lit->prefer_alias_to_column_name = subquery->prefer_alias_to_column_name;
        ast = addTypeConversion(std::move(lit), result_column.type->getName());
        return;
    }

    /// Several columns: a `tuple` function whose arguments are the type-converted constants.
    auto tuple = std::make_shared<ASTFunction>();
    tuple->alias = subquery->alias;     /// Must be read before `ast` is reassigned below.
    ast = tuple;
    tuple->name = "tuple";

    auto exp_list = std::make_shared<ASTExpressionList>();
    tuple->arguments = exp_list;
    tuple->children.push_back(tuple->arguments);

    auto & tuple_elements = exp_list->children;
    tuple_elements.resize(columns);
    for (size_t i = 0; i < columns; ++i)
    {
        tuple_elements[i] = addTypeConversion(
            std::make_unique<ASTLiteral>((*block.safeGetByPosition(i).column)[0]),
            block.safeGetByPosition(i).type->getName());
    }
}


1097
void ExpressionAnalyzer::optimizeGroupBy()
1098
{
1099 1100 1101
    if (!(select_query && select_query->group_expression_list))
        return;

A
Alexey Milovidov 已提交
1102 1103 1104
    const auto is_literal = [] (const ASTPtr & ast)
    {
        return typeid_cast<const ASTLiteral *>(ast.get());
1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179
    };

    auto & group_exprs = select_query->group_expression_list->children;

    /// removes expression at index idx by making it last one and calling .pop_back()
    const auto remove_expr_at_index = [&group_exprs] (const size_t idx)
    {
        if (idx < group_exprs.size() - 1)
            std::swap(group_exprs[idx], group_exprs.back());

        group_exprs.pop_back();
    };

    /// iterate over each GROUP BY expression, eliminate injective function calls and literals
    for (size_t i = 0; i < group_exprs.size();)
    {
        if (const auto function = typeid_cast<ASTFunction *>(group_exprs[i].get()))
        {
            /// assert function is injective
            if (possibly_injective_function_names.count(function->name))
            {
                /// do not handle semantic errors here
                if (function->arguments->children.size() < 2)
                {
                    ++i;
                    continue;
                }

                const auto & dict_name = typeid_cast<const ASTLiteral &>(*function->arguments->children[0])
                    .value.safeGet<String>();

                const auto & dict_ptr = context.getExternalDictionaries().getDictionary(dict_name);

                const auto & attr_name = typeid_cast<const ASTLiteral &>(*function->arguments->children[1])
                    .value.safeGet<String>();

                if (!dict_ptr->isInjective(attr_name))
                {
                    ++i;
                    continue;
                }
            }
            else if (!injective_function_names.count(function->name))
            {
                ++i;
                continue;
            }

            /// copy shared pointer to args in order to ensure lifetime
            auto args_ast = function->arguments;

            /** remove function call and take a step back to ensure
              * next iteration does not skip not yet processed data
              */
            remove_expr_at_index(i);

            /// copy non-literal arguments
            std::remove_copy_if(
                std::begin(args_ast->children), std::end(args_ast->children),
                std::back_inserter(group_exprs), is_literal
            );
        }
        else if (is_literal(group_exprs[i]))
        {
            remove_expr_at_index(i);
        }
        else
        {
            /// if neither a function nor literal - advance to next expression
            ++i;
        }
    }

    if (group_exprs.empty())
    {
F
f1yegor 已提交
1180 1181 1182
        /** You can not completely remove GROUP BY. Because if there were no aggregate functions, then it turns out that there will be no aggregation.
          * Instead, leave `GROUP BY const`.
          * Next, see deleting the constants in the analyzeAggregation method.
1183 1184
          */

F
f1yegor 已提交
1185
        /// You must insert a constant that is not the name of the column in the table. Such a case is rare, but it happens.
1186 1187 1188
        UInt64 unused_column = 0;
        String unused_column_name = toString(unused_column);

1189
        while (source_columns.end() != std::find_if(source_columns.begin(), source_columns.end(),
1190
            [&unused_column_name](const NameAndTypePair & name_type) { return name_type.name == unused_column_name; }))
1191 1192 1193 1194 1195 1196
        {
            ++unused_column;
            unused_column_name = toString(unused_column);
        }

        select_query->group_expression_list = std::make_shared<ASTExpressionList>();
A
Alexey Milovidov 已提交
1197
        select_query->group_expression_list->children.emplace_back(std::make_shared<ASTLiteral>(UInt64(unused_column)));
1198
    }
1199 1200 1201
}


1202 1203
void ExpressionAnalyzer::optimizeOrderBy()
{
1204 1205
    if (!(select_query && select_query->order_expression_list))
        return;
1206

F
f1yegor 已提交
1207
    /// Make unique sorting conditions.
1208 1209
    using NameAndLocale = std::pair<String, String>;
    std::set<NameAndLocale> elems_set;
1210

1211 1212 1213
    ASTs & elems = select_query->order_expression_list->children;
    ASTs unique_elems;
    unique_elems.reserve(elems.size());
1214

1215 1216 1217 1218
    for (const auto & elem : elems)
    {
        String name = elem->children.front()->getColumnName();
        const ASTOrderByElement & order_by_elem = typeid_cast<const ASTOrderByElement &>(*elem);
1219

1220 1221 1222
        if (elems_set.emplace(name, order_by_elem.collation ? order_by_elem.collation->getColumnName() : "").second)
            unique_elems.emplace_back(elem);
    }
1223

1224 1225
    if (unique_elems.size() < elems.size())
        elems = unique_elems;
1226 1227 1228
}


1229 1230
void ExpressionAnalyzer::optimizeLimitBy()
{
1231 1232
    if (!(select_query && select_query->limit_by_expression_list))
        return;
1233

1234
    std::set<String> elems_set;
1235

1236 1237 1238
    ASTs & elems = select_query->limit_by_expression_list->children;
    ASTs unique_elems;
    unique_elems.reserve(elems.size());
1239

1240 1241 1242 1243 1244
    for (const auto & elem : elems)
    {
        if (elems_set.emplace(elem->getColumnName()).second)
            unique_elems.emplace_back(elem);
    }
1245

1246 1247
    if (unique_elems.size() < elems.size())
        elems = unique_elems;
1248 1249
}

1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281
/** Removes duplicate expressions from the USING list of the JOIN, comparing them by alias or column name.
  * The first occurrence of each name is kept and the relative order is preserved.
  * Does nothing if the query is not a SELECT, has no JOIN, or the JOIN has no USING list.
  */
void ExpressionAnalyzer::optimizeUsing()
{
    if (!select_query)
        return;

    auto node = const_cast<ASTTablesInSelectQueryElement *>(select_query->join());
    if (!node)
        return;

    /// get() is used rather than &* so that an empty table_join pointer is not dereferenced
    ///  before the null check below.
    auto table_join = static_cast<ASTTableJoin *>(node->table_join.get());
    if (!(table_join && table_join->using_expression_list))
        return;

    ASTs & expression_list = table_join->using_expression_list->children;

    ASTs uniq_expressions_list;
    uniq_expressions_list.reserve(expression_list.size());

    std::set<String> expressions_names;

    for (const auto & expression : expression_list)
    {
        /// insert() reports whether the name is new, so a separate find() is not needed.
        if (expressions_names.insert(expression->getAliasOrColumnName()).second)
            uniq_expressions_list.push_back(expression);
    }

    /// Touch the AST only if something was actually dropped.
    if (uniq_expressions_list.size() < expression_list.size())
        expression_list = std::move(uniq_expressions_list);
}

1282

1283
void ExpressionAnalyzer::makeSetsForIndex()
P
Pavel Kartavyy 已提交
1284
{
1285 1286 1287 1288 1289 1290 1291
    if (storage && select_query && storage->supportsIndexForIn())
    {
        if (select_query->where_expression)
            makeSetsForIndexImpl(select_query->where_expression, storage->getSampleBlock());
        if (select_query->prewhere_expression)
            makeSetsForIndexImpl(select_query->prewhere_expression, storage->getSampleBlock());
    }
P
Pavel Kartavyy 已提交
1292 1293
}

1294

1295
void ExpressionAnalyzer::tryMakeSetForIndexFromSubquery(const ASTPtr & subquery_or_table_name)
1296 1297 1298
{
    BlockIO res = interpretSubquery(subquery_or_table_name, context, subquery_depth + 1, {})->execute();

A
Alexey Milovidov 已提交
1299
    SizeLimits set_for_index_size_limits = SizeLimits(settings.max_rows_in_set, settings.max_bytes_in_set, settings.set_overflow_mode);
1300
    SetPtr set = std::make_shared<Set>(set_for_index_size_limits, true);
1301

1302
    set->setHeader(res.in->getHeader());
1303 1304 1305
    while (Block block = res.in->read())
    {
        /// If the limits have been exceeded, give up and let the default subquery processing actions take place.
A
Alexey Milovidov 已提交
1306
        if (!set->insertFromBlock(block))
1307 1308 1309
            return;
    }

1310
    prepared_sets[subquery_or_table_name->range] = std::move(set);
1311 1312 1313
}


1314
void ExpressionAnalyzer::makeSetsForIndexImpl(const ASTPtr & node, const Block & sample_block)
P
Pavel Kartavyy 已提交
1315
{
1316
    for (auto & child : node->children)
1317
    {
1318
        /// Don't descent into subqueries.
1319 1320 1321 1322 1323 1324 1325 1326 1327
        if (typeid_cast<ASTSubquery *>(child.get()))
            continue;

        /// Don't dive into lambda functions
        const ASTFunction * func = typeid_cast<const ASTFunction *>(child.get());
        if (func && func->name == "lambda")
            continue;

        makeSetsForIndexImpl(child, sample_block);
1328
    }
1329

1330
    const ASTFunction * func = typeid_cast<const ASTFunction *>(node.get());
1331
    if (func && functionIsInOperator(func->name))
1332
    {
1333
        const IAST & args = *func->arguments;
1334

1335
        if (storage && storage->mayBenefitFromIndexForIn(args.children.at(0)))
1336
        {
1337 1338
            const ASTPtr & arg = args.children.at(1);

1339
            if (!prepared_sets.count(arg->range)) /// Not already prepared.
1340
            {
1341 1342 1343
                if (typeid_cast<ASTSubquery *>(arg.get()) || typeid_cast<ASTIdentifier *>(arg.get()))
                {
                    if (settings.use_index_for_in_with_subqueries)
1344
                        tryMakeSetForIndexFromSubquery(arg);
1345 1346 1347
                }
                else
                {
1348
                    NamesAndTypesList temp_columns = source_columns;
1349
                    temp_columns.insert(temp_columns.end(), array_join_columns.begin(), array_join_columns.end());
N
Nikolai Kochetov 已提交
1350 1351
                    for (const auto & joined_column : analyzed_join.columns_added_by_join)
                        temp_columns.push_back(joined_column.name_and_type);
1352
                    ExpressionActionsPtr temp_actions = std::make_shared<ExpressionActions>(temp_columns, context);
1353
                    getRootActions(func->arguments->children.at(0), true, false, temp_actions);
1354

1355 1356 1357 1358
                    Block sample_block_with_calculated_columns = temp_actions->getSampleBlock();
                    if (sample_block_with_calculated_columns.has(args.children.at(0)->getColumnName()))
                        makeExplicitSet(func, sample_block_with_calculated_columns, true);
                }
1359 1360 1361
            }
        }
    }
P
Pavel Kartavyy 已提交
1362
}
1363

1364

1365
void ExpressionAnalyzer::makeSet(const ASTFunction * node, const Block & sample_block)
1366
{
F
f1yegor 已提交
1367 1368 1369
    /** You need to convert the right argument to a set.
      * This can be a table name, a value, a value enumeration, or a subquery.
      * The enumeration of values is parsed as a function `tuple`.
1370
      */
1371 1372
    const IAST & args = *node->arguments;
    const ASTPtr & arg = args.children.at(1);
1373

F
f1yegor 已提交
1374
    /// Already converted.
1375
    if (prepared_sets.count(arg->range))
1376 1377
        return;

F
f1yegor 已提交
1378
    /// If the subquery or table name for SELECT.
1379 1380
    const ASTIdentifier * identifier = typeid_cast<const ASTIdentifier *>(arg.get());
    if (typeid_cast<const ASTSubquery *>(arg.get()) || identifier)
1381
    {
F
f1yegor 已提交
1382
        /// We get the stream of blocks for the subquery. Create Set and put it in place of the subquery.
1383 1384
        String set_id = arg->getColumnName();

1385 1386
        /// A special case is if the name of the table is specified on the right side of the IN statement,
        ///  and the table has the type Set (a previously prepared set).
1387 1388
        if (identifier)
        {
1389 1390
            auto database_table = getDatabaseAndTableNameFromIdentifier(*identifier);
            StoragePtr table = context.tryGetTable(database_table.first, database_table.second);
1391 1392 1393

            if (table)
            {
A
Alexey Milovidov 已提交
1394
                StorageSet * storage_set = dynamic_cast<StorageSet *>(table.get());
1395 1396 1397

                if (storage_set)
                {
1398
                    prepared_sets[arg->range] = storage_set->getSet();
1399 1400 1401 1402 1403 1404 1405
                    return;
                }
            }
        }

        SubqueryForSet & subquery_for_set = subqueries_for_sets[set_id];

F
f1yegor 已提交
1406
        /// If you already created a Set with the same subquery / table.
1407 1408
        if (subquery_for_set.set)
        {
1409
            prepared_sets[arg->range] = subquery_for_set.set;
1410 1411 1412
            return;
        }

A
Alexey Milovidov 已提交
1413
        SetPtr set = std::make_shared<Set>(SizeLimits(settings.max_rows_in_set, settings.max_bytes_in_set, settings.set_overflow_mode), false);
1414

F
f1yegor 已提交
1415 1416 1417 1418
        /** The following happens for GLOBAL INs:
          * - in the addExternalStorage function, the IN (SELECT ...) subquery is replaced with IN _data1,
          *   in the subquery_for_set object, this subquery is set as source and the temporary table _data1 as the table.
          * - this function shows the expression IN_data1.
1419
          */
1420
        if (!subquery_for_set.source && (!storage || !storage->isRemote()))
1421 1422 1423
        {
            auto interpreter = interpretSubquery(arg, context, subquery_depth, {});
            subquery_for_set.source = std::make_shared<LazyBlockInputStream>(
1424
                interpreter->getSampleBlock(), [interpreter]() mutable { return interpreter->execute().in; });
1425

F
f1yegor 已提交
1426
            /** Why is LazyBlockInputStream used?
1427
              *
A
Alexey Milovidov 已提交
1428
              * The fact is that when processing a query of the form
1429
              *  SELECT ... FROM remote_test WHERE column GLOBAL IN (subquery),
F
f1yegor 已提交
1430
              *  if the distributed remote_test table contains localhost as one of the servers,
A
Alexey Milovidov 已提交
1431
              *  the query will be interpreted locally again (and not sent over TCP, as in the case of a remote server).
1432
              *
F
f1yegor 已提交
1433
              * The query execution pipeline will be:
1434
              * CreatingSets
F
f1yegor 已提交
1435
              *  subquery execution, filling the temporary table with _data1 (1)
1436
              *  CreatingSets
F
f1yegor 已提交
1437 1438
              *   reading from the table _data1, creating the set (2)
              *   read from the table subordinate to remote_test.
1439
              *
A
Alexey Milovidov 已提交
1440
              * (The second part of the pipeline under CreateSets is a reinterpretation of the query inside StorageDistributed,
F
f1yegor 已提交
1441
              *  the query differs in that the database name and tables are replaced with subordinates, and the subquery is replaced with _data1.)
1442
              *
F
f1yegor 已提交
1443 1444 1445
              * But when creating the pipeline, when creating the source (2), it will be found that the _data1 table is empty
              *  (because the query has not started yet), and empty source will be returned as the source.
              * And then, when the query is executed, an empty set will be created in step (2).
1446
              *
F
f1yegor 已提交
1447 1448
              * Therefore, we make the initialization of step (2) lazy
              *  - so that it does not occur until step (1) is completed, on which the table will be populated.
1449
              *
F
f1yegor 已提交
1450
              * Note: this solution is not very good, you need to think better.
1451 1452 1453
              */
        }

1454
        subquery_for_set.set = set;
1455
        prepared_sets[arg->range] = set;
1456 1457 1458
    }
    else
    {
F
f1yegor 已提交
1459
        /// An explicit enumeration of values in parentheses.
1460 1461
        makeExplicitSet(node, sample_block, false);
    }
P
Pavel Kartavyy 已提交
1462 1463
}

F
f1yegor 已提交
1464
/// The case of an explicit enumeration of values.
1465
void ExpressionAnalyzer::makeExplicitSet(const ASTFunction * node, const Block & sample_block, bool create_ordered_set)
P
Pavel Kartavyy 已提交
1466
{
1467
    const IAST & args = *node->arguments;
1468 1469 1470 1471

    if (args.children.size() != 2)
        throw Exception("Wrong number of arguments passed to function in", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);

1472
    const ASTPtr & left_arg = args.children.at(0);
1473
    const ASTPtr & right_arg = args.children.at(1);
1474

1475
    auto getTupleTypeFromAst = [this](const ASTPtr & tuple_ast) -> DataTypePtr
1476
    {
1477
        auto ast_function = typeid_cast<const ASTFunction *>(tuple_ast.get());
1478 1479 1480 1481 1482 1483 1484 1485
        if (ast_function && ast_function->name == "tuple" && !ast_function->arguments->children.empty())
        {
            /// Won't parse all values of outer tuple.
            auto element = ast_function->arguments->children.at(0);
            std::pair<Field, DataTypePtr> value_raw = evaluateConstantExpression(element, context);
            return std::make_shared<DataTypeTuple>(DataTypes({value_raw.second}));
        }

1486
        return evaluateConstantExpression(tuple_ast, context).second;
1487 1488
    };

1489
    const DataTypePtr & left_arg_type = sample_block.getByName(left_arg->getColumnName()).type;
1490
    const DataTypePtr & right_arg_type = getTupleTypeFromAst(right_arg);
1491

1492 1493
    std::function<size_t(const DataTypePtr &)> getTupleDepth;
    getTupleDepth = [&getTupleDepth](const DataTypePtr & type) -> size_t
1494
    {
1495 1496
        if (auto tuple_type = typeid_cast<const DataTypeTuple *>(type.get()))
            return 1 + (tuple_type->getElements().empty() ? 0 : getTupleDepth(tuple_type->getElements().at(0)));
1497

1498 1499
        return 0;
    };
1500

1501
    size_t left_tuple_depth = getTupleDepth(left_arg_type);
1502
    size_t right_tuple_depth = getTupleDepth(right_arg_type);
1503

1504 1505 1506
    DataTypes set_element_types = {left_arg_type};
    auto left_tuple_type = typeid_cast<const DataTypeTuple *>(left_arg_type.get());
    if (left_tuple_type && left_tuple_type->getElements().size() != 1)
1507
        set_element_types = left_tuple_type->getElements();
1508

1509 1510 1511 1512
    for (auto & element_type : set_element_types)
        if (const auto * low_cardinality_type = typeid_cast<const DataTypeWithDictionary *>(element_type.get()))
            element_type = low_cardinality_type->getDictionaryType();

1513
    ASTPtr elements_ast = nullptr;
1514

1515 1516
    /// 1 in 1; (1, 2) in (1, 2); identity(tuple(tuple(tuple(1)))) in tuple(tuple(tuple(1))); etc.
    if (left_tuple_depth == right_tuple_depth)
1517 1518
    {
        ASTPtr exp_list = std::make_shared<ASTExpressionList>();
1519
        exp_list->children.push_back(right_arg);
1520 1521
        elements_ast = exp_list;
    }
1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537
    /// 1 in (1, 2); (1, 2) in ((1, 2), (3, 4)); etc.
    else if (left_tuple_depth + 1 == right_tuple_depth)
    {
        ASTFunction * set_func = typeid_cast<ASTFunction *>(right_arg.get());

        if (!set_func || set_func->name != "tuple")
            throw Exception("Incorrect type of 2nd argument for function " + node->name
                            + ". Must be subquery or set of elements with type " + left_arg_type->getName() + ".",
                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

        elements_ast = set_func->arguments;
    }
    else
        throw Exception("Invalid types for IN function: "
                        + left_arg_type->getName() + " and " + right_arg_type->getName() + ".",
                        ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
1538

A
Alexey Milovidov 已提交
1539 1540
    SetPtr set = std::make_shared<Set>(SizeLimits(settings.max_rows_in_set, settings.max_bytes_in_set, settings.set_overflow_mode), create_ordered_set);
    set->createFromAST(set_element_types, elements_ast, context);
1541
    prepared_sets[right_arg->range] = std::move(set);
1542 1543 1544
}


1545
/// Returns the first name of the form `prefix + N` (N = 1, 2, ...) that the block does not contain yet.
static String getUniqueName(const Block & block, const String & prefix)
{
    for (int suffix = 1;; ++suffix)
    {
        String candidate = prefix + toString(suffix);
        if (!block.has(candidate))
            return candidate;
    }
}

F
f1yegor 已提交
1553 1554 1555 1556 1557
/** For getActionsImpl.
  * A stack of ExpressionActions corresponding to nested lambda expressions.
  * The new action should be added to the highest possible level.
  * For example, in the expression "select arrayMap(x -> x + column1 * column2, array1)"
  *  calculation of the product must be done outside the lambda expression (it does not depend on x), and the calculation of the sum is inside (depends on x).
A
Alexey Milovidov 已提交
1558
  */
1559 1560
ScopeStack::ScopeStack(const ExpressionActionsPtr & actions, const Context & context_)
    : context(context_)
A
Alexey Milovidov 已提交
1561
{
T
Tsarkova Anastasia 已提交
1562 1563 1564 1565 1566 1567 1568
    stack.emplace_back();
    stack.back().actions = actions;

    const Block & sample_block = actions->getSampleBlock();
    for (size_t i = 0, size = sample_block.columns(); i < size; ++i)
        stack.back().new_columns.insert(sample_block.getByPosition(i).name);
}
1569

T
Tsarkova Anastasia 已提交
1570 1571 1572 1573
void ScopeStack::pushLevel(const NamesAndTypesList & input_columns)
{
    stack.emplace_back();
    Level & prev = stack[stack.size() - 2];
1574

T
Tsarkova Anastasia 已提交
1575 1576
    ColumnsWithTypeAndName all_columns;
    NameSet new_names;
1577

T
Tsarkova Anastasia 已提交
1578
    for (NamesAndTypesList::const_iterator it = input_columns.begin(); it != input_columns.end(); ++it)
1579
    {
T
Tsarkova Anastasia 已提交
1580 1581 1582
        all_columns.emplace_back(nullptr, it->type, it->name);
        new_names.insert(it->name);
        stack.back().new_columns.insert(it->name);
1583 1584
    }

T
Tsarkova Anastasia 已提交
1585 1586
    const Block & prev_sample_block = prev.actions->getSampleBlock();
    for (size_t i = 0, size = prev_sample_block.columns(); i < size; ++i)
1587
    {
T
Tsarkova Anastasia 已提交
1588 1589 1590
        const ColumnWithTypeAndName & col = prev_sample_block.getByPosition(i);
        if (!new_names.count(col.name))
            all_columns.push_back(col);
1591 1592
    }

1593
    stack.back().actions = std::make_shared<ExpressionActions>(all_columns, context);
T
Tsarkova Anastasia 已提交
1594
}
1595

T
Tsarkova Anastasia 已提交
1596 1597 1598 1599 1600
size_t ScopeStack::getColumnLevel(const std::string & name)
{
    for (int i = static_cast<int>(stack.size()) - 1; i >= 0; --i)
        if (stack[i].new_columns.count(name))
            return i;
1601

T
Tsarkova Anastasia 已提交
1602 1603
    throw Exception("Unknown identifier: " + name, ErrorCodes::UNKNOWN_IDENTIFIER);
}
1604

T
Tsarkova Anastasia 已提交
1605 1606 1607 1608 1609 1610
void ScopeStack::addAction(const ExpressionAction & action)
{
    size_t level = 0;
    Names required = action.getNeededColumns();
    for (size_t i = 0; i < required.size(); ++i)
        level = std::max(level, getColumnLevel(required[i]));
1611

T
Tsarkova Anastasia 已提交
1612 1613
    Names added;
    stack[level].actions->add(action, added);
1614

T
Tsarkova Anastasia 已提交
1615
    stack[level].new_columns.insert(added.begin(), added.end());
1616

T
Tsarkova Anastasia 已提交
1617
    for (size_t i = 0; i < added.size(); ++i)
1618
    {
T
Tsarkova Anastasia 已提交
1619 1620 1621
        const ColumnWithTypeAndName & col = stack[level].actions->getSampleBlock().getByName(added[i]);
        for (size_t j = level + 1; j < stack.size(); ++j)
            stack[j].actions->addInput(col);
1622
    }
T
Tsarkova Anastasia 已提交
1623
}
1624

T
Tsarkova Anastasia 已提交
1625 1626 1627 1628 1629 1630
/// Removes the innermost level and returns its actions.
ExpressionActionsPtr ScopeStack::popLevel()
{
    ExpressionActionsPtr top_actions = std::move(stack.back().actions);
    stack.pop_back();
    return top_actions;
}
A
Alexey Milovidov 已提交
1631

T
Tsarkova Anastasia 已提交
1632 1633 1634 1635
/// Sample block of the innermost level.
const Block & ScopeStack::getSampleBlock() const
{
    const Level & innermost = stack.back();
    return innermost.actions->getSampleBlock();
}
A
Alexey Milovidov 已提交
1636

1637
void ExpressionAnalyzer::getRootActions(const ASTPtr & ast, bool no_subqueries, bool only_consts, ExpressionActionsPtr & actions)
1638
{
1639
    ScopeStack scopes(actions, context);
1640

T
Tsarkova Anastasia 已提交
1641 1642 1643 1644 1645
    ProjectionManipulatorPtr projection_manipulator;
    if (!isThereArrayJoin(ast) && settings.enable_conditional_computation && !only_consts)
        projection_manipulator = std::make_shared<ConditionalTree>(scopes, context);
    else
        projection_manipulator = std::make_shared<DefaultProjectionManipulator>(scopes);
1646

T
Tsarkova Anastasia 已提交
1647
    getActionsImpl(ast, no_subqueries, only_consts, scopes, projection_manipulator);
1648

1649
    actions = scopes.popLevel();
1650 1651
}

1652 1653
void ExpressionAnalyzer::getArrayJoinedColumns()
{
1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671
    if (select_query && select_query->array_join_expression_list())
    {
        ASTs & array_join_asts = select_query->array_join_expression_list()->children;
        for (const auto & ast : array_join_asts)
        {
            const String nested_table_name = ast->getColumnName();
            const String nested_table_alias = ast->getAliasOrColumnName();

            if (nested_table_alias == nested_table_name && !typeid_cast<const ASTIdentifier *>(ast.get()))
                throw Exception("No alias for non-trivial value in ARRAY JOIN: " + nested_table_name, ErrorCodes::ALIAS_REQUIRED);

            if (array_join_alias_to_name.count(nested_table_alias) || aliases.count(nested_table_alias))
                throw Exception("Duplicate alias in ARRAY JOIN: " + nested_table_alias, ErrorCodes::MULTIPLE_EXPRESSIONS_FOR_ALIAS);

            array_join_alias_to_name[nested_table_alias] = nested_table_name;
            array_join_name_to_alias[nested_table_name] = nested_table_alias;
        }

1672
        getArrayJoinedColumnsImpl(query);
1673

F
f1yegor 已提交
1674 1675
        /// If the result of ARRAY JOIN is not used, it is necessary to ARRAY-JOIN any column,
        /// to get the correct number of rows.
1676 1677 1678 1679 1680 1681
        if (array_join_result_to_source.empty())
        {
            ASTPtr expr = select_query->array_join_expression_list()->children.at(0);
            String source_name = expr->getColumnName();
            String result_name = expr->getAliasOrColumnName();

F
f1yegor 已提交
1682
            /// This is an array.
1683
            if (!typeid_cast<ASTIdentifier *>(expr.get()) || findColumn(source_name, source_columns) != source_columns.end())
1684 1685 1686
            {
                array_join_result_to_source[result_name] = source_name;
            }
F
f1yegor 已提交
1687
            else /// This is a nested table.
1688 1689
            {
                bool found = false;
1690
                for (const auto & column_name_type : source_columns)
1691
                {
1692 1693
                    auto splitted = Nested::splitName(column_name_type.name);
                    if (splitted.first == source_name && !splitted.second.empty())
1694
                    {
1695
                        array_join_result_to_source[Nested::concatenateName(result_name, splitted.second)] = column_name_type.name;
1696 1697 1698 1699 1700 1701 1702 1703 1704
                        found = true;
                        break;
                    }
                }
                if (!found)
                    throw Exception("No columns in nested table " + source_name, ErrorCodes::EMPTY_NESTED_TABLE);
            }
        }
    }
1705 1706 1707
}


F
f1yegor 已提交
1708
/// Fills the array_join_result_to_source: on which columns-arrays to replicate, and how to call them after that.
1709
void ExpressionAnalyzer::getArrayJoinedColumnsImpl(const ASTPtr & ast)
1710
{
1711 1712 1713 1714 1715
    if (typeid_cast<ASTTablesInSelectQuery *>(ast.get()))
        return;

    if (ASTIdentifier * node = typeid_cast<ASTIdentifier *>(ast.get()))
    {
C
chertus 已提交
1716
        if (node->general())
1717
        {
1718
            auto splitted = Nested::splitName(node->name);  /// ParsedParams, Key1
1719 1720 1721

            if (array_join_alias_to_name.count(node->name))
            {
F
f1yegor 已提交
1722
                /// ARRAY JOIN was written with an array column. Example: SELECT K1 FROM ... ARRAY JOIN ParsedParams.Key1 AS K1
1723 1724
                array_join_result_to_source[node->name] = array_join_alias_to_name[node->name];    /// K1 -> ParsedParams.Key1
            }
1725
            else if (array_join_alias_to_name.count(splitted.first) && !splitted.second.empty())
1726
            {
F
f1yegor 已提交
1727
                /// ARRAY JOIN was written with a nested table. Example: SELECT PP.KEY1 FROM ... ARRAY JOIN ParsedParams AS PP
1728
                array_join_result_to_source[node->name]    /// PP.Key1 -> ParsedParams.Key1
1729
                    = Nested::concatenateName(array_join_alias_to_name[splitted.first], splitted.second);
1730
            }
1731
            else if (array_join_name_to_alias.count(node->name))
1732
            {
1733
                /** Example: SELECT ParsedParams.Key1 FROM ... ARRAY JOIN ParsedParams.Key1 AS PP.Key1.
F
f1yegor 已提交
1734
                  * That is, the query uses the original array, replicated by itself.
1735 1736
                  */
                array_join_result_to_source[    /// PP.Key1 -> ParsedParams.Key1
1737 1738
                    array_join_name_to_alias[node->name]] = node->name;
            }
1739
            else if (array_join_name_to_alias.count(splitted.first) && !splitted.second.empty())
1740 1741 1742 1743
            {
                /** Example: SELECT ParsedParams.Key1 FROM ... ARRAY JOIN ParsedParams AS PP.
                 */
                array_join_result_to_source[    /// PP.Key1 -> ParsedParams.Key1
1744
                Nested::concatenateName(array_join_name_to_alias[splitted.first], splitted.second)] = node->name;
1745 1746 1747 1748 1749 1750 1751 1752 1753 1754
            }
        }
    }
    else
    {
        for (auto & child : ast->children)
            if (!typeid_cast<const ASTSubquery *>(child.get())
                && !typeid_cast<const ASTSelectQuery *>(child.get()))
                getArrayJoinedColumnsImpl(child);
    }
1755 1756
}

T
Tsarkova Anastasia 已提交
1757 1758
bool ExpressionAnalyzer::isThereArrayJoin(const ASTPtr & ast)
{
T
Tsarkova Anastasia 已提交
1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782
    if (typeid_cast<ASTIdentifier *>(ast.get()))
    {
        return false;
    }
    else if (ASTFunction * node = typeid_cast<ASTFunction *>(ast.get()))
    {
        if (node->name == "arrayJoin")
        {
            return true;
        }
        if (functionIsInOrGlobalInOperator(node->name))
        {
            return isThereArrayJoin(node->arguments->children.at(0));
        }
        if (node->name == "indexHint")
        {
            return false;
        }
        if (AggregateFunctionFactory::instance().isAggregateFunctionName(node->name))
        {
            return false;
        }
        for (auto & child : node->arguments->children)
        {
T
Tsarkova Anastasia 已提交
1783 1784
            if (isThereArrayJoin(child))
            {
T
Tsarkova Anastasia 已提交
1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797
                return true;
            }
        }
        return false;
    }
    else if (typeid_cast<ASTLiteral *>(ast.get()))
    {
        return false;
    }
    else
    {
        for (auto & child : ast->children)
        {
T
Tsarkova Anastasia 已提交
1798 1799
            if (isThereArrayJoin(child))
            {
T
Tsarkova Anastasia 已提交
1800 1801 1802 1803 1804 1805
                return true;
            }
        }
        return false;
    }
}
1806

1807 1808 1809
void ExpressionAnalyzer::getActionsFromJoinKeys(const ASTTableJoin & table_join, bool no_subqueries, bool only_consts,
                                                ExpressionActionsPtr & actions)
{
1810
    ScopeStack scopes(actions, context);
1811 1812

    ProjectionManipulatorPtr projection_manipulator;
1813
    if (!isThereArrayJoin(query) && settings.enable_conditional_computation && !only_consts)
1814 1815 1816 1817 1818 1819 1820 1821
        projection_manipulator = std::make_shared<ConditionalTree>(scopes, context);
    else
        projection_manipulator = std::make_shared<DefaultProjectionManipulator>(scopes);

    if (table_join.using_expression_list)
        getActionsImpl(table_join.using_expression_list, no_subqueries, only_consts, scopes, projection_manipulator);
    else if (table_join.on_expression)
    {
1822
        for (const auto & ast : analyzed_join.key_asts_left)
1823
            getActionsImpl(ast, no_subqueries, only_consts, scopes, projection_manipulator);
1824 1825 1826 1827 1828
    }

    actions = scopes.popLevel();
}

T
Tsarkova Anastasia 已提交
1829 1830
/** Recursively walks the expression `ast` and appends to `actions_stack` the actions needed to compute it.
  * Handles identifiers, functions (with special cases for lambda, arrayJoin, IN / GLOBAL IN, indexHint
  *  and aggregate functions) and literals; for any other node it visits the children.
  *
  * no_subqueries - if true, the right-hand side of IN is not turned into a set;
  *  an `ignore` function call over the first argument is added instead (unless only_consts).
  * only_consts - if true, identifiers are not checked, and a function whose arguments are absent
  *  from the sample block is skipped instead of throwing UNKNOWN_IDENTIFIER.
  * projection_manipulator - maps column names through getColumnName() and supplies
  *  the projection source column passed to the added actions.
  *
  * Throws NOT_AN_AGGREGATE, UNEXPECTED_EXPRESSION, TYPE_MISMATCH,
  *  NUMBER_OF_ARGUMENTS_DOESNT_MATCH or UNKNOWN_IDENTIFIER.
  */
void ExpressionAnalyzer::getActionsImpl(const ASTPtr & ast, bool no_subqueries, bool only_consts, ScopeStack & actions_stack,
                                        ProjectionManipulatorPtr projection_manipulator)
1831
{
1832 1833 1834 1835 1836 1837 1838 1839 1840
    /// Column name of `ast`, computed on first use and cached. The lambda returns the cached string by value.
    String ast_column_name;
    auto getColumnName = [&ast, &ast_column_name]()
    {
        if (ast_column_name.empty())
            ast_column_name = ast->getColumnName();

        return ast_column_name;
    };

F
f1yegor 已提交
1841
    /// If the result of the calculation already exists in the block.
1842
    if ((typeid_cast<ASTFunction *>(ast.get()) || typeid_cast<ASTLiteral *>(ast.get()))
1843
        && projection_manipulator->tryToGetFromUpperProjection(getColumnName()))
1844 1845
        return;

1846
    /// Identifiers add no actions; the only possible outcome of this branch is the NOT_AN_AGGREGATE error.
    if (typeid_cast<ASTIdentifier *>(ast.get()))
1847
    {
1848
        if (!only_consts && !projection_manipulator->tryToGetFromUpperProjection(getColumnName()))
1849
        {
F
f1yegor 已提交
1850 1851
            /// The requested column is not in the block.
            /// If such a column exists in the table, then the user probably forgot to surround it with an aggregate function or add it to GROUP BY.

            bool found = false;
1854
            for (const auto & column_name_type : source_columns)
1855
                if (column_name_type.name == getColumnName())
1856 1857 1858
                    found = true;

            if (found)
1859
                throw Exception("Column " + getColumnName() + " is not under aggregate function and not in GROUP BY.",
1860 1861 1862 1863 1864
                    ErrorCodes::NOT_AN_AGGREGATE);
        }
    }
    else if (ASTFunction * node = typeid_cast<ASTFunction *>(ast.get()))
    {
1865
        /// A lambda is only valid as a function argument; those are handled in the argument loops below.
        if (node->name == "lambda")
1866 1867
            throw Exception("Unexpected lambda expression", ErrorCodes::UNEXPECTED_EXPRESSION);

F
f1yegor 已提交
1868
        /// Function arrayJoin.
1869
        if (node->name == "arrayJoin")
1870 1871 1872 1873 1874
        {
            if (node->arguments->children.size() != 1)
                throw Exception("arrayJoin requires exactly 1 argument", ErrorCodes::TYPE_MISMATCH);

            ASTPtr arg = node->arguments->children.at(0);
T
Tsarkova Anastasia 已提交
1875
            getActionsImpl(arg, no_subqueries, only_consts, actions_stack, projection_manipulator);
1876 1877
            if (!only_consts)
            {
1878
                /// Copy the argument column under the name of the arrayJoin expression, then ARRAY JOIN the copy.
                String result_name = projection_manipulator->getColumnName(getColumnName());
T
Tsarkova Anastasia 已提交
1879
                actions_stack.addAction(ExpressionAction::copyColumn(projection_manipulator->getColumnName(arg->getColumnName()), result_name));
1880 1881 1882 1883 1884 1885 1886 1887
                NameSet joined_columns;
                joined_columns.insert(result_name);
                actions_stack.addAction(ExpressionAction::arrayJoin(joined_columns, false, context));
            }

            return;
        }

1888
        if (functionIsInOrGlobalInOperator(node->name))
1889
        {
1890 1891 1892
            /// Let's find the type of the first argument (then getActionsImpl will be called again and will not affect anything).
            getActionsImpl(node->arguments->children.at(0), no_subqueries, only_consts, actions_stack, projection_manipulator);

1893
            if (!no_subqueries)
1894
            {
1895 1896
                /// Transform tuple or subquery into a set.
                makeSet(node, actions_stack.getSampleBlock());
1897
            }
1898
            else
1899
            {
1900
                if (!only_consts)
                {
                    /// We are in the part of the tree that we are not going to compute. You just need to define types.
1903 1904 1905 1906 1907 1908 1909
                    /// Do not subquery and create sets. We treat "IN" as "ignore" function.

                    /// NOTE(review): the argument name below is not passed through projection_manipulator->getColumnName(),
                    ///  unlike the arrayJoin branch above - confirm this is intended.
                    actions_stack.addAction(ExpressionAction::applyFunction(
                            FunctionFactory::instance().get("ignore", context),
                            { node->arguments->children.at(0)->getColumnName() },
                            projection_manipulator->getColumnName(getColumnName()),
                            projection_manipulator->getProjectionSourceColumn()));
1910
                }
1911 1912
                return;
            }
1913
        }
1914

1915
        /// A special function `indexHint`. Everything that is inside it is not calculated
1916
        /// (and is used only for index analysis, see KeyCondition).
1917 1918 1919
        if (node->name == "indexHint")
        {
            actions_stack.addAction(ExpressionAction::addColumn(ColumnWithTypeAndName(
T
Tsarkova Anastasia 已提交
1920
                ColumnConst::create(ColumnUInt8::create(1, 1), 1), std::make_shared<DataTypeUInt8>(),
1921
                    projection_manipulator->getColumnName(getColumnName())), projection_manipulator->getProjectionSourceColumn(), false));
1922 1923 1924 1925 1926
            return;
        }

        /// Aggregate functions add no actions here.
        if (AggregateFunctionFactory::instance().isAggregateFunctionName(node->name))
            return;
1927

1928 1929 1930 1931 1932 1933 1934
        /// Context object that we pass to function should live during query.
        const Context & function_context = context.hasQueryContext()
            ? context.getQueryContext()
            : context;

        const FunctionBuilderPtr & function_builder = FunctionFactory::instance().get(node->name, function_context);
        auto projection_action = getProjectionAction(node->name, actions_stack, projection_manipulator, getColumnName(), function_context);
1935

1936 1937 1938
        Names argument_names;
        DataTypes argument_types;
        bool arguments_present = true;
1939

1940 1941 1942
        /// If the function has an argument-lambda expression, you need to determine its type before the recursive call.
        bool has_lambda_arguments = false;

1943
        /// First pass over the arguments: lambdas get a placeholder, prepared sets become set columns,
        ///  everything else is processed recursively.
        for (size_t arg = 0; arg < node->arguments->children.size(); ++arg)
1944
        {
1945
            auto & child = node->arguments->children[arg];
1946
            auto child_column_name = child->getColumnName();
1947

1948 1949
            ASTFunction * lambda = typeid_cast<ASTFunction *>(child.get());
            if (lambda && lambda->name == "lambda")
1950
            {
1951 1952 1953
                /// If the argument is a lambda expression, just remember its approximate type.
                if (lambda->arguments->children.size() != 2)
                    throw Exception("lambda requires two arguments", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
1954

1955
                ASTFunction * lambda_args_tuple = typeid_cast<ASTFunction *>(lambda->arguments->children.at(0).get());
1956

1957 1958
                if (!lambda_args_tuple || lambda_args_tuple->name != "tuple")
                    throw Exception("First argument of lambda must be a tuple", ErrorCodes::TYPE_MISMATCH);
1959

1960 1961 1962 1963 1964
                has_lambda_arguments = true;
                argument_types.emplace_back(std::make_shared<DataTypeFunction>(DataTypes(lambda_args_tuple->arguments->children.size())));
                /// Select the name in the next cycle.
                argument_names.emplace_back();
            }
1965
            /// Second argument of IN / GLOBAL IN for which a set is registered under the argument's AST range.
            else if (prepared_sets.count(child->range) && functionIsInOrGlobalInOperator(node->name) && arg == 1)
1966 1967 1968
            {
                ColumnWithTypeAndName column;
                column.type = std::make_shared<DataTypeSet>();
1969

1970
                const SetPtr & set = prepared_sets[child->range];
1971

1972
                /// If the argument is a set given by an enumeration of values (so, the set was already built), give it a unique name,
1973
                ///  so that sets with the same literal representation do not fuse together (they can have different types).
1974 1975 1976
                if (!set->empty())
                    column.name = getUniqueName(actions_stack.getSampleBlock(), "__set");
                else
1977
                    column.name = child_column_name;
1978

T
Tsarkova Anastasia 已提交
1979 1980
                column.name = projection_manipulator->getColumnName(column.name);

1981 1982 1983
                if (!actions_stack.getSampleBlock().has(column.name))
                {
                    column.column = ColumnSet::create(1, set);
1984

1985
                    actions_stack.addAction(ExpressionAction::addColumn(column, projection_manipulator->getProjectionSourceColumn(), false));
1986
                }
1987

1988 1989 1990 1991 1992 1993
                argument_types.push_back(column.type);
                argument_names.push_back(column.name);
            }
            else
            {
                /// If the argument is not a lambda expression, call it recursively and find out its type.
T
Tsarkova Anastasia 已提交
1994 1995 1996
                projection_action->preArgumentAction();
                getActionsImpl(child, no_subqueries, only_consts, actions_stack,
                               projection_manipulator);
1997 1998
                std::string name = projection_manipulator->getColumnName(child_column_name);
                projection_action->postArgumentAction(child_column_name);
1999 2000 2001 2002
                if (actions_stack.getSampleBlock().has(name))
                {
                    argument_types.push_back(actions_stack.getSampleBlock().getByName(name).type);
                    argument_names.push_back(name);
2003 2004 2005
                }
                else
                {
2006
                    if (only_consts)
2007
                    {
2008
                        arguments_present = false;
2009 2010 2011
                    }
                    else
                    {
T
Tsarkova Anastasia 已提交
2012
                        throw Exception("Unknown identifier: " + name + ", projection layer " + projection_manipulator->getProjectionExpression() , ErrorCodes::UNKNOWN_IDENTIFIER);
2013 2014 2015
                    }
                }
            }
2016
        }
2017

2018 2019
        /// With only_consts, a function with an unavailable argument is skipped.
        if (only_consts && !arguments_present)
            return;
2020

2021 2022 2023 2024 2025 2026
        /// Second pass: the function builder fills in the lambda argument types, then every lambda body
        ///  is compiled in its own scope level and replaced by a FunctionCapture column.
        if (has_lambda_arguments && !only_consts)
        {
            function_builder->getLambdaArgumentTypes(argument_types);

            /// Call recursively for lambda expressions.
            for (size_t i = 0; i < node->arguments->children.size(); ++i)
2027
            {
2028
                ASTPtr child = node->arguments->children[i];
2029

2030 2031
                ASTFunction * lambda = typeid_cast<ASTFunction *>(child.get());
                if (lambda && lambda->name == "lambda")
2032
                {
2033 2034 2035 2036
                    const DataTypeFunction * lambda_type = typeid_cast<const DataTypeFunction *>(argument_types[i].get());
                    ASTFunction * lambda_args_tuple = typeid_cast<ASTFunction *>(lambda->arguments->children.at(0).get());
                    ASTs lambda_arg_asts = lambda_args_tuple->arguments->children;
                    NamesAndTypesList lambda_arguments;
2037

2038
                    for (size_t j = 0; j < lambda_arg_asts.size(); ++j)
2039
                    {
2040 2041 2042
                        ASTIdentifier * identifier = typeid_cast<ASTIdentifier *>(lambda_arg_asts[j].get());
                        if (!identifier)
                            throw Exception("lambda argument declarations must be identifiers", ErrorCodes::TYPE_MISMATCH);
2043

2044
                        String arg_name = identifier->name;
2045

2046 2047
                        lambda_arguments.emplace_back(arg_name, lambda_type->getArgumentTypes()[j]);
                    }
2048

T
Tsarkova Anastasia 已提交
2049
                    projection_action->preArgumentAction();
2050
                    actions_stack.pushLevel(lambda_arguments);
T
Tsarkova Anastasia 已提交
2051 2052
                    getActionsImpl(lambda->arguments->children.at(1), no_subqueries, only_consts, actions_stack,
                                   projection_manipulator);
2053
                    ExpressionActionsPtr lambda_actions = actions_stack.popLevel();
2054

T
Tsarkova Anastasia 已提交
2055
                    String result_name = projection_manipulator->getColumnName(lambda->arguments->children.at(1)->getColumnName());
2056 2057
                    lambda_actions->finalize(Names(1, result_name));
                    DataTypePtr result_type = lambda_actions->getSampleBlock().getByName(result_name).type;
2058

2059 2060
                    /// Captured columns are those required by the lambda body that are not its own arguments.
                    Names captured;
                    Names required = lambda_actions->getRequiredColumns();
2061 2062 2063
                    for (const auto & required_arg : required)
                        if (findColumn(required_arg, lambda_arguments) == lambda_arguments.end())
                            captured.push_back(required_arg);
2064

2065 2066 2067
                    /// We can not name `getColumnName()`,
                    ///  because it does not uniquely define the expression (the types of arguments can be different).
                    String lambda_name = getUniqueName(actions_stack.getSampleBlock(), "__lambda");
2068

2069 2070
                    auto function_capture = std::make_shared<FunctionCapture>(
                            lambda_actions, captured, lambda_arguments, result_type, result_name);
T
Tsarkova Anastasia 已提交
2071
                    actions_stack.addAction(ExpressionAction::applyFunction(function_capture, captured, lambda_name,
2072
                                            projection_manipulator->getProjectionSourceColumn()));
2073

2074 2075
                    argument_types[i] = std::make_shared<DataTypeFunction>(lambda_type->getArgumentTypes(), result_type);
                    argument_names[i] = lambda_name;
T
Tsarkova Anastasia 已提交
2076
                    projection_action->postArgumentAction(lambda_name);
2077 2078
                }
            }
2079
        }
2080

2081 2082
        /// With only_consts, re-check that every argument column is really present in the sample block.
        if (only_consts)
        {
2083
            for (const auto & argument_name : argument_names)
2084
            {
2085
                if (!actions_stack.getSampleBlock().has(argument_name))
2086
                {
2087 2088
                    arguments_present = false;
                    break;
2089 2090 2091
                }
            }
        }
2092 2093

        /// Finally add the function call itself, if the projection action says it has to be calculated.
        if (arguments_present)
T
Tsarkova Anastasia 已提交
2094 2095 2096 2097 2098 2099 2100
        {
            projection_action->preCalculation();
            if (projection_action->isCalculationRequired())
            {
                actions_stack.addAction(
                    ExpressionAction::applyFunction(function_builder,
                                                    argument_names,
2101
                                                    projection_manipulator->getColumnName(getColumnName()),
2102
                                                    projection_manipulator->getProjectionSourceColumn()));
T
Tsarkova Anastasia 已提交
2103 2104
            }
        }
2105
    }
2106
    /// A literal becomes a constant column of one row, named after the literal.
    else if (ASTLiteral * literal = typeid_cast<ASTLiteral *>(ast.get()))
2107
    {
2108
        DataTypePtr type = applyVisitor(FieldToDataType(), literal->value);
2109 2110

        ColumnWithTypeAndName column;
2111
        column.column = type->createColumnConst(1, convertFieldToType(literal->value, *type));
2112
        column.type = type;
2113
        column.name = getColumnName();
2114

2115
        actions_stack.addAction(ExpressionAction::addColumn(column, "", false));
2116
        projection_manipulator->tryToGetFromUpperProjection(column.name);
2117 2118 2119 2120
    }
    else
    {
        for (auto & child : ast->children)
A
Alexey Milovidov 已提交
2121 2122 2123 2124
        {
            /// Do not go to FROM, JOIN, UNION.
            if (!typeid_cast<const ASTTableExpression *>(child.get())
                && !typeid_cast<const ASTSelectQuery *>(child.get()))
T
Tsarkova Anastasia 已提交
2125
                getActionsImpl(child, no_subqueries, only_consts, actions_stack, projection_manipulator);
A
Alexey Milovidov 已提交
2126
        }
2127
    }
2128 2129 2130
}


2131
void ExpressionAnalyzer::getAggregates(const ASTPtr & ast, ExpressionActionsPtr & actions)
2132
{
F
f1yegor 已提交
2133
    /// There can not be aggregate functions inside the WHERE and PREWHERE.
2134 2135 2136 2137 2138 2139
    if (select_query && (ast.get() == select_query->where_expression.get() || ast.get() == select_query->prewhere_expression.get()))
    {
        assertNoAggregates(ast, "in WHERE or PREWHERE");
        return;
    }

F
f1yegor 已提交
2140
    /// If we are not analyzing a SELECT query, but a separate expression, then there can not be aggregate functions in it.
2141 2142 2143 2144 2145 2146 2147
    if (!select_query)
    {
        assertNoAggregates(ast, "in wrong place");
        return;
    }

    const ASTFunction * node = typeid_cast<const ASTFunction *>(ast.get());
2148
    if (node && AggregateFunctionFactory::instance().isAggregateFunctionName(node->name))
2149 2150 2151 2152 2153
    {
        has_aggregation = true;
        AggregateDescription aggregate;
        aggregate.column_name = node->getColumnName();

F
f1yegor 已提交
2154
        /// Make unique aggregate functions.
2155 2156 2157 2158 2159 2160 2161 2162 2163 2164
        for (size_t i = 0; i < aggregate_descriptions.size(); ++i)
            if (aggregate_descriptions[i].column_name == aggregate.column_name)
                return;

        const ASTs & arguments = node->arguments->children;
        aggregate.argument_names.resize(arguments.size());
        DataTypes types(arguments.size());

        for (size_t i = 0; i < arguments.size(); ++i)
        {
F
f1yegor 已提交
2165
            /// There can not be other aggregate functions within the aggregate functions.
2166 2167 2168 2169 2170 2171 2172 2173
            assertNoAggregates(arguments[i], "inside another aggregate function");

            getRootActions(arguments[i], true, false, actions);
            const std::string & name = arguments[i]->getColumnName();
            types[i] = actions->getSampleBlock().getByName(name).type;
            aggregate.argument_names[i] = name;
        }

2174 2175
        aggregate.parameters = (node->parameters) ? getAggregateFunctionParametersArray(node->parameters) : Array();
        aggregate.function = AggregateFunctionFactory::instance().get(node->name, types, aggregate.parameters);
2176 2177 2178 2179 2180 2181 2182 2183 2184 2185

        aggregate_descriptions.push_back(aggregate);
    }
    else
    {
        for (const auto & child : ast->children)
            if (!typeid_cast<const ASTSubquery *>(child.get())
                && !typeid_cast<const ASTSelectQuery *>(child.get()))
                getAggregates(child, actions);
    }
2186 2187
}

2188 2189 2190

void ExpressionAnalyzer::assertNoAggregates(const ASTPtr & ast, const char * description)
{
2191
    const ASTFunction * node = typeid_cast<const ASTFunction *>(ast.get());
2192

2193
    if (node && AggregateFunctionFactory::instance().isAggregateFunctionName(node->name))
2194 2195
        throw Exception("Aggregate function " + node->getColumnName()
            + " is found " + String(description) + " in query", ErrorCodes::ILLEGAL_AGGREGATION);
2196

2197 2198 2199 2200
    for (const auto & child : ast->children)
        if (!typeid_cast<const ASTSubquery *>(child.get())
            && !typeid_cast<const ASTSelectQuery *>(child.get()))
            assertNoAggregates(child, description);
2201 2202 2203
}


2204
void ExpressionAnalyzer::assertSelect() const
2205
{
2206 2207
    if (!select_query)
        throw Exception("Not a select query", ErrorCodes::LOGICAL_ERROR);
2208
}
2209

2210
void ExpressionAnalyzer::assertAggregation() const
2211
{
2212 2213
    if (!has_aggregation)
        throw Exception("No aggregation", ErrorCodes::LOGICAL_ERROR);
2214
}
2215

2216
/// Makes sure the chain has at least one step; an empty chain gets a first step
/// whose input is `columns`. A non-empty chain is left untouched.
void ExpressionAnalyzer::initChain(ExpressionActionsChain & chain, const NamesAndTypesList & columns) const
{
    if (!chain.steps.empty())
        return;

    chain.steps.emplace_back(std::make_shared<ExpressionActions>(columns, context));
}
2223

2224
/// "Big" ARRAY JOIN.
2225
/// Adds to `actions` the column copies needed to give ARRAY JOIN results their names,
/// followed by a single ARRAY JOIN action over all result columns.
void ExpressionAnalyzer::addMultipleArrayJoinAction(ExpressionActionsPtr & actions) const
{
    NameSet joined_columns;

    for (const auto & mapping : array_join_result_to_source)
    {
        const auto & result_name = mapping.first;
        const auto & source_name = mapping.second;

        /// The result is exposed under a different name: copy the source column to it first.
        if (result_name != source_name)
            actions->add(ExpressionAction::copyColumn(source_name, result_name));

        /// ARRAY JOIN (replacing arrays with their elements) is performed on the result names.
        joined_columns.insert(result_name);
    }

    actions->add(ExpressionAction::arrayJoin(joined_columns, select_query->array_join_is_left(), context));
}

2241
/// Appends ARRAY JOIN processing to the last step of the chain.
/// Returns false (and does nothing) if the query has no ARRAY JOIN section.
bool ExpressionAnalyzer::appendArrayJoin(ExpressionActionsChain & chain, bool only_types)
{
    assertSelect();

    const bool has_array_join = static_cast<bool>(select_query->array_join_expression_list());
    if (!has_array_join)
        return false;

    initChain(chain, source_columns);
    ExpressionActionsChain::Step & last_step = chain.steps.back();

    /// First compute the expressions listed in ARRAY JOIN, then add the ARRAY JOIN action itself.
    getRootActions(select_query->array_join_expression_list(), only_types, false, last_step.actions);
    addMultipleArrayJoinAction(last_step.actions);

    return true;
}

2258
/// Adds JOIN actions to `actions`.
/// With `only_types` a single action without a Join object (nullptr) is added;
/// otherwise one action is added for every prepared subquery that has a Join object.
void ExpressionAnalyzer::addJoinAction(ExpressionActionsPtr & actions, bool only_types) const
{
    if (only_types)
    {
        actions->add(ExpressionAction::ordinaryJoin(nullptr, analyzed_join.key_names_left,
                                                    analyzed_join.getColumnsAddedByJoin()));
        return;
    }

    for (auto & name_and_subquery : subqueries_for_sets)
    {
        if (!name_and_subquery.second.join)
            continue;

        actions->add(ExpressionAction::ordinaryJoin(name_and_subquery.second.join, analyzed_join.key_names_left,
                                                    analyzed_join.getColumnsAddedByJoin()));
    }
}

2270

2271
void ExpressionAnalyzer::AnalyzedJoin::createJoinedBlockActions(const ASTSelectQuery * select_query_with_join,
N
Nikolai Kochetov 已提交
2272
                                                                const Context & context)
2273
{
2274
    if (!select_query_with_join)
N
Nikolai Kochetov 已提交
2275 2276
        return;

2277
    const ASTTablesInSelectQueryElement * join = select_query_with_join->join();
N
Nikolai Kochetov 已提交
2278 2279 2280 2281 2282

    if (!join)
        return;

    const auto & join_params = static_cast<const ASTTableJoin &>(*join->table_join);
2283 2284 2285 2286 2287 2288 2289 2290 2291

    /// Create custom expression list with join keys from right table.
    auto expression_list = std::make_shared<ASTExpressionList>();
    ASTs & children = expression_list->children;

    if (join_params.on_expression)
        for (const auto & join_right_key : key_asts_right)
            children.emplace_back(join_right_key);

N
Nikolai Kochetov 已提交
2292 2293 2294 2295 2296 2297 2298 2299
    NameSet required_columns_set(key_names_right.begin(), key_names_right.end());
    for (const auto & joined_column : columns_added_by_join)
        required_columns_set.insert(joined_column.original_name);

    required_columns_set.insert(key_names_right.begin(), key_names_right.end());

    required_columns_from_joined_table.insert(required_columns_from_joined_table.end(),
                                              required_columns_set.begin(), required_columns_set.end());
2300

A
Alexey Milovidov 已提交
2301
    const auto & source_columns_name = getColumnsFromJoinedTable(context, select_query_with_join);
2302
    ExpressionAnalyzer analyzer(expression_list, context, nullptr, source_columns_name, required_columns_from_joined_table);
2303
    joined_block_actions = analyzer.getActions(false);
N
Nikolai Kochetov 已提交
2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317

    for (const auto & column_required_from_actions : joined_block_actions->getRequiredColumns())
        if (!required_columns_set.count(column_required_from_actions))
            required_columns_from_joined_table.push_back(column_required_from_actions);
}


/// Returns the name and type of every column in `columns_added_by_join`, in the same order.
NamesAndTypesList ExpressionAnalyzer::AnalyzedJoin::getColumnsAddedByJoin() const
{
    NamesAndTypesList names_and_types;

    for (const auto & added_column : columns_added_by_join)
    {
        names_and_types.push_back(added_column.name_and_type);
    }

    return names_and_types;
}

A
Alexey Milovidov 已提交
2320
/// Returns the columns of the joined table.
/// The list is computed on first use (when a query is given and the cached list is empty)
/// and stored in `columns_from_joined_table`.
NamesAndTypesList ExpressionAnalyzer::AnalyzedJoin::getColumnsFromJoinedTable(const Context & context, const ASTSelectQuery * select_query_with_join)
{
    /// Nothing to compute without a query, or when the list has already been filled.
    if (!select_query_with_join || columns_from_joined_table.size())
        return columns_from_joined_table;

    const ASTTablesInSelectQueryElement * join_element = select_query_with_join->join();
    if (join_element)
    {
        const auto & joined_table = static_cast<const ASTTableExpression &>(*join_element->table_expression);
        columns_from_joined_table = getNamesAndTypeListFromTableExpression(joined_table, context);
    }

    return columns_from_joined_table;
}

N
Nikolai Kochetov 已提交
2335

2336 2337
/// Appends JOIN processing to the last step of the chain: computes the join keys,
/// prepares the Join object (reusing the one from a StorageJoin table if the right side is such a table)
/// and adds the JOIN action.
/// Returns false (and does nothing) if the query has no JOIN.
bool ExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, bool only_types)
{
    assertSelect();

    if (!select_query->join())
        return false;

    initChain(chain, source_columns);
    ExpressionActionsChain::Step & join_step = chain.steps.back();

    const auto & tables_element = static_cast<const ASTTablesInSelectQueryElement &>(*select_query->join());
    auto & join_params = static_cast<ASTTableJoin &>(*tables_element.table_join);

    /// If strictness is not written in the query, take it from the join_default_strictness setting.
    /// This is skipped for CROSS JOIN.
    const bool strictness_unspecified = join_params.strictness == ASTTableJoin::Strictness::Unspecified;
    if (strictness_unspecified && join_params.kind != ASTTableJoin::Kind::Cross)
    {
        const auto default_strictness = settings.join_default_strictness.toString();

        if (default_strictness == "ANY")
            join_params.strictness = ASTTableJoin::Strictness::Any;
        else if (default_strictness == "ALL")
            join_params.strictness = ASTTableJoin::Strictness::All;
        else
            throw Exception("Expected ANY or ALL in JOIN section, because setting (join_default_strictness) is empty", DB::ErrorCodes::EXPECTED_ALL_OR_ANY);
    }

    const auto & right_table = static_cast<const ASTTableExpression &>(*tables_element.table_expression);

    getActionsFromJoinKeys(join_params, only_types, false, join_step.actions);

    /// Two JOINs are not supported with the same subquery, but different USINGs.
    const auto tree_hash = tables_element.getTreeHash();
    const auto subquery_key = toString(tree_hash.first) + "_" + toString(tree_hash.second);

    SubqueryForSet & prepared = subqueries_for_sets[subquery_key];

    /// Special case - if table name is specified on the right of JOIN, then the table has the type Join (the previously prepared mapping).
    /// TODO This syntax does not support specifying a database name.
    if (right_table.database_and_table_name)
    {
        const auto & table_identifier = static_cast<const ASTIdentifier &>(*right_table.database_and_table_name);
        auto database_and_table = getDatabaseAndTableNameFromIdentifier(table_identifier);
        StoragePtr existing_table = context.tryGetTable(database_and_table.first, database_and_table.second);

        /// dynamic_cast of a null pointer yields null, so a missing table is handled by the same check.
        if (StorageJoin * storage_join = dynamic_cast<StorageJoin *>(existing_table.get()))
        {
            storage_join->assertCompatible(join_params.kind, join_params.strictness);
            /// TODO Check the set of keys.

            prepared.join = storage_join->getJoin();
        }
    }

    if (!prepared.join)
    {
        JoinPtr new_join = std::make_shared<Join>(
            analyzed_join.key_names_left, analyzed_join.key_names_right, analyzed_join.columns_added_by_join_from_right_keys,
            settings.join_use_nulls, SizeLimits(settings.max_rows_in_join, settings.max_bytes_in_join, settings.join_overflow_mode),
            join_params.kind, join_params.strictness);

        /** For GLOBAL JOINs (in the case, for example, of the push method for executing GLOBAL subqueries), the following occurs
          * - in the addExternalStorage function, the JOIN (SELECT ...) subquery is replaced with JOIN _data1,
          *   in the subquery_for_set object this subquery is exposed as source and the temporary table _data1 as the `table`.
          * - this function shows the expression JOIN _data1.
          */
        if (!prepared.source)
        {
            /// Pick whichever form the right-hand side of JOIN was written in.
            ASTPtr table_ast;

            if (right_table.subquery)
                table_ast = right_table.subquery;
            else if (right_table.table_function)
                table_ast = right_table.table_function;
            else if (right_table.database_and_table_name)
                table_ast = right_table.database_and_table_name;

            auto interpreter = interpretSubquery(table_ast, context, subquery_depth, analyzed_join.required_columns_from_joined_table);

            /// The subquery is executed lazily, only when the stream is actually read.
            prepared.source = std::make_shared<LazyBlockInputStream>(
                interpreter->getSampleBlock(),
                [interpreter]() mutable { return interpreter->execute().in; });
        }

        /// Alias duplicating columns.
        for (const auto & added_column : analyzed_join.columns_added_by_join)
        {
            const auto & result_name = added_column.name_and_type.name;
            if (added_column.original_name != result_name)
                prepared.joined_block_aliases.emplace_back(added_column.original_name, result_name);
        }

        /// Build the header of the joined block: run the joined block actions on the source header,
        /// then rename the aliased columns (each renamed column is moved to the end of the block).
        auto joined_header = prepared.source->getHeader();
        analyzed_join.joined_block_actions->execute(joined_header);

        for (const auto & original_and_alias : prepared.joined_block_aliases)
        {
            if (!joined_header.has(original_and_alias.first))
                continue;

            auto position = joined_header.getPositionByName(original_and_alias.first);
            auto renamed_column = joined_header.getByPosition(position);
            joined_header.erase(position);
            renamed_column.name = original_and_alias.second;
            joined_header.insert(std::move(renamed_column));
        }

        /// TODO You do not need to set this up when JOIN is only needed on remote servers.
        prepared.join = new_join;
        prepared.join->setSampleBlock(joined_header);
        prepared.joined_block_actions = analyzed_join.joined_block_actions;
    }

    /// NOTE(review): `false` is passed here rather than `only_types` - confirm this is intended.
    addJoinAction(join_step.actions, false);

    return true;
}

2453
/// Appends PREWHERE processing to the chain: the PREWHERE expression is computed in the current last step,
/// and a new step is added after it whose input is the PREWHERE output plus the unused source columns.
/// Returns false (and does nothing) if the query has no PREWHERE.
bool ExpressionAnalyzer::appendPrewhere(ExpressionActionsChain & chain, bool only_types, const ASTPtr & sampling_expression)
{
    assertSelect();

    if (!select_query->prewhere_expression)
        return false;

    Names sampling_columns;
    if (sampling_expression)
        sampling_columns = ExpressionAnalyzer(sampling_expression, context, storage).getRequiredSourceColumns();

    initChain(chain, source_columns);

    /// `prewhere_step` refers to an element of chain.steps; it is not used after a new step is appended below.
    auto & prewhere_step = chain.getLastStep();
    getRootActions(select_query->prewhere_expression, only_types, false, prewhere_step.actions);

    String prewhere_column_name = select_query->prewhere_expression->getColumnName();
    prewhere_step.required_output.push_back(prewhere_column_name);
    prewhere_step.can_remove_required_output.push_back(true);

    {
        /// Remove unused source_columns from prewhere actions.
        /// To find out which source columns PREWHERE really needs, build the same expression separately and finalize it.
        ExpressionActionsPtr probe_actions = std::make_shared<ExpressionActions>(source_columns, context);
        getRootActions(select_query->prewhere_expression, only_types, false, probe_actions);
        probe_actions->finalize({prewhere_column_name});

        auto probe_required = probe_actions->getRequiredColumns();
        NameSet columns_used_by_prewhere(probe_required.begin(), probe_required.end());

        /// Add required columns for sample expression to required output in order not to remove them after
        /// prewhere execution because sampling is executed after prewhere.
        /// TODO: add sampling execution to common chain.
        for (const auto & sampling_column : sampling_columns)
        {
            if (!columns_used_by_prewhere.count(sampling_column))
                continue;

            prewhere_step.required_output.push_back(sampling_column);
            prewhere_step.can_remove_required_output.push_back(true);
        }

        /// Keep everything present in the step's sample block except source columns PREWHERE does not use.
        auto sample_names = prewhere_step.actions->getSampleBlock().getNames();
        NameSet kept_names(sample_names.begin(), sample_names.end());

        for (const auto & source_column : source_columns)
        {
            if (!columns_used_by_prewhere.count(source_column.name))
                kept_names.erase(source_column.name);
        }

        Names step_output(kept_names.begin(), kept_names.end());
        prewhere_step.actions->finalize(step_output);
    }

    {
        /// Add empty action with input = {prewhere actions output} + {unused source columns}
        /// Reasons:
        /// 1. Remove source columns which are used only in prewhere actions during prewhere actions execution.
        ///    Example: select A prewhere B > 0. B can be removed at prewhere step.
        /// 2. Store side columns which were calculated during prewhere actions execution if they are used.
        ///    Example: select F(A) prewhere F(A) > 0. F(A) can be saved from prewhere step.
        /// 3. Check if we can remove filter column at prewhere step. If we can, action will store single REMOVE_COLUMN.
        ColumnsWithTypeAndName next_step_input = prewhere_step.actions->getSampleBlock().getColumnsWithTypeAndName();

        auto prewhere_required = prewhere_step.actions->getRequiredColumns();
        NameSet prewhere_inputs(prewhere_required.begin(), prewhere_required.end());

        NameSet columns_not_read_by_prewhere;
        for (const auto & source_column : source_columns)
        {
            if (prewhere_inputs.count(source_column.name))
                continue;

            next_step_input.emplace_back(source_column.type, source_column.name);
            columns_not_read_by_prewhere.emplace(source_column.name);
        }

        chain.steps.emplace_back(std::make_shared<ExpressionActions>(std::move(next_step_input), context));
        chain.steps.back().additional_input = std::move(columns_not_read_by_prewhere);
    }

    return true;
}
2530

2531
/// Appends computation of the WHERE expression to the last step of the chain.
/// Returns false (and does nothing) if the query has no WHERE.
bool ExpressionAnalyzer::appendWhere(ExpressionActionsChain & chain, bool only_types)
{
    assertSelect();

    const auto & where_expression = select_query->where_expression;
    if (!where_expression)
        return false;

    initChain(chain, source_columns);
    ExpressionActionsChain::Step & where_step = chain.steps.back();

    where_step.required_output.push_back(where_expression->getColumnName());
    /// NOTE(review): this replaces the whole vector, whereas appendPrewhere appends to it - confirm this is intended.
    where_step.can_remove_required_output = {true};

    getRootActions(where_expression, only_types, false, where_step.actions);

    return true;
}

2549
/// Appends computation of the GROUP BY keys to the last step of the chain and marks them as required output.
/// Returns false (and does nothing) if the query has no GROUP BY.
bool ExpressionAnalyzer::appendGroupBy(ExpressionActionsChain & chain, bool only_types)
{
    assertAggregation();

    if (!select_query->group_expression_list)
        return false;

    initChain(chain, source_columns);
    ExpressionActionsChain::Step & group_by_step = chain.steps.back();

    /// Iterate over a copy of the key list, as the original does.
    const ASTs group_keys = select_query->group_expression_list->children;
    for (const auto & key : group_keys)
    {
        group_by_step.required_output.push_back(key->getColumnName());
        getRootActions(key, only_types, false, group_by_step.actions);
    }

    return true;
}

2569
/// Marks the arguments of all aggregate functions as required output of the last step
/// and adds the actions that compute them (for SELECT, HAVING and ORDER BY).
void ExpressionAnalyzer::appendAggregateFunctionsArguments(ExpressionActionsChain & chain, bool only_types)
{
    assertAggregation();

    initChain(chain, source_columns);
    ExpressionActionsChain::Step & arguments_step = chain.steps.back();

    for (const auto & aggregate : aggregate_descriptions)
        for (const auto & argument_name : aggregate.argument_names)
            arguments_step.required_output.push_back(argument_name);

    getActionsBeforeAggregation(select_query->select_expression_list, arguments_step.actions, only_types);

    const auto & having = select_query->having_expression;
    if (having)
        getActionsBeforeAggregation(having, arguments_step.actions, only_types);

    const auto & order_by = select_query->order_expression_list;
    if (order_by)
        getActionsBeforeAggregation(order_by, arguments_step.actions, only_types);
}

2593
/// Appends computation of the HAVING expression to the last step of the chain.
/// Returns false (and does nothing) if the query has no HAVING.
bool ExpressionAnalyzer::appendHaving(ExpressionActionsChain & chain, bool only_types)
{
    assertAggregation();

    const auto & having_expression = select_query->having_expression;
    if (!having_expression)
        return false;

    initChain(chain, aggregated_columns);
    ExpressionActionsChain::Step & having_step = chain.steps.back();

    having_step.required_output.push_back(having_expression->getColumnName());
    getRootActions(having_expression, only_types, false, having_step.actions);

    return true;
}

2609
/// Appends computation of the SELECT expressions to the last step of the chain
/// and marks each of them as required output.
void ExpressionAnalyzer::appendSelect(ExpressionActionsChain & chain, bool only_types)
{
    assertSelect();

    initChain(chain, aggregated_columns);
    ExpressionActionsChain::Step & select_step = chain.steps.back();

    const auto & select_list = select_query->select_expression_list;
    getRootActions(select_list, only_types, false, select_step.actions);

    for (const auto & expression : select_list->children)
    {
        select_step.required_output.push_back(expression->getColumnName());
    }
}
2621

2622
/// Appends computation of the ORDER BY expressions to the last step of the chain
/// and marks the sort expressions as required output.
/// Returns false (and does nothing) if the query has no ORDER BY.
/// Throws UNKNOWN_TYPE_OF_AST_NODE if an element of the list is not a well-formed ASTOrderByElement.
bool ExpressionAnalyzer::appendOrderBy(ExpressionActionsChain & chain, bool only_types)
{
    assertSelect();

    if (!select_query->order_expression_list)
        return false;

    initChain(chain, aggregated_columns);
    ExpressionActionsChain::Step & order_by_step = chain.steps.back();

    getRootActions(select_query->order_expression_list, only_types, false, order_by_step.actions);

    const ASTs order_elements = select_query->order_expression_list->children;
    for (const auto & element_ast : order_elements)
    {
        ASTOrderByElement * element = typeid_cast<ASTOrderByElement *>(element_ast.get());
        if (!element || element->children.empty())
            throw Exception("Bad order expression AST", ErrorCodes::UNKNOWN_TYPE_OF_AST_NODE);

        /// The expression to sort by is the first child of the ORDER BY element.
        ASTPtr sort_expression = element->children.at(0);
        order_by_step.required_output.push_back(sort_expression->getColumnName());
    }

    return true;
}

2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664
/// Appends computation of the LIMIT BY expressions to the last step of the chain
/// and marks each of them as required output.
/// Returns false (and does nothing) if the query has no LIMIT BY.
bool ExpressionAnalyzer::appendLimitBy(ExpressionActionsChain & chain, bool only_types)
{
    assertSelect();

    const auto & limit_by_list = select_query->limit_by_expression_list;
    if (!limit_by_list)
        return false;

    initChain(chain, aggregated_columns);
    ExpressionActionsChain::Step & limit_by_step = chain.steps.back();

    getRootActions(limit_by_list, only_types, false, limit_by_step.actions);

    for (const auto & expression : limit_by_list->children)
    {
        limit_by_step.required_output.push_back(expression->getColumnName());
    }

    return true;
}

A
Alexey Milovidov 已提交
2665
/// Adds the final projection to the last step of the chain: SELECT expressions are renamed
/// to their aliases. If `required_result_columns` is not empty, only the listed columns are kept.
void ExpressionAnalyzer::appendProjectResult(ExpressionActionsChain & chain) const
{
    assertSelect();

    initChain(chain, aggregated_columns);
    ExpressionActionsChain::Step & project_step = chain.steps.back();

    /// An empty list of required result columns means that all of them are needed.
    const auto is_required = [this](const String & result_name)
    {
        if (required_result_columns.empty())
            return true;

        return std::find(required_result_columns.begin(), required_result_columns.end(), result_name)
            != required_result_columns.end();
    };

    NamesWithAliases projection;

    const ASTs select_expressions = select_query->select_expression_list->children;
    for (const auto & expression : select_expressions)
    {
        String result_name = expression->getAliasOrColumnName();
        if (!is_required(result_name))
            continue;

        projection.emplace_back(expression->getColumnName(), result_name);
        project_step.required_output.push_back(projection.back().second);
    }

    project_step.actions->add(ExpressionAction::project(projection));
}


2690
/// Appends computation of an arbitrary expression to the last step of the chain
/// and marks its result column as required output.
void ExpressionAnalyzer::appendExpression(ExpressionActionsChain & chain, const ASTPtr & expr, bool only_types)
{
    initChain(chain, source_columns);

    ExpressionActionsChain::Step & last_step = chain.steps.back();

    getRootActions(expr, only_types, false, last_step.actions);
    last_step.required_output.push_back(expr->getColumnName());
}


2699
void ExpressionAnalyzer::getActionsBeforeAggregation(const ASTPtr & ast, ExpressionActionsPtr & actions, bool no_subqueries)
2700
{
2701
    ASTFunction * node = typeid_cast<ASTFunction *>(ast.get());
2702

2703
    if (node && AggregateFunctionFactory::instance().isAggregateFunctionName(node->name))
2704 2705 2706 2707 2708
        for (auto & argument : node->arguments->children)
            getRootActions(argument, no_subqueries, false, actions);
    else
        for (auto & child : ast->children)
            getActionsBeforeAggregation(child, actions, no_subqueries);
2709 2710 2711
}


2712
/// Builds the actions computing the analyzed expression (or every element of an expression list).
/// - add_aliases: results are exposed under their aliases;
/// - project_result (only with add_aliases): the block is projected onto the results,
///   otherwise the source columns are kept in the result as well.
ExpressionActionsPtr ExpressionAnalyzer::getActions(bool add_aliases, bool project_result)
{
    ExpressionActionsPtr actions = std::make_shared<ExpressionActions>(source_columns, context);

    /// A single expression is handled as a list of one element.
    ASTs expressions;
    if (auto expression_list = typeid_cast<const ASTExpressionList *>(query.get()))
        expressions = expression_list->children;
    else
        expressions = ASTs(1, query);

    NamesWithAliases names_with_aliases;
    Names output_names;

    for (const auto & expression : expressions)
    {
        std::string column_name = expression->getColumnName();
        std::string result_name = add_aliases ? expression->getAliasOrColumnName() : column_name;

        names_with_aliases.emplace_back(column_name, result_name);
        output_names.push_back(result_name);
        getRootActions(expression, false, false, actions);
    }

    const bool projecting = add_aliases && project_result;

    if (projecting)
        actions->add(ExpressionAction::project(names_with_aliases));
    else if (add_aliases)
        actions->add(ExpressionAction::addAliases(names_with_aliases));

    if (!projecting)
    {
        /// We will not delete the original columns.
        for (const auto & source_column : source_columns)
            output_names.push_back(source_column.name);
    }

    actions->finalize(output_names);

    return actions;
}


/// Builds actions for the analyzed query starting from an empty list of input columns,
/// calling getRootActions with both flags (no_subqueries, only_consts) set.
ExpressionActionsPtr ExpressionAnalyzer::getConstActions()
{
    const NamesAndTypesList no_input_columns;
    ExpressionActionsPtr const_actions = std::make_shared<ExpressionActions>(no_input_columns, context);

    getRootActions(query, true, true, const_actions);

    return const_actions;
}

2768
/// Outputs the aggregation keys and aggregate functions.
/// Key names are appended to `key_names`; `aggregates` is overwritten.
void ExpressionAnalyzer::getAggregateInfo(Names & key_names, AggregateDescriptions & aggregates) const
{
    for (const auto & key : aggregation_keys)
    {
        key_names.emplace_back(key.name);
    }

    aggregates = aggregate_descriptions;
}

2776
void ExpressionAnalyzer::collectUsedColumns()
2777
{
F
f1yegor 已提交
2778 2779 2780
    /** Calculate which columns are required to execute the expression.
      * Then, delete all other columns from the list of available columns.
      * After execution, columns will only contain the list of columns needed to read from the table.
2781 2782 2783 2784 2785
      */

    NameSet required;
    NameSet ignored;

2786
    NameSet available_columns;
2787
    for (const auto & column : source_columns)
2788 2789
        available_columns.insert(column.name);

2790 2791 2792 2793 2794
    if (select_query && select_query->array_join_expression_list())
    {
        ASTs & expressions = select_query->array_join_expression_list()->children;
        for (size_t i = 0; i < expressions.size(); ++i)
        {
F
f1yegor 已提交
2795 2796
            /// Ignore the top-level identifiers from the ARRAY JOIN section.
            /// Then add them separately.
2797 2798 2799 2800 2801 2802
            if (typeid_cast<ASTIdentifier *>(expressions[i].get()))
            {
                ignored.insert(expressions[i]->getColumnName());
            }
            else
            {
F
f1yegor 已提交
2803
                /// Nothing needs to be ignored for expressions in ARRAY JOIN.
2804
                NameSet empty;
2805
                getRequiredSourceColumnsImpl(expressions[i], available_columns, required, empty, empty, empty);
2806 2807 2808 2809 2810 2811
            }

            ignored.insert(expressions[i]->getAliasOrColumnName());
        }
    }

F
f1yegor 已提交
2812 2813
    /** You also need to ignore the identifiers of the columns that are obtained by JOIN.
      * (Do not assume that they are required for reading from the "left" table).
2814 2815
      */
    NameSet available_joined_columns;
2816
    collectJoinedColumns(available_joined_columns);
2817 2818

    NameSet required_joined_columns;
2819

2820
    for (const auto & left_key_ast : analyzed_join.key_asts_left)
2821 2822
        getRequiredSourceColumnsImpl(left_key_ast, available_columns, required, ignored, {}, required_joined_columns);

2823
    getRequiredSourceColumnsImpl(query, available_columns, required, ignored, available_joined_columns, required_joined_columns);
2824

2825
    for (auto it = analyzed_join.columns_added_by_join.begin(); it != analyzed_join.columns_added_by_join.end();)
2826
    {
N
Nikolai Kochetov 已提交
2827
        if (required_joined_columns.count(it->name_and_type.name))
2828 2829
            ++it;
        else
2830
            analyzed_join.columns_added_by_join.erase(it++);
2831
    }
2832

N
Nikolai Kochetov 已提交
2833 2834
    analyzed_join.createJoinedBlockActions(select_query, context);

2835 2836
    /// Some columns from right join key may be used in query. This columns will be appended to block during join.
    for (const auto & right_key_name : analyzed_join.key_names_right)
N
Nikolai Kochetov 已提交
2837
        if (required_joined_columns.count(right_key_name))
2838
            analyzed_join.columns_added_by_join_from_right_keys.insert(right_key_name);
N
Nikolai Kochetov 已提交
2839

F
f1yegor 已提交
2840
    /// Insert the columns required for the ARRAY JOIN calculation into the required columns list.
2841 2842 2843 2844
    NameSet array_join_sources;
    for (const auto & result_source : array_join_result_to_source)
        array_join_sources.insert(result_source.second);

2845
    for (const auto & column_name_type : source_columns)
2846 2847 2848
        if (array_join_sources.count(column_name_type.name))
            required.insert(column_name_type.name);

F
f1yegor 已提交
2849
    /// You need to read at least one column to find the number of rows.
Z
zhang2014 已提交
2850
    if (select_query && required.empty())
2851
        required.insert(ExpressionActions::getSmallestColumn(source_columns));
2852

2853
    NameSet unknown_required_source_columns = required;
2854

2855
    for (NamesAndTypesList::iterator it = source_columns.begin(); it != source_columns.end();)
2856
    {
2857
        unknown_required_source_columns.erase(it->name);
2858 2859

        if (!required.count(it->name))
2860
            source_columns.erase(it++);
2861 2862
        else
            ++it;
2863 2864
    }

2865 2866
    /// If there are virtual columns among the unknown columns. Remove them from the list of unknown and add
    /// in columns list, so that when further processing they are also considered.
2867 2868
    if (storage)
    {
2869
        for (auto it = unknown_required_source_columns.begin(); it != unknown_required_source_columns.end();)
2870 2871 2872
        {
            if (storage->hasColumn(*it))
            {
2873 2874
                source_columns.push_back(storage->getColumn(*it));
                unknown_required_source_columns.erase(it++);
2875 2876 2877 2878 2879
            }
            else
                ++it;
        }
    }
2880 2881 2882

    if (!unknown_required_source_columns.empty())
        throw Exception("Unknown identifier: " + *unknown_required_source_columns.begin(), ErrorCodes::UNKNOWN_IDENTIFIER);
2883 2884
}

2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919

void ExpressionAnalyzer::collectJoinedColumnsFromJoinOnExpr()
{
    const auto & tables = static_cast<const ASTTablesInSelectQuery &>(*select_query->tables);
    const auto * left_tables_element = static_cast<const ASTTablesInSelectQueryElement *>(tables.children.at(0).get());
    const auto * right_tables_element = select_query->join();

    if (!left_tables_element || !right_tables_element)
        return;

    const auto & table_join = static_cast<const ASTTableJoin &>(*right_tables_element->table_join);
    if (!table_join.on_expression)
        return;

    const auto & left_table_expression = static_cast<const ASTTableExpression &>(*left_tables_element->table_expression);
    const auto & right_table_expression = static_cast<const ASTTableExpression &>(*right_tables_element->table_expression);

    auto left_source_names = getTableNameWithAliasFromTableExpression(left_table_expression, context);
    auto right_source_names = getTableNameWithAliasFromTableExpression(right_table_expression, context);

    /// Stores examples of columns which are only from one table.
    struct TableBelonging
    {
        const ASTIdentifier * example_only_from_left = nullptr;
        const ASTIdentifier * example_only_from_right = nullptr;
    };

    /// Check all identifiers in ast and decide their possible table belonging.
    /// Throws if there are two identifiers definitely from different tables.
    std::function<TableBelonging(const ASTPtr &)> get_table_belonging;
    get_table_belonging = [&](const ASTPtr & ast) -> TableBelonging
    {
        auto * identifier = typeid_cast<const ASTIdentifier *>(ast.get());
        if (identifier)
        {
C
chertus 已提交
2920
            if (identifier->general())
2921 2922 2923
            {
                auto left_num_components = getNumComponentsToStripInOrderToTranslateQualifiedName(*identifier, left_source_names);
                auto right_num_components = getNumComponentsToStripInOrderToTranslateQualifiedName(*identifier, right_source_names);
N
Nikolai Kochetov 已提交
2924 2925

                /// Assume that component from definite table if num_components is greater than for the other table.
2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958
                if (left_num_components > right_num_components)
                    return {identifier, nullptr};
                if (left_num_components < right_num_components)
                    return {nullptr, identifier};
            }
            return {};
        }

        TableBelonging table_belonging;
        for (const auto & child : ast->children)
        {
            auto children_belonging = get_table_belonging(child);
            if (!table_belonging.example_only_from_left)
                table_belonging.example_only_from_left = children_belonging.example_only_from_left;
            if (!table_belonging.example_only_from_right)
                table_belonging.example_only_from_right = children_belonging.example_only_from_right;
        }

        if (table_belonging.example_only_from_left && table_belonging.example_only_from_right)
            throw Exception("Invalid columns in JOIN ON section. Columns "
                            + table_belonging.example_only_from_left->getAliasOrColumnName() + " and "
                            + table_belonging.example_only_from_right->getAliasOrColumnName()
                            + " are from different tables.", ErrorCodes::INVALID_JOIN_ON_EXPRESSION);

        return table_belonging;
    };

    std::function<void(ASTPtr &, const DatabaseAndTableWithAlias &)> translate_qualified_names;
    translate_qualified_names = [&](ASTPtr & ast, const DatabaseAndTableWithAlias & source_names)
    {
        auto * identifier = typeid_cast<const ASTIdentifier *>(ast.get());
        if (identifier)
        {
C
chertus 已提交
2959
            if (identifier->general())
2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000
            {
                auto num_components = getNumComponentsToStripInOrderToTranslateQualifiedName(*identifier, source_names);
                stripIdentifier(ast, num_components);
            }
            return;
        }

        for (auto & child : ast->children)
            translate_qualified_names(child, source_names);
    };

    const auto supported_syntax = " Supported syntax: JOIN ON Expr([table.]column, ...) = Expr([table.]column, ...) "
                                  "[AND Expr([table.]column, ...) = Expr([table.]column, ...) ...]";
    auto throwSyntaxException = [&](const String & msg)
    {
        throw Exception("Invalid expression for JOIN ON. " + msg + supported_syntax, ErrorCodes::INVALID_JOIN_ON_EXPRESSION);
    };

    /// For equal expression find out corresponding table for each part, translate qualified names and add asts to join keys.
    auto add_columns_from_equals_expr = [&](const ASTPtr & expr)
    {
        auto * func_equals = typeid_cast<const ASTFunction *>(expr.get());
        if (!func_equals || func_equals->name != "equals")
            throwSyntaxException("Expected equals expression, got " + queryToString(expr) + ".");

        ASTPtr left_ast = func_equals->arguments->children.at(0)->clone();
        ASTPtr right_ast = func_equals->arguments->children.at(1)->clone();

        auto left_table_belonging = get_table_belonging(left_ast);
        auto right_table_belonging = get_table_belonging(right_ast);

        bool can_be_left_part_from_left_table = left_table_belonging.example_only_from_right == nullptr;
        bool can_be_left_part_from_right_table = left_table_belonging.example_only_from_left == nullptr;
        bool can_be_right_part_from_left_table = right_table_belonging.example_only_from_right == nullptr;
        bool can_be_right_part_from_right_table = right_table_belonging.example_only_from_left == nullptr;

        auto add_join_keys = [&](ASTPtr & ast_to_left_table, ASTPtr & ast_to_right_table)
        {
            translate_qualified_names(ast_to_left_table, left_source_names);
            translate_qualified_names(ast_to_right_table, right_source_names);

3001 3002 3003 3004
            analyzed_join.key_asts_left.push_back(ast_to_left_table);
            analyzed_join.key_names_left.push_back(ast_to_left_table->getColumnName());
            analyzed_join.key_asts_right.push_back(ast_to_right_table);
            analyzed_join.key_names_right.push_back(ast_to_right_table->getAliasOrColumnName());
3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040
        };

        /// Default variant when all identifiers may be from any table.
        if (can_be_left_part_from_left_table && can_be_right_part_from_right_table)
            add_join_keys(left_ast, right_ast);
        else if (can_be_left_part_from_right_table && can_be_right_part_from_left_table)
            add_join_keys(right_ast, left_ast);
        else
        {
            auto * left_example = left_table_belonging.example_only_from_left ?
                                  left_table_belonging.example_only_from_left :
                                  left_table_belonging.example_only_from_right;

            auto * right_example = right_table_belonging.example_only_from_left ?
                                   right_table_belonging.example_only_from_left :
                                   right_table_belonging.example_only_from_right;

            auto left_name = queryToString(*left_example);
            auto right_name = queryToString(*right_example);
            auto expr_name = queryToString(expr);

            throwSyntaxException("In expression " + expr_name + " columns " + left_name + " and " + right_name
                                 + " are from the same table but from different arguments of equal function.");
        }
    };

    auto * func = typeid_cast<const ASTFunction *>(table_join.on_expression.get());
    if (func && func->name == "and")
    {
        for (const auto & expr : func->arguments->children)
            add_columns_from_equals_expr(expr);
    }
    else
        add_columns_from_equals_expr(table_join.on_expression);
}

3041
/// Collects join keys (from USING or ON) into analyzed_join and registers the columns of the joined table
/// that are not join keys: their (possibly qualified) names are inserted into `joined_columns`
/// and they are appended to analyzed_join.columns_added_by_join.
/// Does nothing if there is no SELECT query or the query has no JOIN.
void ExpressionAnalyzer::collectJoinedColumns(NameSet & joined_columns)
{
    if (!select_query)
        return;

    const ASTTablesInSelectQueryElement * join_element = select_query->join();
    if (!join_element)
        return;

    const auto & table_join = static_cast<const ASTTableJoin &>(*join_element->table_join);
    const auto & joined_table_expression = static_cast<const ASTTableExpression &>(*join_element->table_expression);
    auto joined_table_name = getTableNameWithAliasFromTableExpression(joined_table_expression, context);

    if (table_join.using_expression_list)
    {
        auto & using_list = typeid_cast<ASTExpressionList &>(*table_join.using_expression_list);
        for (const auto & key : using_list.children)
        {
            /// The same AST is the key on both sides; only the name differs (the alias is used for the right side).
            analyzed_join.key_names_left.push_back(key->getColumnName());
            analyzed_join.key_asts_left.push_back(key);

            analyzed_join.key_names_right.push_back(key->getAliasOrColumnName());
            analyzed_join.key_asts_right.push_back(key);
        }
    }
    else if (table_join.on_expression)
    {
        collectJoinedColumnsFromJoinOnExpr();
    }

    /// With JOIN ON syntax the columns that are not joined are those from key_names_left,
    ///     because a column from key_names_right may still have to be joined if it has a different name.
    /// With USING syntax key_names_left and key_names_right are almost the same, but key_names_right is used
    ///     in order to support aliases in the USING list. Example:
    ///     SELECT x FROM tab1 ANY LEFT JOIN tab2 USING (x as y) - will join column x from tab1 with column y from tab2.
    const Names & not_joined_columns = table_join.using_expression_list
        ? analyzed_join.key_names_right
        : analyzed_join.key_names_left;

    /// Neither operand changes inside the loop below, so the flag is computed once.
    const bool make_nullable = settings.join_use_nulls
        && (table_join.kind == ASTTableJoin::Kind::Left || table_join.kind == ASTTableJoin::Kind::Full);

    auto columns_from_joined_table = analyzed_join.getColumnsFromJoinedTable(context, select_query);

    for (const auto & joined_column : columns_from_joined_table)
    {
        const auto & original_name = joined_column.name;

        const bool is_join_key
            = std::find(not_joined_columns.begin(), not_joined_columns.end(), original_name) != not_joined_columns.end();
        if (is_join_key)
            continue;

        /// Change the name of a column from the joined table if a source column with that name already exists.
        auto qualified_name = original_name;
        if (source_columns.contains(qualified_name))
            qualified_name = joined_table_name.getQualifiedNamePrefix() + qualified_name;

        /// Duplicate columns in the subquery for JOIN do not make sense.
        if (joined_columns.count(qualified_name))
            continue;

        joined_columns.insert(qualified_name);

        auto result_type = make_nullable ? makeNullable(joined_column.type) : joined_column.type;
        analyzed_join.columns_added_by_join.emplace_back(NameAndTypePair(qualified_name, std::move(result_type)), original_name);
    }
}

3105

3106
/// Returns the names of the columns currently stored in source_columns.
Names ExpressionAnalyzer::getRequiredSourceColumns() const
{
    Names required_names = source_columns.getNames();
    return required_names;
}

3111

3112 3113
void ExpressionAnalyzer::getRequiredSourceColumnsImpl(const ASTPtr & ast,
    const NameSet & available_columns, NameSet & required_source_columns, NameSet & ignored_names,
3114 3115
    const NameSet & available_joined_columns, NameSet & required_joined_columns)
{
F
f1yegor 已提交
3116
    /** Find all the identifiers in the query.
3117
      * We will use depth first search in AST.
F
f1yegor 已提交
3118 3119
      * In this case
      * - for lambda functions we will not take formal parameters;
3120 3121 3122
      * - do not go into subqueries (they have their own identifiers);
      * - there is some exception for the ARRAY JOIN clause (it has a slightly different identifiers);
      * - we put identifiers available from JOIN in required_joined_columns.
3123 3124 3125 3126
      */

    if (ASTIdentifier * node = typeid_cast<ASTIdentifier *>(ast.get()))
    {
C
chertus 已提交
3127
        if (node->general()
3128
            && !ignored_names.count(node->name)
3129
            && !ignored_names.count(Nested::extractTableName(node->name)))
3130
        {
3131 3132
            if (!available_joined_columns.count(node->name)
                || available_columns.count(node->name)) /// Read column from left table if has.
3133
                required_source_columns.insert(node->name);
3134 3135 3136 3137 3138 3139 3140 3141 3142
            else
                required_joined_columns.insert(node->name);
        }

        return;
    }

    if (ASTFunction * node = typeid_cast<ASTFunction *>(ast.get()))
    {
3143
        if (node->name == "lambda")
3144 3145 3146 3147 3148 3149 3150 3151 3152
        {
            if (node->arguments->children.size() != 2)
                throw Exception("lambda requires two arguments", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);

            ASTFunction * lambda_args_tuple = typeid_cast<ASTFunction *>(node->arguments->children.at(0).get());

            if (!lambda_args_tuple || lambda_args_tuple->name != "tuple")
                throw Exception("First argument of lambda must be a tuple", ErrorCodes::TYPE_MISMATCH);

3153
            /// You do not need to add formal parameters of the lambda expression in required_source_columns.
3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168
            Names added_ignored;
            for (auto & child : lambda_args_tuple->arguments->children)
            {
                ASTIdentifier * identifier = typeid_cast<ASTIdentifier *>(child.get());
                if (!identifier)
                    throw Exception("lambda argument declarations must be identifiers", ErrorCodes::TYPE_MISMATCH);

                String & name = identifier->name;
                if (!ignored_names.count(name))
                {
                    ignored_names.insert(name);
                    added_ignored.push_back(name);
                }
            }

3169 3170
            getRequiredSourceColumnsImpl(node->arguments->children.at(1),
                available_columns, required_source_columns, ignored_names,
3171 3172 3173 3174 3175 3176 3177 3178
                available_joined_columns, required_joined_columns);

            for (size_t i = 0; i < added_ignored.size(); ++i)
                ignored_names.erase(added_ignored[i]);

            return;
        }

F
f1yegor 已提交
3179
        /// A special function `indexHint`. Everything that is inside it is not calculated
3180
        /// (and is used only for index analysis, see KeyCondition).
3181 3182 3183 3184
        if (node->name == "indexHint")
            return;
    }

F
f1yegor 已提交
3185
    /// Recursively traverses an expression.
3186 3187
    for (auto & child : ast->children)
    {
F
f1yegor 已提交
3188 3189
        /** We will not go to the ARRAY JOIN section, because we need to look at the names of non-ARRAY-JOIN columns.
          * There, `collectUsedColumns` will send us separately.
3190
          */
A
Alexey Milovidov 已提交
3191 3192
        if (!typeid_cast<const ASTSelectQuery *>(child.get())
            && !typeid_cast<const ASTArrayJoin *>(child.get())
3193 3194
            && !typeid_cast<const ASTTableExpression *>(child.get())
            && !typeid_cast<const ASTTableJoin *>(child.get()))
3195 3196
            getRequiredSourceColumnsImpl(child, available_columns, required_source_columns,
                ignored_names, available_joined_columns, required_joined_columns);
3197
    }
3198 3199
}

3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214

static bool hasArrayJoin(const ASTPtr & ast)
{
    if (const ASTFunction * function = typeid_cast<const ASTFunction *>(&*ast))
        if (function->name == "arrayJoin")
            return true;

    for (const auto & child : ast->children)
        if (!typeid_cast<ASTSelectQuery *>(child.get()) && hasArrayJoin(child))
            return true;

    return false;
}


3215
void ExpressionAnalyzer::removeUnneededColumnsFromSelectClause()
3216 3217 3218 3219
{
    if (!select_query)
        return;

3220
    if (required_result_columns.empty())
3221 3222 3223 3224
        return;

    ASTs & elements = select_query->select_expression_list->children;

3225 3226 3227 3228 3229 3230 3231 3232 3233 3234
    ASTs new_elements;
    new_elements.reserve(elements.size());

    /// Some columns may be queried multiple times, like SELECT x, y, y FROM table.
    /// In that case we keep them exactly same number of times.
    std::map<String, size_t> required_columns_with_duplicate_count;
    for (const auto & name : required_result_columns)
        ++required_columns_with_duplicate_count[name];

    for (const auto & elem : elements)
3235
    {
3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250
        String name = elem->getAliasOrColumnName();

        auto it = required_columns_with_duplicate_count.find(name);
        if (required_columns_with_duplicate_count.end() != it && it->second)
        {
            new_elements.push_back(elem);
            --it->second;
        }
        else if (select_query->distinct || hasArrayJoin(elem))
        {
            new_elements.push_back(elem);
        }
    }

    elements = std::move(new_elements);
3251 3252
}

3253
}