diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index c3a19431c480b5133b1e097d98e267ff347a7298..d6d12363883fc3d639751db5570fb9a18ffa0945 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.259 2005/08/01 20:31:08 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.260 2005/08/27 22:13:43 tgl Exp $ * * NOTES * Every node type that can appear in stored rules' parsetrees *must* @@ -1169,6 +1169,9 @@ _outPlannerInfo(StringInfo str, PlannerInfo *node) WRITE_NODE_FIELD(full_join_clauses); WRITE_NODE_FIELD(in_info_list); WRITE_NODE_FIELD(query_pathkeys); + WRITE_NODE_FIELD(group_pathkeys); + WRITE_NODE_FIELD(sort_pathkeys); + WRITE_FLOAT_FIELD(tuple_fraction, "%.4f"); WRITE_BOOL_FIELD(hasJoinRTEs); WRITE_BOOL_FIELD(hasOuterJoins); WRITE_BOOL_FIELD(hasHavingQual); diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c index 3e4bcffe2e85ab6062ae4856de5e001ad0fa04ad..09ad68ecd93c61fd0e84ad1c270a77ec7ab1f534 100644 --- a/src/backend/optimizer/path/pathkeys.c +++ b/src/backend/optimizer/path/pathkeys.c @@ -11,7 +11,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/pathkeys.c,v 1.71 2005/07/28 22:27:00 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/pathkeys.c,v 1.72 2005/08/27 22:13:43 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -800,54 +800,6 @@ compare_pathkeys(List *keys1, List *keys2) return PATHKEYS_BETTER2; /* key2 is longer */ } -/* - * compare_noncanonical_pathkeys - * Compare two pathkeys to see if they are equivalent, and if not whether - * one is "better" than the other. This is used when we must compare - * non-canonicalized pathkeys. - * - * A pathkey can be considered better than another if it is a superset: - * it contains all the keys of the other plus more. For example, either - * ((A) (B)) or ((A B)) is better than ((A)). - * - * Currently, the only user of this routine is grouping_planner(), - * and it will only pass single-element sublists (from - * make_pathkeys_for_sortclauses). Therefore we don't have to do the - * full two-way-subset-inclusion test on each pair of sublists that is - * implied by the above statement. Instead we just verify they are - * singleton lists and then do an equal(). This could be improved if - * necessary. - */ -PathKeysComparison -compare_noncanonical_pathkeys(List *keys1, List *keys2) -{ - ListCell *key1, - *key2; - - forboth(key1, keys1, key2, keys2) - { - List *subkey1 = (List *) lfirst(key1); - List *subkey2 = (List *) lfirst(key2); - - Assert(list_length(subkey1) == 1); - Assert(list_length(subkey2) == 1); - if (!equal(subkey1, subkey2)) - return PATHKEYS_DIFFERENT; /* no need to keep looking */ - } - - /* - * If we reached the end of only one list, the other is longer and - * therefore not a subset. (We assume the additional sublist(s) of - * the other list are not NIL --- no pathkey list should ever have a - * NIL sublist.) - */ - if (key1 == NULL && key2 == NULL) - return PATHKEYS_EQUAL; - if (key1 != NULL) - return PATHKEYS_BETTER1; /* key1 is longer */ - return PATHKEYS_BETTER2; /* key2 is longer */ -} - /* * pathkeys_contained_in * Common special case of compare_pathkeys: we just want to know @@ -867,24 +819,6 @@ pathkeys_contained_in(List *keys1, List *keys2) return false; } -/* - * noncanonical_pathkeys_contained_in - * The same, when we don't have canonical pathkeys. - */ -bool -noncanonical_pathkeys_contained_in(List *keys1, List *keys2) -{ - switch (compare_noncanonical_pathkeys(keys1, keys2)) - { - case PATHKEYS_EQUAL: - case PATHKEYS_BETTER2: - return true; - default: - break; - } - return false; -} - /* * get_cheapest_path_for_pathkeys * Find the cheapest path (according to the specified criterion) that diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c index 7038a45ac64aac657f336cd694924ca7224cb77d..1aca1249d43da178474ce51ddeb331875b7b8289 100644 --- a/src/backend/optimizer/plan/planmain.c +++ b/src/backend/optimizer/plan/planmain.c @@ -14,7 +14,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.86 2005/07/02 23:00:41 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.87 2005/08/27 22:13:43 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -25,9 +25,11 @@ #include "optimizer/pathnode.h" #include "optimizer/paths.h" #include "optimizer/planmain.h" +#include "optimizer/tlist.h" +#include "utils/selfuncs.h" -/*-------------------- +/* * query_planner * Generate a path (that is, a simplified plan) for a basic query, * which may involve joins but not any fancier features. @@ -51,6 +53,8 @@ * *cheapest_path receives the overall-cheapest path for the query * *sorted_path receives the cheapest presorted path for the query, * if any (NULL if there is no useful presorted path) + * *num_groups receives the estimated number of groups, or 1 if query + * does not use grouping * * Note: the PlannerInfo node also includes a query_pathkeys field, which is * both an input and an output of query_planner(). The input value signals @@ -61,17 +65,21 @@ * PlannerInfo field and not a passed parameter is that the low-level routines * in indxpath.c need to see it.) * + * Note: the PlannerInfo node also includes group_pathkeys and sort_pathkeys, + * which like query_pathkeys need to be canonicalized once the info is + * available. + * * tuple_fraction is interpreted as follows: * 0: expect all tuples to be retrieved (normal case) * 0 < tuple_fraction < 1: expect the given fraction of tuples available * from the plan to be retrieved * tuple_fraction >= 1: tuple_fraction is the absolute number of tuples * expected to be retrieved (ie, a LIMIT specification) - *-------------------- */ void query_planner(PlannerInfo *root, List *tlist, double tuple_fraction, - Path **cheapest_path, Path **sorted_path) + Path **cheapest_path, Path **sorted_path, + double *num_groups) { Query *parse = root->parse; List *constant_quals; @@ -82,6 +90,8 @@ query_planner(PlannerInfo *root, List *tlist, double tuple_fraction, /* Make tuple_fraction accessible to lower-level routines */ root->tuple_fraction = tuple_fraction; + *num_groups = 1; /* default result */ + /* * If the query has an empty join tree, then it's something easy like * "SELECT 2+2;" or "INSERT ... VALUES()". Fall through quickly. @@ -156,9 +166,12 @@ query_planner(PlannerInfo *root, List *tlist, double tuple_fraction, /* * We should now have all the pathkey equivalence sets built, so it's * now possible to convert the requested query_pathkeys to canonical - * form. + * form. Also canonicalize the groupClause and sortClause pathkeys + * for use later. */ root->query_pathkeys = canonicalize_pathkeys(root, root->query_pathkeys); + root->group_pathkeys = canonicalize_pathkeys(root, root->group_pathkeys); + root->sort_pathkeys = canonicalize_pathkeys(root, root->sort_pathkeys); /* * Ready to do the primary planning. @@ -169,12 +182,87 @@ query_planner(PlannerInfo *root, List *tlist, double tuple_fraction, elog(ERROR, "failed to construct the join relation"); /* - * Now that we have an estimate of the final rel's size, we can - * convert a tuple_fraction specified as an absolute count (ie, a - * LIMIT option) into a fraction of the total tuples. + * If there's grouping going on, estimate the number of result groups. + * We couldn't do this any earlier because it depends on relation size + * estimates that were set up above. + * + * Then convert tuple_fraction to fractional form if it is absolute, + * and adjust it based on the knowledge that grouping_planner will be + * doing grouping or aggregation work with our result. + * + * This introduces some undesirable coupling between this code and + * grouping_planner, but the alternatives seem even uglier; we couldn't + * pass back completed paths without making these decisions here. */ - if (tuple_fraction >= 1.0) - tuple_fraction /= final_rel->rows; + if (parse->groupClause) + { + List *groupExprs; + + groupExprs = get_sortgrouplist_exprs(parse->groupClause, + parse->targetList); + *num_groups = estimate_num_groups(root, + groupExprs, + final_rel->rows); + + /* + * In GROUP BY mode, an absolute LIMIT is relative to the number + * of groups not the number of tuples. If the caller gave us + * a fraction, keep it as-is. (In both cases, we are effectively + * assuming that all the groups are about the same size.) + */ + if (tuple_fraction >= 1.0) + tuple_fraction /= *num_groups; + + /* + * If both GROUP BY and ORDER BY are specified, we will need two + * levels of sort --- and, therefore, certainly need to read all + * the tuples --- unless ORDER BY is a subset of GROUP BY. + */ + if (parse->groupClause && parse->sortClause && + !pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys)) + tuple_fraction = 0.0; + } + else if (parse->hasAggs || root->hasHavingQual) + { + /* + * Ungrouped aggregate will certainly want to read all the tuples, + * and it will deliver a single result row (so leave *num_groups 1). + */ + tuple_fraction = 0.0; + } + else if (parse->distinctClause) + { + /* + * Since there was no grouping or aggregation, it's reasonable to + * assume the UNIQUE filter has effects comparable to GROUP BY. + * Return the estimated number of output rows for use by caller. + * (If DISTINCT is used with grouping, we ignore its effects for + * rowcount estimation purposes; this amounts to assuming the grouped + * rows are distinct already.) + */ + List *distinctExprs; + + distinctExprs = get_sortgrouplist_exprs(parse->distinctClause, + parse->targetList); + *num_groups = estimate_num_groups(root, + distinctExprs, + final_rel->rows); + + /* + * Adjust tuple_fraction the same way as for GROUP BY, too. + */ + if (tuple_fraction >= 1.0) + tuple_fraction /= *num_groups; + } + else + { + /* + * Plain non-grouped, non-aggregated query: an absolute tuple + * fraction can be divided by the number of tuples. + */ + if (tuple_fraction >= 1.0) + tuple_fraction /= final_rel->rows; + } /* * Pick out the cheapest-total path and the cheapest presorted path diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 0b0cb4eabf665d337bc8f691238be0bfd695915b..d87e4089b51abf31d46a36f23fb5bb9136be82d0 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.191 2005/08/18 17:51:11 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.192 2005/08/27 22:13:43 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -63,7 +63,6 @@ static double preprocess_limit(PlannerInfo *root, int *offset_est, int *count_est); static bool choose_hashed_grouping(PlannerInfo *root, double tuple_fraction, Path *cheapest_path, Path *sorted_path, - List *sort_pathkeys, List *group_pathkeys, double dNumGroups, AggClauseCounts *agg_counts); static bool hash_safe_grouping(PlannerInfo *root); static List *make_subplanTargetList(PlannerInfo *root, List *tlist, @@ -655,6 +654,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) Plan *result_plan; List *current_pathkeys; List *sort_pathkeys; + double dNumGroups = 0; /* Tweak caller-supplied tuple_fraction if have LIMIT/OFFSET */ if (parse->limitCount || parse->limitOffset) @@ -727,11 +727,9 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) AttrNumber *groupColIdx = NULL; bool need_tlist_eval = true; QualCost tlist_cost; - double sub_tuple_fraction; Path *cheapest_path; Path *sorted_path; Path *best_path; - double dNumGroups = 0; long numGroups = 0; AggClauseCounts agg_counts; int numGroupCols = list_length(parse->groupClause); @@ -750,13 +748,14 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) &groupColIdx, &need_tlist_eval); /* - * Calculate pathkeys that represent grouping/ordering - * requirements + * Calculate pathkeys that represent grouping/ordering requirements. + * Stash them in PlannerInfo so that query_planner can canonicalize + * them. */ - group_pathkeys = make_pathkeys_for_sortclauses(parse->groupClause, - tlist); - sort_pathkeys = make_pathkeys_for_sortclauses(parse->sortClause, - tlist); + root->group_pathkeys = + make_pathkeys_for_sortclauses(parse->groupClause, tlist); + root->sort_pathkeys = + make_pathkeys_for_sortclauses(parse->sortClause, tlist); /* * Will need actual number of aggregates for estimating costs. @@ -787,112 +786,36 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) * Needs more thought...) */ if (parse->groupClause) - root->query_pathkeys = group_pathkeys; + root->query_pathkeys = root->group_pathkeys; else if (parse->sortClause) - root->query_pathkeys = sort_pathkeys; + root->query_pathkeys = root->sort_pathkeys; else root->query_pathkeys = NIL; - /* - * With grouping or aggregation, the tuple fraction to pass to - * query_planner() may be different from what it is at top level. - */ - sub_tuple_fraction = tuple_fraction; - - if (parse->groupClause) - { - /* - * In GROUP BY mode, we have the little problem that we don't - * really know how many input tuples will be needed to make a - * group, so we can't translate an output LIMIT count into an - * input count. For lack of a better idea, assume 25% of the - * input data will be processed if there is any output limit. - * However, if the caller gave us a fraction rather than an - * absolute count, we can keep using that fraction (which - * amounts to assuming that all the groups are about the same - * size). - */ - if (sub_tuple_fraction >= 1.0) - sub_tuple_fraction = 0.25; - - /* - * If both GROUP BY and ORDER BY are specified, we will need - * two levels of sort --- and, therefore, certainly need to - * read all the input tuples --- unless ORDER BY is a subset - * of GROUP BY. (We have not yet canonicalized the pathkeys, - * so must use the slower noncanonical comparison method.) - */ - if (parse->groupClause && parse->sortClause && - !noncanonical_pathkeys_contained_in(sort_pathkeys, - group_pathkeys)) - sub_tuple_fraction = 0.0; - } - else if (parse->hasAggs) - { - /* - * Ungrouped aggregate will certainly want all the input - * tuples. - */ - sub_tuple_fraction = 0.0; - } - else if (parse->distinctClause) - { - /* - * SELECT DISTINCT, like GROUP, will absorb an unpredictable - * number of input tuples per output tuple. Handle the same - * way. - */ - if (sub_tuple_fraction >= 1.0) - sub_tuple_fraction = 0.25; - } - /* * Generate the best unsorted and presorted paths for this Query - * (but note there may not be any presorted path). + * (but note there may not be any presorted path). query_planner + * will also estimate the number of groups in the query, and + * canonicalize all the pathkeys. */ - query_planner(root, sub_tlist, sub_tuple_fraction, - &cheapest_path, &sorted_path); + query_planner(root, sub_tlist, tuple_fraction, + &cheapest_path, &sorted_path, &dNumGroups); - /* - * We couldn't canonicalize group_pathkeys and sort_pathkeys - * before running query_planner(), so do it now. - */ - group_pathkeys = canonicalize_pathkeys(root, group_pathkeys); - sort_pathkeys = canonicalize_pathkeys(root, sort_pathkeys); + group_pathkeys = root->group_pathkeys; + sort_pathkeys = root->sort_pathkeys; /* - * If grouping, estimate the number of groups. (We can't do this - * until after running query_planner(), either.) Then decide - * whether we want to use hashed grouping. + * If grouping, decide whether we want to use hashed grouping. */ if (parse->groupClause) { - List *groupExprs; - double cheapest_path_rows; - - /* - * Beware of the possibility that cheapest_path->parent is NULL. - * This could happen if user does something silly like - * SELECT 'foo' GROUP BY 1; - */ - if (cheapest_path->parent) - cheapest_path_rows = cheapest_path->parent->rows; - else - cheapest_path_rows = 1; /* assume non-set result */ - - groupExprs = get_sortgrouplist_exprs(parse->groupClause, - parse->targetList); - dNumGroups = estimate_num_groups(root, - groupExprs, - cheapest_path_rows); - /* Also want it as a long int --- but 'ware overflow! */ - numGroups = (long) Min(dNumGroups, (double) LONG_MAX); - use_hashed_grouping = choose_hashed_grouping(root, tuple_fraction, cheapest_path, sorted_path, - sort_pathkeys, group_pathkeys, dNumGroups, &agg_counts); + + /* Also convert # groups to long int --- but 'ware overflow! */ + numGroups = (long) Min(dNumGroups, (double) LONG_MAX); } /* @@ -1130,19 +1053,10 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) /* * If there was grouping or aggregation, leave plan_rows as-is * (ie, assume the result was already mostly unique). If not, - * it's reasonable to assume the UNIQUE filter has effects - * comparable to GROUP BY. + * use the number of distinct-groups calculated by query_planner. */ if (!parse->groupClause && !root->hasHavingQual && !parse->hasAggs) - { - List *distinctExprs; - - distinctExprs = get_sortgrouplist_exprs(parse->distinctClause, - parse->targetList); - result_plan->plan_rows = estimate_num_groups(root, - distinctExprs, - result_plan->plan_rows); - } + result_plan->plan_rows = dNumGroups; } /* @@ -1360,7 +1274,6 @@ preprocess_limit(PlannerInfo *root, double tuple_fraction, static bool choose_hashed_grouping(PlannerInfo *root, double tuple_fraction, Path *cheapest_path, Path *sorted_path, - List *sort_pathkeys, List *group_pathkeys, double dNumGroups, AggClauseCounts *agg_counts) { int numGroupCols = list_length(root->parse->groupClause); @@ -1439,8 +1352,8 @@ choose_hashed_grouping(PlannerInfo *root, double tuple_fraction, cheapest_path->startup_cost, cheapest_path->total_cost, cheapest_path_rows); /* Result of hashed agg is always unsorted */ - if (sort_pathkeys) - cost_sort(&hashed_p, root, sort_pathkeys, hashed_p.total_cost, + if (root->sort_pathkeys) + cost_sort(&hashed_p, root, root->sort_pathkeys, hashed_p.total_cost, dNumGroups, cheapest_path_width); if (sorted_path) @@ -1455,12 +1368,11 @@ choose_hashed_grouping(PlannerInfo *root, double tuple_fraction, sorted_p.total_cost = cheapest_path->total_cost; current_pathkeys = cheapest_path->pathkeys; } - if (!pathkeys_contained_in(group_pathkeys, - current_pathkeys)) + if (!pathkeys_contained_in(root->group_pathkeys, current_pathkeys)) { - cost_sort(&sorted_p, root, group_pathkeys, sorted_p.total_cost, + cost_sort(&sorted_p, root, root->group_pathkeys, sorted_p.total_cost, cheapest_path_rows, cheapest_path_width); - current_pathkeys = group_pathkeys; + current_pathkeys = root->group_pathkeys; } if (root->parse->hasAggs) @@ -1473,9 +1385,9 @@ choose_hashed_grouping(PlannerInfo *root, double tuple_fraction, sorted_p.startup_cost, sorted_p.total_cost, cheapest_path_rows); /* The Agg or Group node will preserve ordering */ - if (sort_pathkeys && - !pathkeys_contained_in(sort_pathkeys, current_pathkeys)) - cost_sort(&sorted_p, root, sort_pathkeys, sorted_p.total_cost, + if (root->sort_pathkeys && + !pathkeys_contained_in(root->sort_pathkeys, current_pathkeys)) + cost_sort(&sorted_p, root, root->sort_pathkeys, sorted_p.total_cost, dNumGroups, cheapest_path_width); /* diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 88e535dc9b99526995b407b7e91f672766c65cad..3b23bfbeb4e02d39461d3cecc8cbf2735190758f 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.117 2005/07/23 21:05:48 tgl Exp $ + * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.118 2005/08/27 22:13:43 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -101,6 +101,9 @@ typedef struct PlannerInfo List *query_pathkeys; /* desired pathkeys for query_planner(), * and actual pathkeys afterwards */ + List *group_pathkeys; /* groupClause pathkeys, if any */ + List *sort_pathkeys; /* sortClause pathkeys, if any */ + double tuple_fraction; /* tuple_fraction passed to query_planner */ bool hasJoinRTEs; /* true if any RTEs are RTE_JOIN kind */ diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 8f26c8e8f0d96d2a206d6990dbbf9e39865400c5..7c8108b000a66cacfef76eae5806212db8ea331a 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/optimizer/paths.h,v 1.86 2005/07/28 20:26:22 tgl Exp $ + * $PostgreSQL: pgsql/src/include/optimizer/paths.h,v 1.87 2005/08/27 22:13:44 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -102,9 +102,6 @@ extern void generate_implied_equalities(PlannerInfo *root); extern List *canonicalize_pathkeys(PlannerInfo *root, List *pathkeys); extern PathKeysComparison compare_pathkeys(List *keys1, List *keys2); extern bool pathkeys_contained_in(List *keys1, List *keys2); -extern PathKeysComparison compare_noncanonical_pathkeys(List *keys1, - List *keys2); -extern bool noncanonical_pathkeys_contained_in(List *keys1, List *keys2); extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys, CostSelector cost_criterion); extern Path *get_cheapest_fractional_path_for_pathkeys(List *paths, diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 474a14da9a0cc80be3a6581ef09c990c0fb643a1..652431b894f94c04e06507d308b76e3323df77fd 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/optimizer/planmain.h,v 1.87 2005/08/18 17:51:12 tgl Exp $ + * $PostgreSQL: pgsql/src/include/optimizer/planmain.h,v 1.88 2005/08/27 22:13:44 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -22,7 +22,8 @@ */ extern void query_planner(PlannerInfo *root, List *tlist, double tuple_fraction, - Path **cheapest_path, Path **sorted_path); + Path **cheapest_path, Path **sorted_path, + double *num_groups); /* * prototypes for plan/planagg.c