Commit 362fc756 authored by Heikki Linnakangas, committed by Ashwin Agrawal

Re-enable MIN/MAX optimization.

I'm not sure why it's been disabled. It's not very hard to make it work, so
let's do it. Might not be a very common query type, but if you happen to
have a query where it helps, it helps a lot.

This adds a GUC, gp_enable_minmax_optimization, to enable/disable the
optimization. There's no such GUC in upstream, but we need at least a flag
in PlannerConfig for it, so that we can disable the optimization for
correlated subqueries, along with some other optimizer tricks. Seems best
to also have a GUC for it, for consistency with other flags in
PlannerConfig.
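
As a rough illustration (the table and index names below are hypothetical, not part of this commit), this is the kind of query the optimization targets, and the new GUC can be used to compare the two plan shapes:

    -- Hypothetical example: a MIN/MAX aggregate over an indexed column can be
    -- planned as an index scan plus LIMIT 1 instead of a full scan + aggregate.
    create table sales (id bigint, amount numeric) distributed by (id);
    create index sales_amount_idx on sales (amount);

    -- With the optimization enabled (the default), this should come out as an
    -- index scan with a Limit node feeding the aggregate.
    set gp_enable_minmax_optimization = on;
    explain select max(amount) from sales;

    -- Disable it to fall back to a regular aggregation plan.
    set gp_enable_minmax_optimization = off;
    explain select max(amount) from sales;
    reset gp_enable_minmax_optimization;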
Parent 874fcdc5
......@@ -34,6 +34,7 @@
#include "utils/syscache.h"
#include "cdb/cdbllize.h" /* pull_up_Flow() */
#include "cdb/cdbsetop.h"
typedef struct
......@@ -107,6 +108,12 @@ optimize_minmax_aggregates(PlannerInfo *root, List *tlist, Path *best_path)
if (parse->groupClause || parse->hasWindowFuncs)
return NULL;
/*
* Reject if disabled by caller.
*/
if (!root->config->gp_enable_minmax_optimization)
return NULL;
/*
* We also restrict the query to reference exactly one table, since join
* conditions can't be handled reasonably. (We could perhaps handle a
......@@ -564,6 +571,27 @@ make_agg_subplan(PlannerInfo *root, MinMaxAggInfo *info)
attach_notnull_index_qual(info, iplan);
if (plan->flow->flotype == FLOW_SINGLETON)
{
/* ok */
}
else if (plan->flow->flotype == FLOW_PARTITIONED)
{
List *pathkeys;
/* Gather the results into a single node, preserving the order. */
pathkeys = make_pathkeys_for_sortclauses(root,
list_make1(sortcl),
plan->targetlist,
true);
plan = (Plan *) make_motion_gather(&subroot, plan, -1,
pathkeys);
}
else
elog(ERROR, "MIN/MAX subplan has unexpected flowtype: %d", plan->flow->type);
if (!focusPlan(plan, true, false))
elog(ERROR, "could not focus MIN/MAX subplan");
plan = (Plan *) make_limit(plan,
subparse->limitOffset,
subparse->limitCount,
......
......@@ -546,6 +546,7 @@ PlannerConfig *DefaultPlannerConfig(void)
c1->mpp_trying_fallback_plan = false;
c1->constraint_exclusion = constraint_exclusion;
c1->gp_enable_minmax_optimization = gp_enable_minmax_optimization;
c1->gp_enable_multiphase_agg = gp_enable_multiphase_agg;
c1->gp_enable_preunique = gp_enable_preunique;
c1->gp_eager_preunique = gp_eager_preunique;
......
......@@ -1672,6 +1672,25 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
else
best_path = sorted_path;
/*
* Check to see if it's possible to optimize MIN/MAX aggregates.
* If so, we will forget all the work we did so far to choose a
* "regular" path ... but we had to do it anyway to be able to
* tell which way is cheaper.
*/
result_plan = optimize_minmax_aggregates(root,
tlist,
best_path);
if (result_plan != NULL)
{
/*
* optimize_minmax_aggregates generated the full plan, with the
* right tlist, and it has no sort order.
*/
current_pathkeys = NIL;
mark_plan_entry(result_plan);
}
/*
* CDB: For now, we either - construct a general parallel plan, - let
* the sequential planner handle the situation, or - construct a
......@@ -1680,7 +1699,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
* Eventually we should add a parallel version of the min-max
* optimization. For now, it's either-or.
*/
if (Gp_role == GP_ROLE_DISPATCH)
if (Gp_role == GP_ROLE_DISPATCH && result_plan == NULL)
{
bool querynode_changed = false;
bool pass_subtlist = agg_counts.hasOrderedAggs;
......@@ -1757,30 +1776,8 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
true);
}
}
else /* Not GP_ROLE_DISPATCH */
{
/*
* Check to see if it's possible to optimize MIN/MAX aggregates.
* If so, we will forget all the work we did so far to choose a
* "regular" path ... but we had to do it anyway to be able to
* tell which way is cheaper.
*/
result_plan = optimize_minmax_aggregates(root,
tlist,
best_path);
if (result_plan != NULL)
{
/*
* optimize_minmax_aggregates generated the full plan, with
* the right tlist, and it has no sort order.
*/
current_pathkeys = NIL;
mark_plan_entry(result_plan);
}
}
if (result_plan == NULL)
if (result_plan == NULL) /* Not GP_ROLE_DISPATCH */
{
/*
* Normal case --- create a plan according to query_planner's
......
......@@ -505,6 +505,14 @@ make_subplan(PlannerInfo *root, Query *orig_subquery, SubLinkType subLinkType,
config->gp_enable_direct_dispatch = false;
config->gp_enable_multiphase_agg = false;
/*
* The MIN/MAX optimization works by inserting a subplan with LIMIT 1.
* That effectively turns a correlated subquery into a multi-level
* correlated subquery, which we don't currently support. (See check
* above.)
*/
config->gp_enable_minmax_optimization = false;
/*
* Only create subplans with sequential scans
*/
......
......@@ -327,6 +327,7 @@ bool dml_ignore_target_partition_check = false;
bool gp_enable_hashjoin_size_heuristic = false;
bool gp_enable_fallback_plan = true;
bool gp_enable_predicate_propagation = false;
bool gp_enable_minmax_optimization = true;
bool gp_enable_multiphase_agg = true;
bool gp_enable_preunique = TRUE;
bool gp_eager_preunique = FALSE;
......@@ -765,6 +766,16 @@ struct config_bool ConfigureNamesBool_gp[] =
false, NULL, NULL
},
{
{"gp_enable_minmax_optimization", PGC_USERSET, QUERY_TUNING_METHOD,
gettext_noop("Enables the planner's use of index scans with limit to implement MIN/MAX."),
NULL,
GUC_NOT_IN_SAMPLE
},
&gp_enable_minmax_optimization,
true, NULL, NULL
},
{
{"gp_enable_multiphase_agg", PGC_USERSET, QUERY_TUNING_METHOD,
gettext_noop("Enables the planner's use of two- or three-stage parallel aggregation plans."),
......
......@@ -593,6 +593,12 @@ extern double gp_motion_cost_per_row;
*/
extern int gp_segments_for_planner;
/*
* Enable/disable the special optimization of MIN/MAX aggregates as
* Index Scan with limit.
*/
extern bool gp_enable_minmax_optimization;
/*
* "gp_enable_multiphase_agg"
*
......
......@@ -30,6 +30,7 @@ typedef struct PlannerConfig
int cdbpath_segments;
int constraint_exclusion;
bool gp_enable_minmax_optimization;
bool gp_enable_multiphase_agg;
bool gp_enable_preunique;
bool gp_eager_preunique;
......
......@@ -521,9 +521,14 @@ select max(unique2) from tenk1 order by max(unique2)+1;
9999
(1 row)
-- MPP: This works in Postgres
select max(unique2), generate_series(1,3) as g from tenk1 order by g desc;
ERROR: set-valued function called in context that cannot accept a set
max | g
------+---
9999 | 3
9999 | 2
9999 | 1
(3 rows)
-- check for correct detection of nested-aggregate errors
select max(min(unique1)) from tenk1;
ERROR: aggregate function calls cannot be nested
......
......@@ -534,7 +534,6 @@ select max(unique2) from tenk1 order by max(unique2)+1;
9999
(1 row)
-- MPP: This works in Postgres
select max(unique2), generate_series(1,3) as g from tenk1 order by g desc;
max | g
------+---
......
......@@ -270,3 +270,18 @@ reset enable_hashagg;
reset enable_hashjoin;
reset enable_mergejoin;
drop table l, ps;
-- This wouldn't work in GPDB, if the MIN/MAX optimization in the planner
-- didn't turn this into an index scan with a Limit.
-- This is the same test we have in the upstream 'aggregates' test.
select max(unique2), generate_series(1,3) as g from tenk1 order by g desc;
max | g
------+---
9999 | 3
9999 | 2
9999 | 1
(3 rows)
-- Same test with avg(), so that the optimization doesn't apply. Fails,
-- currently.
select avg(unique2), generate_series(1,3) as g from tenk1 order by g desc;
ERROR: set-valued function called in context that cannot accept a set
-- array_agg tests
SELECT array_agg(a order by a) as a_by_a from aggtest;
a_by_a
---------------
{0,42,56,100}
(1 row)
SELECT array_agg(b order by b) as b_by_b from aggtest;
b_by_b
-----------------------------
{0.09561,7.8,99.097,324.78}
(1 row)
SELECT array_agg(a order by a) as a_by_a,
array_agg(a order by b) as a_by_b,
array_agg(b order by a) as b_by_a,
array_agg(b order by b) as b_by_b
FROM aggtest;
a_by_a | a_by_b | b_by_a | b_by_b
---------------+---------------+-----------------------------+-----------------------------
{0,42,56,100} | {0,56,100,42} | {0.09561,324.78,7.8,99.097} | {0.09561,7.8,99.097,324.78}
(1 row)
-- Negative test cases for ordered aggregate syntax
SELECT count(order by a) from aggtest; -- zero parameter aggregate
ERROR: syntax error at or near "order"
LINE 1: SELECT count(order by a) from aggtest;
^
SELECT count(a order by a) from aggtest; -- regular (non-ordered) aggregate
count
-------
4
(1 row)
SELECT abs(a order by a) from aggtest; -- regular function
ERROR: ORDER BY specified, but abs is not an aggregate function
LINE 1: SELECT abs(a order by a) from aggtest;
^
SELECT a(aggtest order by a) from aggtest; -- function-like column reference
ERROR: function a(aggtest) does not exist
LINE 1: SELECT a(aggtest order by a) from aggtest;
^
HINT: No function matches the given name and argument types. You might need to add explicit type casts.
SELECT nosuchagg(a order by a) FROM aggtest; -- no such function
ERROR: function nosuchagg(smallint) does not exist
LINE 1: SELECT nosuchagg(a order by a) FROM aggtest;
^
HINT: No function matches the given name and argument types. You might need to add explicit type casts.
SELECT lag(a order by a) from aggtest; -- window function (no window clause)
ERROR: window function lag requires an OVER clause
LINE 1: SELECT lag(a order by a) from aggtest;
^
SELECT lag(a order by a) over (order by a) FROM aggtest; -- window function
ERROR: aggregate ORDER BY is not implemented for window functions
LINE 1: SELECT lag(a order by a) over (order by a) FROM aggtest;
^
SELECT count(a order by a) over (order by a) FROM aggtest; -- window derived aggregate
ERROR: aggregate ORDER BY is not implemented for window functions
LINE 1: SELECT count(a order by a) over (order by a) FROM aggtest;
^
SELECT array_agg(a order by a) over (order by a) FROM aggtest; -- window derived ordered aggregate
ERROR: aggregate ORDER BY is not implemented for window functions
LINE 1: SELECT array_agg(a order by a) over (order by a) FROM aggtes...
^
-- check for mpp-2687
CREATE TEMPORARY TABLE mpp2687t (
dk text,
gk text
) DISTRIBUTED BY (dk);
CREATE VIEW mpp2687v AS
SELECT DISTINCT gk
FROM mpp2687t
GROUP BY gk;
NOTICE: view "mpp2687v" will be a temporary view
SELECT * FROM mpp2687v;
gk
----
(0 rows)
-- MPP-4617
select case when ten < 5 then ten else ten * 2 end, count(distinct two), count(distinct four) from tenk1 group by 1;
case | count | count
------+-------+-------
3 | 1 | 2
1 | 1 | 2
2 | 1 | 2
0 | 1 | 2
16 | 1 | 2
10 | 1 | 2
14 | 1 | 2
12 | 1 | 2
4 | 1 | 2
18 | 1 | 2
(10 rows)
select ten, ten, count(distinct two), count(distinct four) from tenk1 group by 1,2;
ten | ten | count | count
-----+-----+-------+-------
3 | 3 | 1 | 2
5 | 5 | 1 | 2
4 | 4 | 1 | 2
6 | 6 | 1 | 2
1 | 1 | 1 | 2
0 | 0 | 1 | 2
2 | 2 | 1 | 2
8 | 8 | 1 | 2
9 | 9 | 1 | 2
7 | 7 | 1 | 2
(10 rows)
--MPP-20151: distinct is transformed to a group-by
select distinct two from tenk1 order by two;
two
-----
0
1
(2 rows)
select distinct two, four from tenk1 order by two, four;
two | four
-----+------
0 | 0
0 | 2
1 | 1
1 | 3
(4 rows)
select distinct two, max(two) over() from tenk1 order by two;
two | max
-----+-----
0 | 1
1 | 1
(2 rows)
select distinct two, sum(four) over() from tenk1 order by two;
two | sum
-----+-------
0 | 15000
1 | 15000
(2 rows)
select distinct two, sum(four) from tenk1 group by two order by two;
two | sum
-----+-------
0 | 5000
1 | 10000
(2 rows)
select distinct two, sum(four) from tenk1 group by two having sum(four) > 5000;
two | sum
-----+-------
1 | 10000
(1 row)
select distinct t1.two, t2.two, t1.four, t2.four from tenk1 t1, tenk1 t2 where t1.hundred=t2.hundred order by t1.two, t1.four;
two | two | four | four
-----+-----+------+------
0 | 0 | 0 | 0
0 | 0 | 2 | 2
1 | 1 | 1 | 1
1 | 1 | 3 | 3
(4 rows)
-- A variant with more result rows. We had a bug at one point where the
-- Motion Gather node on top of this was missing the Merge Key, and hence
-- the output came out unsorted. But it was not visible if all the rows
-- were processed on the same segment, as is the case with the above variant
-- with only two distinct 'two' values.
select distinct ten, sum(ten) over() from tenk1 order by ten;
ten | sum
-----+-------
0 | 45000
1 | 45000
2 | 45000
3 | 45000
4 | 45000
5 | 45000
6 | 45000
7 | 45000
8 | 45000
9 | 45000
(10 rows)
-- Test for a planner bug we used to have, when this query gets planned
-- as a merge join. This should perform a merge join between 'l' and 'ps',
-- using both pk and sk as the merge keys. Due to the bug, the planner
-- used to mix up the columns in the path keys, and used incorrect columns
-- as the merge keys. (This is a modified version of a TPC-H query)
create table l (ok bigint, pk integer, sk integer, quantity numeric) distributed by (ok);
create table ps (pk integer, sk integer, availqty integer) distributed by (pk);
insert into l select g%5, 50-g, g, 5 from generate_series(1, 50) g;
insert into ps select g, 50-g, 10 from generate_series(1, 25) g;
select g.pk, g.sk, ps.availqty
from ps,
(select sum(l.quantity) as qty_sum, l.pk, l.sk
from l
group by l.pk, l.sk ) g
where g.pk = ps.pk and g.sk = ps.sk
and ps.availqty > g.qty_sum ;
pk | sk | availqty
----+----+----------
6 | 44 | 10
3 | 47 | 10
21 | 29 | 10
15 | 35 | 10
20 | 30 | 10
25 | 25 | 10
13 | 37 | 10
22 | 28 | 10
7 | 43 | 10
16 | 34 | 10
24 | 26 | 10
10 | 40 | 10
19 | 31 | 10
8 | 42 | 10
9 | 41 | 10
4 | 46 | 10
14 | 36 | 10
5 | 45 | 10
11 | 39 | 10
18 | 32 | 10
12 | 38 | 10
2 | 48 | 10
23 | 27 | 10
1 | 49 | 10
17 | 33 | 10
(25 rows)
-- the same, but force a merge join and sorted agg.
set enable_hashagg=off;
set enable_hashjoin=off;
set enable_mergejoin=on;
select g.pk, g.sk, ps.availqty
from ps,
(select sum(l.quantity) as qty_sum, l.pk, l.sk
from l
group by l.pk, l.sk ) g
where g.pk = ps.pk and g.sk = ps.sk
and ps.availqty > g.qty_sum ;
pk | sk | availqty
----+----+----------
1 | 49 | 10
2 | 48 | 10
3 | 47 | 10
4 | 46 | 10
5 | 45 | 10
6 | 44 | 10
7 | 43 | 10
8 | 42 | 10
9 | 41 | 10
10 | 40 | 10
11 | 39 | 10
12 | 38 | 10
13 | 37 | 10
14 | 36 | 10
15 | 35 | 10
16 | 34 | 10
17 | 33 | 10
18 | 32 | 10
19 | 31 | 10
20 | 30 | 10
21 | 29 | 10
22 | 28 | 10
23 | 27 | 10
24 | 26 | 10
25 | 25 | 10
(25 rows)
reset enable_hashagg;
reset enable_hashjoin;
reset enable_mergejoin;
drop table l, ps;
-- This wouldn't work in GPDB, if the MIN/MAX optimization in the planner
-- didn't turn this into an index scan with a Limit.
-- This is the same test we have in the upstream 'aggregates' test.
select max(unique2), generate_series(1,3) as g from tenk1 order by g desc;
max | g
------+---
9999 | 3
9999 | 2
9999 | 1
(3 rows)
-- Same test with avg(), so that the optimization doesn't apply. Fails,
-- currently.
select avg(unique2), generate_series(1,3) as g from tenk1 order by g desc;
avg | g
--------+---
4999.5 | 3
4999.5 | 2
4999.5 | 1
(3 rows)
......@@ -233,8 +233,6 @@ select distinct max(unique2) from tenk1;
select max(unique2) from tenk1 order by 1;
select max(unique2) from tenk1 order by max(unique2);
select max(unique2) from tenk1 order by max(unique2)+1;
-- MPP: This works in Postgres
select max(unique2), generate_series(1,3) as g from tenk1 order by g desc;
-- check for correct detection of nested-aggregate errors
......
......@@ -89,3 +89,12 @@ reset enable_hashjoin;
reset enable_mergejoin;
drop table l, ps;
-- This wouldn't work in GPDB, if the MIN/MAX optimization in the planner
-- didn't turn this into an index scan with a Limit.
-- This is the same test we have in the upstream 'aggregates' test.
select max(unique2), generate_series(1,3) as g from tenk1 order by g desc;
-- Same test with avg(), so that the optimization doesn't apply. Fails,
-- currently.
select avg(unique2), generate_series(1,3) as g from tenk1 order by g desc;