提交 a0a38df9 编写于 作者: D Dhanashree Kashid

Fix wrong results for queries of type NOT (select <>)

For queries of the form "NOT (subselect)", planner lost the "NOT" operator
during the initial pull-up in pull_up_sublinks_qual_recurse() which resulted in
incorrect filter and hence wrong results. Inside
pull_up_sublinks_qual_recurse(), when a qual contains NOT, we check if sublink
type is any one of these: EXISTS, ANY or ALL, and invoke appropriate sublink
pull up routines. In case of qual of the form "NOT (SELECT <>)" the sublink
type is EXPR; hence we recurse into the argument of NOT; at which point we
lose the information about NOT operator.

This commit fixes the issue by returning the node unmodified when the argument
of NOT is an EXPR sublink. The EXPR sublink, later gets pulled up by
preprocess_qual_conditions() wherein, pull_up_sublinks() is invoked again to
handle sublinks in an expression.
Signed-off-by: NSambitesh Dash <sdash@pivotal.io>
上级 30271024
......@@ -396,13 +396,13 @@ pull_up_sublinks_qual_recurse(PlannerInfo *root, Node *node,
if (not_clause(node))
{
/* If the immediate argument of NOT is EXISTS, try to convert */
SubLink *sublink = (SubLink *) get_notclausearg((Expr *) node);
Node *arg = (Node *) get_notclausearg((Expr *) node);
JoinExpr *j;
Relids child_rels;
if (sublink && IsA(sublink, SubLink))
if (arg && IsA(arg, SubLink))
{
SubLink *sublink = (SubLink *) arg;
if (sublink->subLinkType == EXISTS_SUBLINK)
{
Node* subst;
......@@ -440,17 +440,18 @@ pull_up_sublinks_qual_recurse(PlannerInfo *root, Node *node,
* NOT val op ANY (subq) => val op' ALL (subq)
* NOT val op ALL (subq) => val op' ANY (subq)
*/
else if (sublink->subLinkType == ANY_SUBLINK)
else if (sublink->subLinkType == ANY_SUBLINK || sublink->subLinkType == ALL_SUBLINK)
{
sublink->subLinkType = ALL_SUBLINK;
sublink->subLinkType = (sublink->subLinkType == ANY_SUBLINK) ? ALL_SUBLINK : ANY_SUBLINK;
sublink->testexpr = (Node *) canonicalize_qual(make_notclause((Expr *) sublink->testexpr));
return pull_up_sublinks_qual_recurse(root, (Node *) sublink, available_rels, jtlink);
}
else if (sublink->subLinkType == ALL_SUBLINK)
{
sublink->subLinkType = ANY_SUBLINK;
sublink->testexpr = (Node *) canonicalize_qual(make_notclause((Expr *) sublink->testexpr));
}
return pull_up_sublinks_qual_recurse(root, (Node *) sublink, available_rels, jtlink);
/*
* Return the node unmodified for "NOT (subq)"
* This subquery will get pulled up later during preprocess_qual_conditions()
*/
return node;
}
else if (not_clause(arg))
......
......@@ -3537,33 +3537,64 @@ select rnum, c1, c2 from qp_tjoin2 where 20 > all ( select c1 from qp_tjoin1) or
(0 rows)
-- start_ignore
DROP TABLE IF EXISTS foo;
NOTICE: table "foo" does not exist, skipping
DROP TABLE IF EXISTS bar;
NOTICE: table "bar" does not exist, skipping
CREATE TABLE foo(a int, b int);
DROP TABLE IF EXISTS qp_tab1;
NOTICE: table "qp_tab1" does not exist, skipping
DROP TABLE IF EXISTS qp_tab2;
NOTICE: table "qp_tab2" does not exist, skipping
DROP TABLE IF EXISTS qp_tab3;
NOTICE: table "qp_tab3" does not exist, skipping
CREATE TABLE qp_tab1(a int, b int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE bar(c int, d int);
CREATE TABLE qp_tab2(c int, d int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE qp_tab3(e int, f int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'e' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
INSERT INTO qp_tab1 VALUES (1,2);
INSERT INTO qp_tab2 VALUES (3,4);
INSERT INTO qp_tab3 VALUES (4,5);
-- end_ignore
EXPLAIN SELECT a FROM foo f1 LEFT JOIN bar on a=c WHERE NOT EXISTS(SELECT 1 FROM foo f2 WHERE f1.a = f2.a);
QUERY PLAN
----------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=4074.50..1712216.48 rows=7414 width=4)
-> Hash Anti Join (cost=4074.50..1712216.48 rows=2472 width=4)
EXPLAIN SELECT a FROM qp_tab1 f1 LEFT JOIN qp_tab2 on a=c WHERE NOT EXISTS(SELECT 1 FROM qp_tab1 f2 WHERE f1.a = f2.a);
QUERY PLAN
-------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=2.04..3.14 rows=4 width=4)
-> Hash Anti Join (cost=2.04..3.14 rows=2 width=4)
Hash Cond: f1.a = f2.a
-> Hash Left Join (cost=2037.25..95878.62 rows=2471070 width=4)
Hash Cond: f1.a = bar.c
-> Seq Scan on foo f1 (cost=0.00..961.00 rows=28700 width=4)
-> Hash (cost=961.00..961.00 rows=28700 width=4)
-> Seq Scan on bar (cost=0.00..961.00 rows=28700 width=4)
-> Hash (cost=961.00..961.00 rows=28700 width=4)
-> Seq Scan on foo f2 (cost=0.00..961.00 rows=28700 width=4)
Settings: optimizer=off
Optimizer status: legacy query optimizer
(12 rows)
-> Hash Left Join (cost=1.02..2.07 rows=2 width=4)
Hash Cond: f1.a = qp_tab2.c
-> Seq Scan on qp_tab1 f1 (cost=0.00..1.01 rows=1 width=4)
-> Hash (cost=1.01..1.01 rows=1 width=4)
-> Seq Scan on qp_tab2 (cost=0.00..1.01 rows=1 width=4)
-> Hash (cost=1.01..1.01 rows=1 width=4)
-> Seq Scan on qp_tab1 f2 (cost=0.00..1.01 rows=1 width=4)
Optimizer: legacy query optimizer
(11 rows)
EXPLAIN SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
QUERY PLAN
------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice2; segments: 3) (cost=3.08..3.09 rows=1 width=4)
-> HashAggregate (cost=3.08..3.09 rows=1 width=4)
Group Key: qp_tab1.a
InitPlan 1 (returns $0) (slice3)
-> Gather Motion 3:1 (slice1; segments: 3) (cost=1.02..2.07 rows=4 width=0)
-> Hash Semi Join (cost=1.02..2.07 rows=2 width=0)
Hash Cond: qp_tab2.c = qp_tab3.e
-> Seq Scan on qp_tab2 (cost=0.00..1.01 rows=1 width=4)
-> Hash (cost=1.01..1.01 rows=1 width=4)
-> Seq Scan on qp_tab3 (cost=0.00..1.01 rows=1 width=4)
-> Result (cost=0.00..1.01 rows=1 width=4)
One-Time Filter: NOT $0
-> Seq Scan on qp_tab1 (cost=0.00..1.01 rows=1 width=4)
Optimizer: legacy query optimizer
(14 rows)
SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
a
---
(0 rows)
-- ----------------------------------------------------------------------
-- Test: teardown.sql
......
......@@ -3694,34 +3694,75 @@ select rnum, c1, c2 from qp_tjoin2 where 20 > all ( select c1 from qp_tjoin1) or
(0 rows)
-- start_ignore
DROP TABLE IF EXISTS foo;
NOTICE: table "foo" does not exist, skipping
DROP TABLE IF EXISTS bar;
NOTICE: table "bar" does not exist, skipping
CREATE TABLE foo(a int, b int);
DROP TABLE IF EXISTS qp_tab1;
NOTICE: table "qp_tab1" does not exist, skipping
DROP TABLE IF EXISTS qp_tab2;
NOTICE: table "qp_tab2" does not exist, skipping
DROP TABLE IF EXISTS qp_tab3;
NOTICE: table "qp_tab3" does not exist, skipping
CREATE TABLE qp_tab1(a int, b int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE bar(c int, d int);
CREATE TABLE qp_tab2(c int, d int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE qp_tab3(e int, f int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'e' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
INSERT INTO qp_tab1 VALUES (1,2);
INSERT INTO qp_tab2 VALUES (3,4);
INSERT INTO qp_tab3 VALUES (4,5);
-- end_ignore
EXPLAIN SELECT a FROM foo f1 LEFT JOIN bar on a=c WHERE NOT EXISTS(SELECT 1 FROM foo f2 WHERE f1.a = f2.a);
QUERY PLAN
-------------------------------------------------------------------------------------
EXPLAIN SELECT a FROM qp_tab1 f1 LEFT JOIN qp_tab2 on a=c WHERE NOT EXISTS(SELECT 1 FROM qp_tab1 f2 WHERE f1.a = f2.a);
QUERY PLAN
-----------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1293.00 rows=2 width=4)
-> Hash Left Join (cost=0.00..1293.00 rows=1 width=4)
Hash Cond: qp_correlated_query.foo.a = bar.c
Hash Cond: qp_correlated_query.qp_tab1.a = qp_tab2.c
-> Hash Anti Join (cost=0.00..862.00 rows=1 width=4)
Hash Cond: qp_correlated_query.foo.a = qp_correlated_query.foo.a
-> Table Scan on foo (cost=0.00..431.00 rows=1 width=4)
Hash Cond: qp_correlated_query.qp_tab1.a = qp_correlated_query.qp_tab1.a
-> Table Scan on qp_tab1 (cost=0.00..431.00 rows=1 width=4)
-> Hash (cost=431.00..431.00 rows=1 width=4)
-> Result (cost=0.00..431.00 rows=1 width=4)
-> Table Scan on foo (cost=0.00..431.00 rows=1 width=4)
-> Table Scan on qp_tab1 (cost=0.00..431.00 rows=1 width=4)
-> Hash (cost=431.00..431.00 rows=1 width=4)
-> Table Scan on bar (cost=0.00..431.00 rows=1 width=4)
Settings: optimizer=on
Optimizer status: PQO version 2.46.2
(13 rows)
-> Table Scan on qp_tab2 (cost=0.00..431.00 rows=1 width=4)
Optimizer: PQO version 2.55.12
(12 rows)
EXPLAIN SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice3; segments: 3) (cost=0.00..1765377.02 rows=1 width=4)
-> GroupAggregate (cost=0.00..1765377.02 rows=1 width=4)
Group Key: qp_tab1.a
-> Sort (cost=0.00..1765377.02 rows=1 width=4)
Sort Key: qp_tab1.a
-> Result (cost=0.00..1765377.02 rows=1 width=4)
Filter: NOT "outer".bool
-> Nested Loop Left Join (cost=0.00..1765377.02 rows=1 width=5)
Join Filter: true
-> Table Scan on qp_tab1 (cost=0.00..431.00 rows=1 width=4)
-> Assert (cost=0.00..862.00 rows=1 width=1)
Assert Cond: (row_number() OVER (?)) = 1
-> Materialize (cost=0.00..862.00 rows=1 width=9)
-> Broadcast Motion 1:3 (slice2) (cost=0.00..862.00 rows=3 width=9)
-> Result (cost=0.00..862.00 rows=1 width=9)
-> WindowAgg (cost=0.00..862.00 rows=1 width=9)
-> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.00 rows=1 width=1)
-> Result (cost=0.00..862.00 rows=1 width=1)
-> Hash Semi Join (cost=0.00..862.00 rows=1 width=1)
Hash Cond: qp_tab2.c = qp_tab3.e
-> Table Scan on qp_tab2 (cost=0.00..431.00 rows=1 width=4)
-> Hash (cost=431.00..431.00 rows=1 width=4)
-> Table Scan on qp_tab3 (cost=0.00..431.00 rows=1 width=4)
Optimizer: PQO version 2.55.12
(24 rows)
SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
a
---
(0 rows)
-- ----------------------------------------------------------------------
-- Test: teardown.sql
......
......@@ -1710,3 +1710,11 @@ SELECT EXISTS(SELECT * FROM tenk1 WHERE tenk1.unique1 = tenk2.unique1) FROM tenk
t
(1 row)
--
-- Ensure that NOT is not lost during subquery pull-up
--
SELECT 1 AS col1 WHERE NOT (SELECT 1 = 1);
col1
------
(0 rows)
......@@ -1768,3 +1768,11 @@ SELECT EXISTS(SELECT * FROM tenk1 WHERE tenk1.unique1 = tenk2.unique1) FROM tenk
t
(1 row)
--
-- Ensure that NOT is not lost during subquery pull-up
--
SELECT 1 AS col1 WHERE NOT (SELECT 1 = 1);
col1
------
(0 rows)
......@@ -834,14 +834,22 @@ select rnum, c1, c2 from qp_tjoin2 where 75 > all ( select c2 from qp_tjoin1) or
select rnum, c1, c2 from qp_tjoin2 where 20 > all ( select c1 from qp_tjoin1) order by rnum;
-- start_ignore
DROP TABLE IF EXISTS foo;
DROP TABLE IF EXISTS bar;
CREATE TABLE foo(a int, b int);
CREATE TABLE bar(c int, d int);
DROP TABLE IF EXISTS qp_tab1;
DROP TABLE IF EXISTS qp_tab2;
DROP TABLE IF EXISTS qp_tab3;
CREATE TABLE qp_tab1(a int, b int);
CREATE TABLE qp_tab2(c int, d int);
CREATE TABLE qp_tab3(e int, f int);
INSERT INTO qp_tab1 VALUES (1,2);
INSERT INTO qp_tab2 VALUES (3,4);
INSERT INTO qp_tab3 VALUES (4,5);
-- end_ignore
EXPLAIN SELECT a FROM foo f1 LEFT JOIN bar on a=c WHERE NOT EXISTS(SELECT 1 FROM foo f2 WHERE f1.a = f2.a);
EXPLAIN SELECT a FROM qp_tab1 f1 LEFT JOIN qp_tab2 on a=c WHERE NOT EXISTS(SELECT 1 FROM qp_tab1 f2 WHERE f1.a = f2.a);
EXPLAIN SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
-- ----------------------------------------------------------------------
-- Test: teardown.sql
......
......@@ -727,3 +727,8 @@ EXPLAIN select count(distinct ss.ten) from
EXPLAIN SELECT EXISTS(SELECT * FROM tenk1 WHERE tenk1.unique1 = tenk2.unique1) FROM tenk2 LIMIT 1;
SELECT EXISTS(SELECT * FROM tenk1 WHERE tenk1.unique1 = tenk2.unique1) FROM tenk2 LIMIT 1;
--
-- Ensure that NOT is not lost during subquery pull-up
--
SELECT 1 AS col1 WHERE NOT (SELECT 1 = 1);
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册